vedant2905 committed on
Commit
e795f9f
·
verified ·
1 Parent(s): 612f59a

Update src/streamlit_app.py

Browse files
Files changed (1) hide show
  1. src/streamlit_app.py +54 -61
src/streamlit_app.py CHANGED
@@ -2,7 +2,6 @@ import streamlit as st
2
  import pandas as pd
3
  import numpy as np
4
  import os
5
- import re
6
  import matplotlib.pyplot as plt
7
  from wordcloud import WordCloud
8
  from collections import Counter, defaultdict
@@ -316,68 +315,62 @@ def main():
316
  token_instances = predictions_df[predictions_df['Token'] == selected_token]
317
 
318
  if not token_instances.empty:
319
- # Create container for token info
320
- with st.container():
321
- # Display token info with styling
322
- st.markdown(f'<h2 class="section-header">Salient Token: <span class="highlight-token">{selected_token}</span></h2>',
323
- unsafe_allow_html=True)
324
-
325
- # Get most frequent cluster (Top 1) for this token
326
- top_cluster = token_instances['Top 1'].value_counts().index[0]
327
-
328
- # Display primary cluster
329
- col1, = st.columns(1)
330
- with col1:
331
- st.metric("Primary Cluster", top_cluster)
332
-
333
- st.markdown("<hr>", unsafe_allow_html=True)
334
-
335
- # Check if token is salient (has position_idx = -1)
336
- is_salient = any(token_instances['position_idx'] == -1)
337
- if is_salient: # If position_idx is -1, this is a salient token
338
- st.subheader("Original Sentence Context")
339
- dev_sentences = load_dev_sentences()
340
- if dev_sentences:
341
- line_numbers = token_instances[token_instances['position_idx'] == -1]['line_idx'].unique()
342
- for line_num in line_numbers:
343
- if 0 <= line_num < len(dev_sentences):
344
- st.code(dev_sentences[line_num], language="java")
345
- else:
346
- st.warning("Could not load sentences from dev.in")
347
- elif not selected_token.startswith("[CLS]"): # Show wordcloud only for non-CLS tokens
348
- # Word cloud visualization for non-salient, non-CLS tokens
349
- unique_tokens = set(token for token, _ in clusters[top_cluster])
350
- st.subheader("Tokens in Predicted Cluster")
351
- if unique_tokens:
352
- fig = create_wordcloud(unique_tokens)
353
- if fig:
354
- st.pyplot(fig)
355
- plt.close(fig)
356
  else:
357
  st.info("No tokens found in this cluster")
358
-
359
- # Only show cluster statistics for non-[CLS] tokens (including numbered ones)
360
- if not selected_token.startswith("[CLS]"):
361
- col1, col2 = st.columns(2)
362
- with col1:
363
- # Show cluster statistics
364
- unique_tokens = len(set(token for token, _ in clusters[top_cluster]))
365
- total_occurrences = len(clusters[top_cluster])
366
- st.metric("Unique Tokens in Cluster", unique_tokens)
367
- with col2:
368
- st.metric("Total Token Occurrences", total_occurrences)
369
-
370
- # Show all contexts from predicted cluster in an expander
371
- with st.expander("👀 View Contexts (from Predicted Cluster)", expanded=False):
372
- cluster_contexts = [(token, line_num) for token, line_num in clusters[top_cluster]
373
- if 0 <= line_num - 1 < len(sentences)]
374
-
375
- if cluster_contexts:
376
- for token, line_num in cluster_contexts:
377
- st.code(f"{sentences[line_num - 1]}", language="python")
378
- else:
379
- st.info("No contexts found in this cluster")
380
-
381
  else:
382
  st.warning(f"No instances found for token: {selected_token}")
383
  else:
 
2
  import pandas as pd
3
  import numpy as np
4
  import os
 
5
  import matplotlib.pyplot as plt
6
  from wordcloud import WordCloud
7
  from collections import Counter, defaultdict
 
315
  token_instances = predictions_df[predictions_df['Token'] == selected_token]
316
 
317
  if not token_instances.empty:
318
+ # Display token info with styling
319
+ st.markdown(f'<h2 class="section-header">Salient Token: <span class="highlight-token">{selected_token}</span></h2>',
320
+ unsafe_allow_html=True)
321
+
322
+ # Get most frequent cluster (Top 1) for this token
323
+ top_cluster = token_instances['Top 1'].value_counts().index[0]
324
+
325
+ # Display primary cluster
326
+ st.metric("Primary Cluster", top_cluster)
327
+
328
+ st.markdown("<hr>", unsafe_allow_html=True)
329
+
330
+ # Check if token is salient (has position_idx = -1)
331
+ is_salient = any(token_instances['position_idx'] == -1)
332
+
333
+ if is_salient: # If position_idx is -1, this is a salient token
334
+ st.subheader("Original Sentence Context")
335
+ dev_sentences = load_dev_sentences()
336
+ if dev_sentences:
337
+ line_numbers = token_instances[token_instances['position_idx'] == -1]['line_idx'].unique()
338
+ for line_num in line_numbers:
339
+ if 0 <= line_num < len(dev_sentences):
340
+ st.code(dev_sentences[line_num], language="java")
341
+ else:
342
+ st.warning("Could not load sentences from dev.in")
343
+ elif not selected_token.startswith("[CLS]"):
344
+ # Word cloud visualization for non-salient, non-CLS tokens
345
+ unique_tokens = set(token for token, _ in clusters[top_cluster])
346
+ st.subheader("Tokens in Predicted Cluster")
347
+ if unique_tokens:
348
+ fig = create_wordcloud(unique_tokens)
349
+ if fig:
350
+ st.pyplot(fig)
351
+ plt.close(fig)
 
 
 
352
  else:
353
  st.info("No tokens found in this cluster")
354
+
355
+ # Only show cluster statistics for non-[CLS] tokens
356
+ if not selected_token.startswith("[CLS]"):
357
+ col1, col2 = st.columns(2)
358
+ with col1:
359
+ unique_tokens = len(set(token for token, _ in clusters[top_cluster]))
360
+ st.metric("Unique Tokens in Cluster", unique_tokens)
361
+ with col2:
362
+ total_occurrences = len(clusters[top_cluster])
363
+ st.metric("Total Token Occurrences", total_occurrences)
364
+
365
+ # Show contexts from predicted cluster in an expander
366
+ with st.expander("👀 View Similar Contexts (from Predicted Cluster)", expanded=False):
367
+ cluster_contexts = [(token, line_num) for token, line_num in clusters[top_cluster]
368
+ if 0 <= line_num - 1 < len(sentences)]
369
+ if cluster_contexts:
370
+ for token, line_num in cluster_contexts:
371
+ st.code(f"{sentences[line_num - 1]}", language="python")
372
+ else:
373
+ st.info("No contexts found in this cluster")
 
 
 
374
  else:
375
  st.warning(f"No instances found for token: {selected_token}")
376
  else: