# Salient_3_problems / src / streamlit_app.py
# Author: Vedant Pungliya
# Commit: 5d6df7a "Updating files" (unverified)
import streamlit as st
import pandas as pd
import os
from wordcloud import WordCloud
import matplotlib.pyplot as plt
# Seed the Streamlit session state so later reads never hit a missing key.
for _state_key in ("selected_token", "selected_task", "selected_layer"):
    if _state_key not in st.session_state:
        st.session_state[_state_key] = None
def get_available_tasks():
    """Get list of available tasks based on directory structure.

    Scans ``src/codebert`` (relative to the working directory) and returns
    the names of its subdirectories, sorted for a deterministic selectbox
    order. Returns an empty list when the base directory does not exist
    (e.g. the app was launched from an unexpected working directory)
    instead of crashing with FileNotFoundError.
    """
    base_path = os.path.join("src", "codebert")
    if not os.path.isdir(base_path):
        # Fail soft: the UI simply shows no tasks rather than a traceback.
        return []
    return sorted(
        d for d in os.listdir(base_path)
        if os.path.isdir(os.path.join(base_path, d))
    )
def get_available_layers(task):
    """Return the sorted layer numbers found for *task*.

    A layer is any entry named ``layer<N>`` inside the task directory;
    entries whose suffix is not an integer are ignored.
    """
    task_path = os.path.join("src", "codebert", task)

    def _layer_number(entry):
        # 'layer7' -> 7; anything without a purely numeric suffix -> None.
        if not entry.startswith("layer"):
            return None
        try:
            return int(entry.replace("layer", ""))
        except ValueError:
            return None

    candidates = (_layer_number(entry) for entry in os.listdir(task_path))
    return sorted(num for num in candidates if num is not None)
def load_predictions(task, layer):
    """Load the tab-separated predictions CSV for one task/layer.

    Adds two helper columns: 'predicted_cluster' (the Top 1 cluster as a
    string) and 'display_text' (a human-readable label for each token
    occurrence). Returns the DataFrame, or None when the file is missing
    or unreadable.
    """
    csv_path = os.path.join(
        "src", "codebert", task, f"layer{layer}", f"predictions_layer_{layer}.csv"
    )
    if not os.path.exists(csv_path):
        return None
    try:
        frame = pd.read_csv(csv_path, delimiter='\t')
        # Tokens may be purely numeric; force them to strings.
        frame['Token'] = frame['Token'].astype(str)
        # The top-ranked cluster is treated as the prediction.
        frame['predicted_cluster'] = frame['Top 1'].astype(str)
        # One display string per token occurrence, for the selectbox.
        frame['display_text'] = frame.apply(
            lambda row: f"{row['Token']} (line {row['line_idx']}, pos {row['position_idx']}, cluster {row['predicted_cluster']})",
            axis=1
        )
        return frame
    except Exception as e:
        st.error(f"Error loading predictions: {str(e)}")
        return None
def load_clusters(task, layer):
    """Parse the 'clusters-350.txt' assignment file for one task/layer.

    Each line has five '|||'-separated fields:
    token ||| occurrence ||| line ||| column ||| cluster_id.
    Returns a dict mapping normalized cluster id (string) to a list of
    {'token', 'line_num', 'col_num'} dicts. Malformed lines are skipped
    silently. Returns None when the file is missing or unreadable.
    """
    clusters_path = os.path.join(
        "src", "codebert", task, f"layer{layer}", "clusters-350.txt"
    )
    if not os.path.exists(clusters_path):
        return None
    grouped = {}
    try:
        with open(clusters_path, 'r', encoding='utf-8') as handle:
            for raw in handle:
                raw = raw.strip()
                if not raw:  # Skip empty lines
                    continue
                try:
                    fields = [part.strip() for part in raw.split('|||')]
                    if len(fields) != 5:
                        continue
                    token, _occurrence, line_num, col_num, cluster_id = fields
                    # Drop any trailing pipe noise after the cluster id.
                    cluster_id = cluster_id.split('|')[0].strip()
                    if not cluster_id.isdigit():
                        continue
                    # Normalize e.g. '007' -> '7' so lookups are consistent.
                    key = str(int(cluster_id))
                    grouped.setdefault(key, []).append({
                        'token': token,
                        'line_num': int(line_num),
                        'col_num': int(col_num)
                    })
                except Exception:
                    continue
    except Exception as e:
        st.error(f"Error loading clusters: {str(e)}")
        return None
    return grouped
def load_dev_sentences(task, layer):
    """Read the test-set sentences (dev.in) for a task/layer.

    Prefers the layer-specific file, falls back to the task-level one,
    and returns a list of raw lines (newlines preserved) — or an empty
    list when neither file can be read.
    """
    layer_file = os.path.join("src", "codebert", task, f"layer{layer}", "dev.in")
    task_file = os.path.join("src", "codebert", task, "dev.in")
    chosen = layer_file if os.path.exists(layer_file) else task_file
    try:
        with open(chosen, 'r', encoding='utf-8') as handle:
            return handle.readlines()
    except Exception:
        return []
def load_train_sentences(task, layer):
    """Read the training-set sentences (input.in) for a task/layer.

    Prefers the layer-specific file, falls back to the task-level one,
    and returns a list of raw lines (newlines preserved) — or an empty
    list when neither file can be read.
    """
    layer_file = os.path.join("src", "codebert", task, f"layer{layer}", "input.in")
    task_file = os.path.join("src", "codebert", task, "input.in")
    chosen = layer_file if os.path.exists(layer_file) else task_file
    try:
        with open(chosen, 'r', encoding='utf-8') as handle:
            return handle.readlines()
    except Exception:
        return []
def is_cls_token(token):
    """Return True for CLS markers, including numbered ones like '[CLS]0'."""
    return token[:5] == '[CLS]'
def create_wordcloud(tokens_with_freq):
    """Build a WordCloud for a cluster's tokens, all rendered at equal size.

    The incoming frequencies are deliberately flattened to 1 so that no
    token is drawn larger than another (relative_scaling=0 plus uniform
    weights). Returns the generated WordCloud, or None for empty input or
    on rendering failure.
    """
    if not tokens_with_freq:
        return None
    try:
        # Every token gets weight 1 -> identical font sizes.
        flat_weights = dict.fromkeys(tokens_with_freq, 1)
        cloud = WordCloud(
            width=800,
            height=400,
            background_color='#f9f9f9',  # Very light gray, almost white
            prefer_horizontal=1,         # All text horizontal
            relative_scaling=0,          # Ignore frequency differences
            min_font_size=35,            # Keep text readable
            max_font_size=150,           # Upper bound on the uniform size
        )
        return cloud.generate_from_frequencies(flat_weights)
    except Exception as e:
        st.error(f"Error creating wordcloud: {str(e)}")
        return None
def load_explanation_words(task, layer):
    """Load the per-occurrence explanation labels for one task/layer.

    Reads the tab-separated ``explanation_words_layer<N>.csv`` file and
    returns a dict mapping ``(token, line_idx, position_idx)`` to its
    label. On any failure (missing file, bad format) an error is surfaced
    in the UI and an empty dict is returned, so callers can always do a
    plain membership test.
    """
    file_path = os.path.join(
        "src", "codebert", task, f"layer{layer}", f"explanation_words_layer{layer}.csv"
    )
    try:
        df = pd.read_csv(file_path, sep='\t')
        # zip over whole columns instead of iterrows(): same mapping,
        # but avoids the per-row Series construction overhead.
        return {
            (token, line_idx, position_idx): label
            for token, line_idx, position_idx, label in zip(
                df['token'], df['line_idx'], df['position_idx'], df['labels']
            )
        }
    except Exception as e:
        st.error(f"Error loading explanation words: {str(e)}")
        return {}
def main():
    """Streamlit entry point: interactive token/cluster explorer.

    Flow: pick a task and a layer -> search and select a token occurrence
    -> show its predicted cluster, its original test-set context, a word
    cloud of the cluster's tokens, and similar training-set contexts.
    """
    st.title("Token Analysis")
    # Task and Layer Selection
    col1, col2 = st.columns(2)
    with col1:
        available_tasks = get_available_tasks()
        selected_task = st.selectbox(
            "Select Task",
            available_tasks,
            key='task_selector'
        )
    with col2:
        if selected_task:
            available_layers = get_available_layers(selected_task)
            selected_layer = st.selectbox(
                "Select Layer",
                available_layers,
                key='layer_selector'
            )
        else:
            selected_layer = None
    # Only proceed if both task and layer are selected.
    # "is not None" (not plain truthiness) so that layer 0 counts as selected.
    if selected_task and selected_layer is not None:
        predictions_df = load_predictions(selected_task, selected_layer)
        clusters = load_clusters(selected_task, selected_layer)
        dev_sentences = load_dev_sentences(selected_task, selected_layer)  # Test set sentences
        train_sentences = load_train_sentences(selected_task, selected_layer)  # Training set sentences
        token_labels = load_explanation_words(selected_task, selected_layer)  # (token, line, pos) -> label
        if predictions_df is not None and clusters is not None:
            # Token selection with search
            search_token = st.text_input("Search tokens", key='token_search')
            # Filter display options based on search (case-insensitive substring match)
            filtered_df = predictions_df
            if search_token:
                filtered_df = predictions_df[predictions_df['Token'].str.contains(search_token, case=False, na=False)]
            # Display token selection
            selected_token_display = st.selectbox(
                "Select a token occurrence",
                filtered_df['display_text'].tolist(),
                key='token_selector'
            )
            if selected_token_display:
                # Get the selected row from the dataframe
                # (display_text is unique per occurrence, so iloc[0] is the match)
                selected_row = filtered_df[filtered_df['display_text'] == selected_token_display].iloc[0]
                token = selected_row['Token']
                line_idx = selected_row['line_idx']
                position_idx = selected_row['position_idx']
                # Key used to look up the explanation label for this occurrence
                token_key = (token, line_idx, position_idx)
                # Display token information
                st.header(f"Token: {token}")
                st.write(f"๐Ÿ“ Line: {selected_row['line_idx']}, Position: {selected_row['position_idx']}")
                st.metric("Predicted Cluster", selected_row['predicted_cluster'])
                if token_key in token_labels:
                    st.write(f"**Label:** {token_labels[token_key]}")
                # Show original context from dev.in (test set)
                if dev_sentences and selected_row['line_idx'] < len(dev_sentences):
                    st.subheader("Original Context (from test set)")
                    st.code(dev_sentences[selected_row['line_idx']].strip())
                # Show wordcloud for the cluster (from training set)
                if clusters and selected_row['predicted_cluster'] in clusters:
                    token_frequencies = {}
                    # NOTE(review): this loop rebinds `token`, shadowing the
                    # selected token above; harmless since `token` is not read
                    # again afterwards, but worth renaming eventually.
                    for token_info in clusters[selected_row['predicted_cluster']]:
                        token = token_info['token']
                        token_frequencies[token] = 1  # Set all frequencies to 1 for uniform size
                    if token_frequencies:
                        st.subheader("Cluster Word Cloud (from training set)")
                        wordcloud = create_wordcloud(token_frequencies)
                        if wordcloud:
                            plt.figure(figsize=(10, 5))
                            plt.imshow(wordcloud, interpolation='bilinear')
                            plt.axis('off')
                            st.pyplot(plt)
                # Show similar contexts from the cluster (from training set)
                with st.expander(f"๐Ÿ‘€ View Similar Contexts (from training set, Cluster {selected_row['predicted_cluster']})"):
                    if clusters and selected_row['predicted_cluster'] in clusters:
                        shown_contexts = set()  # de-duplicate identical context lines
                        for token_info in clusters[selected_row['predicted_cluster']]:
                            line_num = token_info['line_num']
                            if line_num >= 0 and line_num < len(train_sentences):
                                context = train_sentences[line_num].strip()
                                if context not in shown_contexts:
                                    st.code(context)
                                    shown_contexts.add(context)
                        if not shown_contexts:
                            st.info("No similar contexts found in this cluster from the training set.")
# Render the app when executed as a script (streamlit run streamlit_app.py).
if __name__ == "__main__":
    main()