Spaces:

vedant2905
/

Salient_3_problems

Build error

App Files Files Community

vedant2905 committed on May 6, 2025

Commit

eeee269

verified ·

1 Parent(s): ef99f5c

Update src/streamlit_app.py

Browse files

Files changed (1) hide show

src/streamlit_app.py +46 -140

src/streamlit_app.py CHANGED Viewed

@@ -12,25 +12,11 @@ st.set_page_config(
     layout="wide"
 )
-# Page title with custom styling
-st.markdown('<h1 class="main-header">🔍 Code Token Cluster Visualization</h1>', unsafe_allow_html=True)
-st.markdown("""
-    <p>Explore token clusters from language model representations.
-    Select a token to view its cluster information and contexts.</p>
-""", unsafe_allow_html=True)
-# Create sidebar for input controls
-with st.sidebar:
-    st.markdown("## 🛠️ Controls")
-    st.markdown("---")
 # Functions to load data
 @st.cache_data
 def load_predictions(file_path):
     """Load the predictions CSV file."""
-    # Prepend src to the file path
     full_path = os.path.join("src", file_path)
-    # Read the file with all columns as string type initially
     df = pd.read_csv(full_path, sep="\t", dtype=str)
     # Convert numeric columns safely
@@ -71,28 +57,26 @@ def create_wordcloud(tokens):
     if not tokens:
         return None
-    # Create a dictionary with equal weights for all tokens
     token_weights = {token: 1 for token in set(tokens)}
     wordcloud = WordCloud(
-    width=1000,
-    height=500,
-    background_color='#FFF0DB',
-    prefer_horizontal=1,
-    min_font_size=10,  # Reduced to allow for more flexible scaling
-    max_font_size=150,  # Increased for better range
-    relative_scaling=0.5,  # Added relative scaling to vary sizes based on frequency
-    collocations=False,
-    margin=1,
-    random_state=42,
-    scale=2,  # Increased scale for better resolution
-    repeat=False,
-    max_words=2000,
-    regexp=r"\w[\w' ]+",
-).generate_from_frequencies(token_weights)
-    # Create a new figure with tight layout and adjusted size
-    fig = plt.figure(figsize=(16, 8))  # Increased figure size
     plt.imshow(wordcloud, interpolation='bilinear')
     plt.axis('off')
     plt.tight_layout(pad=0)
@@ -102,7 +86,6 @@ def create_wordcloud(tokens):
 def load_dev_sentences():
     """Load sentences from dev.in file."""
     try:
-        # Try different possible locations of dev.in
         possible_paths = [
             os.path.join("src", "codebert", "compile_error", "dev.in"),
             os.path.join("src", "codebert", "language_classification", "layer11", "dev.in"),
@@ -118,73 +101,20 @@ def load_dev_sentences():
         st.error(f"Error loading dev.in file: {str(e)}")
         return []
-def get_available_models():
-    # Check in the src directory for the codebert folder
-    current_dir = os.path.dirname(os.path.abspath(__file__))
-    model_path = os.path.join("src", "codebert")
-    if os.path.exists(model_path):
-        return ["codebert"]
-    return []
-def get_available_tasks(model):
-    if not model:
-        return []
-    model_dir = os.path.join("src", model)
-    if os.path.exists(model_dir):
-        return [d for d in os.listdir(model_dir)
-                if os.path.isdir(os.path.join(model_dir, d))]
-    return []
-def get_available_layers(model, task):
-    if not model or not task:
-        return []
-    task_dir = os.path.join("src", model, task)
-    if os.path.exists(task_dir):
-        layers = [d for d in os.listdir(task_dir)
-                if os.path.isdir(os.path.join(task_dir, d)) and d.startswith('layer')]
-        return sorted(layers, key=lambda x: int(x.replace('layer', '')))
-    return []
 def main():
     st.title("Code Token Cluster Visualization")
-    # Model selection with null checks
-    available_models = get_available_models()
-    if not available_models:
-        st.error("No models found in the workspace")
-        return
-    model = st.selectbox(
-        "Select Model",
-        options=available_models,
-        index=0
-    )
-    # Task selection with null checks
-    tasks = get_available_tasks(model)
-    if not tasks:
-        st.error(f"No tasks found for model {model}")
-        return
-    task = st.selectbox(
-        "Select Task",
-        options=tasks,
-        index=tasks.index("language_classification") if "language_classification" in tasks else 0
-    )
-    # Layer selection with null checks
-    layers = get_available_layers(model, task)
-    if not layers:
-        st.error(f"No layers found for {model}/{task}")
-        return
-    layer = st.selectbox(
-        "Select Layer",
-        options=layers,
-        index=layers.index("layer6") if "layer6" in layers else 0
-    )
     # Fix the file paths
     layer_dir = os.path.join(model, task, layer)
@@ -198,44 +128,29 @@ def main():
         clusters = load_clusters(clusters_file)
         sentences = load_input_data(input_file)
-        # Get tokens and their predicted clusters from predictions file
-        token_predictions = dict(zip(predictions_df['Token'], predictions_df['Top 1']))
-        # Display dataset statistics in an expandable section
-        with st.expander("📊 Dataset Statistics", expanded=False):
-            col1, col2, col3 = st.columns(3)
-            with col1:
-                st.metric("Total Tokens", f"{len(predictions_df):,}")
-            with col2:
-                st.metric("Total Clusters", f"{len(clusters):,}")
-            with col3:
-                avg_tokens = sum(len(tokens) for tokens in clusters.values()) / max(len(clusters), 1)
-                st.metric("Avg. Tokens per Cluster", f"{avg_tokens:.1f}")
-        # Create token selector in sidebar
-        with st.sidebar:
-            st.markdown("### 🔤 Token Selection")
-            # Convert all tokens to strings before sorting to avoid type comparison issues
-            all_tokens = sorted([str(token) for token in predictions_df['Token'].unique()],
-                               key=lambda x: (x.lower() if isinstance(x, str) else str(x)))
-            # Add a search box to filter tokens
-            token_search = st.text_input("🔍 Search tokens", "")
-            if token_search:
-                filtered_tokens = [t for t in all_tokens if token_search.lower() in t.lower()]
-                token_options = filtered_tokens if filtered_tokens else all_tokens
-                if not filtered_tokens:
-                    st.warning(f"No tokens matching '{token_search}'")
-            else:
-                token_options = all_tokens
-            selected_token = st.selectbox(
-                "Select a token:",
-                token_options,
-                index=0 if token_options else None
-            )
         # Main content
         if selected_token:
@@ -243,7 +158,6 @@ def main():
             token_instances = predictions_df[predictions_df['Token'] == selected_token]
             if not token_instances.empty:
-                # Simple header and token display
                 st.title(f"Token: {selected_token}")
                 # Get most frequent cluster (Top 1) for this token
@@ -298,14 +212,6 @@ def main():
                         st.info("No contexts found in this cluster")
             else:
                 st.warning(f"No instances found for token: {selected_token}")
-        else:
-            # Show welcome message when no token is selected
-            st.markdown("""
-                <div style="text-align: center; margin-top: 50px; color: #757575;">
-                    <h3>👈 Select a token from the sidebar to begin</h3>
-                    <p>The visualization will show cluster information and code contexts.</p>
-                </div>
-            """, unsafe_allow_html=True)
     except Exception as e:
         st.error("An error occurred while processing the data")

     layout="wide"
 )
 # Functions to load data
 @st.cache_data
 def load_predictions(file_path):
     """Load the predictions CSV file."""
     full_path = os.path.join("src", file_path)
     df = pd.read_csv(full_path, sep="\t", dtype=str)
     # Convert numeric columns safely
     if not tokens:
         return None
     token_weights = {token: 1 for token in set(tokens)}
     wordcloud = WordCloud(
+        width=1000,
+        height=500,
+        background_color='#FFF0DB',
+        prefer_horizontal=1,
+        min_font_size=10,
+        max_font_size=150,
+        relative_scaling=0.5,
+        collocations=False,
+        margin=1,
+        random_state=42,
+        scale=2,
+        repeat=False,
+        max_words=2000,
+        regexp=r"\w[\w' ]+"
+    ).generate_from_frequencies(token_weights)
+    fig = plt.figure(figsize=(16, 8))
     plt.imshow(wordcloud, interpolation='bilinear')
     plt.axis('off')
     plt.tight_layout(pad=0)
 def load_dev_sentences():
     """Load sentences from dev.in file."""
     try:
         possible_paths = [
             os.path.join("src", "codebert", "compile_error", "dev.in"),
             os.path.join("src", "codebert", "language_classification", "layer11", "dev.in"),
         st.error(f"Error loading dev.in file: {str(e)}")
         return []
 def main():
     st.title("Code Token Cluster Visualization")
+    # Model selection
+    available_models = ["codebert"]
+    model = st.selectbox("Select Model", available_models)
+    # Task selection
+    tasks = ["language_classification"]
+    task = st.selectbox("Select Task", tasks)
+    # Layer selection
+    layers = ["layer6", "layer11"]
+    layer = st.selectbox("Select Layer", layers)
     # Fix the file paths
     layer_dir = os.path.join(model, task, layer)
         clusters = load_clusters(clusters_file)
         sentences = load_input_data(input_file)
+        # Create token selector in sidebar
+        st.sidebar.title("Token Selection")
+        # Convert all tokens to strings before sorting
+        all_tokens = sorted([str(token) for token in predictions_df['Token'].unique()],
+                           key=lambda x: (x.lower() if isinstance(x, str) else str(x)))
+        # Add a search box to filter tokens
+        token_search = st.sidebar.text_input("Search tokens")
+        if token_search:
+            filtered_tokens = [t for t in all_tokens if token_search.lower() in t.lower()]
+            token_options = filtered_tokens if filtered_tokens else all_tokens
+            if not filtered_tokens:
+                st.sidebar.warning(f"No tokens matching '{token_search}'")
+        else:
+            token_options = all_tokens
+        selected_token = st.sidebar.selectbox(
+            "Select a token:",
+            token_options,
+            index=0 if token_options else None
+        )
         # Main content
         if selected_token:
             token_instances = predictions_df[predictions_df['Token'] == selected_token]
             if not token_instances.empty:
                 st.title(f"Token: {selected_token}")
                 # Get most frequent cluster (Top 1) for this token
                         st.info("No contexts found in this cluster")
             else:
                 st.warning(f"No instances found for token: {selected_token}")
     except Exception as e:
         st.error("An error occurred while processing the data")