Spaces:

benchaffe
/

llm-steering-gpt-2

Sleeping

App Files Files Community

benchaffe committed on Dec 17, 2025

Commit

c196b6f

verified ·

1 Parent(s): 7c820c5

Update src/streamlit_app.py

Browse files

Files changed (1) hide show

src/streamlit_app.py +10 -25

src/streamlit_app.py CHANGED Viewed

@@ -19,8 +19,6 @@ HOOK_POINT = "blocks.6.hook_resid_pre"
 DEVICE = "cpu" # Force CPU for free tier stability
 # --- CURATED INTERESTING FEATURES ---
-# Dictionary of known interesting features for this specific SAE
-# (Feature ID, Description, Suggested Steering Strength)
 INTERESTING_FEATURES = {
     "Select a feature...": (None, "Normal model behavior", 0),
     "The 'Love' Feature": (1876, "Fires on words like love, passion, heart", 60.0),
@@ -31,32 +29,31 @@ INTERESTING_FEATURES = {
 }
 # --- LOADER FUNCTIONS (CACHED) ---
 @st.cache_resource
 def load_resources():
-    st.toast("Loading GPT-2 Small... (This may take 30s)", icon="⏳")
     model = HookedTransformer.from_pretrained(MODEL_NAME, device=DEVICE)
-    st.toast("Loading Sparse Autoencoder...", icon="⏳")
     sae, _, _ = SAE.from_pretrained(release=SAE_RELEASE, sae_id=SAE_ID, device=DEVICE)
     return model, sae
-# Load resources immediately
 try:
-    model, sae = load_resources()
     st.success("System Ready: GPT-2 Small + SAE Layer 6 Loaded")
 except Exception as e:
     st.error(f"Error loading models: {e}")
     st.stop()
 # --- MAIN LAYOUT ---
-col1, col2 = st.columns([1, 1.5]) # Make right column slightly wider for text
 # --- COLUMN 1: CONTROLS ---
 with col1:
     st.subheader("1. 🎛️ Control Panel")
-    # Selection Dropdown
     selected_label = st.selectbox(
         "Choose a Concept to Inject:",
         list(INTERESTING_FEATURES.keys())
@@ -68,8 +65,6 @@ with col1:
     if feature_id is not None:
         st.write(f"**Internal Feature ID:** `{feature_id}`")
-        # Slider for Strength
         steering_coeff = st.slider(
             "Injection Strength",
             min_value=-150.0,
@@ -89,25 +84,20 @@ with col2:
     if st.button("Generate Output", type="primary"):
-        # Define the Steering Hook
         def steering_hook(resid_pre, hook):
-            # resid_pre shape: [batch, pos, d_model]
             if feature_id is not None:
-                # Get the decoder vector for the specific feature
                 steering_vector = sae.W_dec[feature_id]
-                # Inject the vector into the stream
                 resid_pre = resid_pre + (steering_coeff * steering_vector)
             return resid_pre
         with st.spinner("Running Inference..."):
-            # 1. Normal Generation (Control)
             st.markdown("### ⚪ Normal Output")
-            # Clear hooks just in case
             model.reset_hooks()
             normal_out = model.generate(prompt, max_new_tokens=25, verbose=False, temperature=0.7)
             st.write(normal_out)
-            # 2. Steered Generation (Test)
             if feature_id is not None:
                 st.markdown(f"### 🔵 Steered Output ('{selected_label}')")
                 with model.hooks(fwd_hooks=[(HOOK_POINT, steering_hook)]):
@@ -116,10 +106,5 @@ with col2:
             else:
                 st.caption("Select a feature to see the steered output.")
-# --- FOOTER ---
 st.divider()
-st.markdown("""
-**How this works:** We use a Sparse Autoencoder (SAE) to decompose GPT-2's internal activations into interpretable features.
-When you select a feature, we mathematically add its vector to the model's residual stream during generation, forcing the model to "think" about that concept.
-*Built with `sae_lens` and `transformer_lens`.*
-""")

 DEVICE = "cpu" # Force CPU for free tier stability
 # --- CURATED INTERESTING FEATURES ---
 INTERESTING_FEATURES = {
     "Select a feature...": (None, "Normal model behavior", 0),
     "The 'Love' Feature": (1876, "Fires on words like love, passion, heart", 60.0),
 }
 # --- LOADER FUNCTIONS (CACHED) ---
+# FIX: Removed st.toast from inside this cached function
 @st.cache_resource
 def load_resources():
+    # We rely on the caller to show the spinner/toast
     model = HookedTransformer.from_pretrained(MODEL_NAME, device=DEVICE)
     sae, _, _ = SAE.from_pretrained(release=SAE_RELEASE, sae_id=SAE_ID, device=DEVICE)
     return model, sae
+# --- MAIN EXECUTION ---
+# Move UI feedback here, OUTSIDE the cached function
 try:
+    with st.spinner("Loading GPT-2 Small & SAE (this may take 30s)..."):
+        model, sae = load_resources()
     st.success("System Ready: GPT-2 Small + SAE Layer 6 Loaded")
 except Exception as e:
     st.error(f"Error loading models: {e}")
     st.stop()
 # --- MAIN LAYOUT ---
+col1, col2 = st.columns([1, 1.5])
 # --- COLUMN 1: CONTROLS ---
 with col1:
     st.subheader("1. 🎛️ Control Panel")
     selected_label = st.selectbox(
         "Choose a Concept to Inject:",
         list(INTERESTING_FEATURES.keys())
     if feature_id is not None:
         st.write(f"**Internal Feature ID:** `{feature_id}`")
         steering_coeff = st.slider(
             "Injection Strength",
             min_value=-150.0,
     if st.button("Generate Output", type="primary"):
         def steering_hook(resid_pre, hook):
             if feature_id is not None:
                 steering_vector = sae.W_dec[feature_id]
                 resid_pre = resid_pre + (steering_coeff * steering_vector)
             return resid_pre
         with st.spinner("Running Inference..."):
+            # 1. Normal Generation
             st.markdown("### ⚪ Normal Output")
             model.reset_hooks()
             normal_out = model.generate(prompt, max_new_tokens=25, verbose=False, temperature=0.7)
             st.write(normal_out)
+            # 2. Steered Generation
             if feature_id is not None:
                 st.markdown(f"### 🔵 Steered Output ('{selected_label}')")
                 with model.hooks(fwd_hooks=[(HOOK_POINT, steering_hook)]):
             else:
                 st.caption("Select a feature to see the steered output.")
 st.divider()
+st.caption("Built with transformer_lens and sae_lens.")