entfane committed on
Commit
3ba5f1a
·
verified ·
1 Parent(s): d7cb09b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +111 -40
app.py CHANGED
@@ -1,7 +1,6 @@
1
  import gradio as gr
2
  import torch
3
  import numpy as np
4
- import pandas as pd
5
  from transformers import AutoTokenizer
6
  from trl import AutoModelForCausalLMWithValueHead
7
 
@@ -18,71 +17,143 @@ model.eval()
18
 
19
 
20
  # ── Core inference ───────────────────────────────────────────────────────────
21
- def analyze(user_message, assistant_reply):
22
  messages = [
23
- {"role": "system", "content": ""},
24
  {"role": "user", "content": user_message},
25
  {"role": "assistant", "content": assistant_reply},
26
  ]
27
 
28
- text = tokenizer.apply_chat_template(messages, tokenize=False)
29
- input_ids = tokenizer(text, return_tensors="pt").input_ids.to(DEVICE)
 
30
 
31
- tokens = tokenizer.convert_ids_to_tokens(input_ids[0].tolist())
32
 
33
  with torch.no_grad():
34
  _, _, values = model(input_ids)
35
 
36
- scores = torch.sigmoid(values[0]).cpu().numpy()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
37
 
38
- # Only keep tokens that belong to the assistant reply
39
- # Find where the assistant reply starts in the token list
40
- reply_tokens = tokenizer(assistant_reply, return_tensors="pt").input_ids[0].tolist()
41
- n_reply = len(reply_tokens)
42
- tokens = tokens[-n_reply:]
43
- scores = scores[-n_reply:]
44
 
45
- def clean(tok):
46
- return tok.replace("Ġ", " ").replace("Ċ", "\\n").strip() or tok
 
47
 
48
- labels = [f"{clean(tok)} [{i}]" for i, tok in enumerate(tokens)]
49
- df = pd.DataFrame({"token": labels, "value score": scores.tolist(), "order": list(range(len(tokens)))})
50
- df = df.sort_values("order").drop(columns="order")
51
 
52
- stats = (
53
  f"**Tokens:** {len(tokens)} | "
54
  f"**Min:** {scores.min():.4f} | "
55
  f"**Max:** {scores.max():.4f} | "
56
- f"**Mean:** {scores.mean():.4f}"
 
57
  )
58
 
59
- return df, stats
60
 
61
 
62
  # ── UI ───────────────────────────────────────────────────────────────────────
63
- with gr.Blocks(theme=gr.themes.Soft(), title="Value Head Visualizer") as demo:
64
-
65
- gr.Markdown("# 🧠 Value Head Visualizer")
66
- gr.Markdown("Per-token sigmoid value scores from `entfane/gpt2_constitutional_classifier_with_value_head`.")
 
 
 
 
 
 
 
 
 
 
 
67
 
68
  with gr.Row():
69
- user_in = gr.Textbox(label="User message", value="How are you doing?", lines=2)
70
- asst_in = gr.Textbox(label="Assistant reply", value="I am good", lines=2)
71
-
72
- run_btn = gr.Button("▶ Analyze", variant="primary")
73
- stats_out = gr.Markdown()
74
- bar_out = gr.BarPlot(
75
- x="token",
76
- y="value score",
77
- title="Per-token value scores",
78
- tooltip=["token", "value score"],
79
- height=500,
80
- y_lim=[0, 1],
81
- x_label_angle=-45, # angled labels so they don't overlap
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
82
  )
83
 
84
- run_btn.click(fn=analyze, inputs=[user_in, asst_in], outputs=[bar_out, stats_out])
85
- demo.load(fn=analyze, inputs=[user_in, asst_in], outputs=[bar_out, stats_out])
 
 
 
 
86
 
87
  if __name__ == "__main__":
88
  demo.launch()
 
1
  import gradio as gr
2
  import torch
3
  import numpy as np
 
4
  from transformers import AutoTokenizer
5
  from trl import AutoModelForCausalLMWithValueHead
6
 
 
17
 
18
 
19
  # ── Core inference ───────────────────────────────────────────────────────────
20
def get_value_scores(system_prompt: str, user_message: str, assistant_reply: str):
    """Run the value-head model over a templated chat and score every token.

    Args:
        system_prompt: System message placed at the start of the chat.
        user_message: Content of the user turn.
        assistant_reply: Content of the assistant turn.

    Returns:
        A ``(tokens, scores)`` pair: the tokenizer's token strings for the
        full templated chat, and a numpy array of sigmoid-activated
        value-head outputs, one score per token (shape: ``(seq_len,)``).
    """
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_message},
        {"role": "assistant", "content": assistant_reply},
    ]

    input_ids = tokenizer.apply_chat_template(
        messages, tokenize=True, return_tensors="pt"
    ).to(DEVICE)

    # convert_ids_to_tokens expects plain Python ints, not a torch tensor
    # row; fast tokenizers raise a TypeError otherwise, so convert first.
    tokens = tokenizer.convert_ids_to_tokens(input_ids[0].tolist())

    with torch.no_grad():
        # Value-head models return (lm_logits, loss, values); only the
        # per-position values are needed here.
        _, _, values = model(input_ids)

    scores = torch.sigmoid(values[0]).cpu().numpy()  # shape: (seq_len,)
    return tokens, scores
38
+
39
+
40
+ # ── Build the HTML heatmap ───────────────────────────────────────────────────
41
def lerp_color(lo, hi, t):
    """Linearly interpolate between two RGB triples at fraction *t*."""
    return tuple(int(c0 + (c1 - c0) * t) for c0, c1 in zip(lo, hi))
43
+
44
+
45
def tokens_to_html(tokens, scores):
    """Render tokens as an inline HTML heatmap colored by value score.

    Each token becomes a ``<span>`` whose background interpolates from dark
    slate (low score) to sky blue (high score); the exact score appears in
    the hover tooltip. Scores are assumed to lie in [0, 1] (sigmoid output).
    """
    lo_rgb = (15, 23, 42)     # dark slate (low value)
    hi_rgb = (56, 189, 248)   # sky-400 (high value)
    bg_rgb = (30, 41, 59)     # slate-800

    rows = []
    for tok, sc in zip(tokens, scores):
        t = float(sc)
        r, g, b = lerp_color(lo_rgb, hi_rgb, t)
        # Pick dark or light text based on perceived background luminance.
        lum = 0.299 * r + 0.587 * g + 0.114 * b
        fg = "#0f172a" if lum > 140 else "#e2e8f0"
        # Escape '&' FIRST so raw ampersands in tokens cannot break the
        # markup or get double-escaped by the '<'/'>' replacements below.
        label = (
            tok.replace("&", "&amp;")
            .replace("<", "&lt;")
            .replace(">", "&gt;")
            .replace("Ġ", "·")
        )
        rows.append(
            f'<span title="score: {t:.4f}" style="'
            f'background:rgb({r},{g},{b});color:{fg};'
            f'padding:3px 6px;margin:2px;border-radius:4px;'
            f'display:inline-block;font-family:monospace;font-size:13px;'
            f'cursor:default;">{label}</span>'
        )

    body = " ".join(rows)
    return (
        f'<div style="background:rgb{bg_rgb};padding:16px;border-radius:10px;'
        f'line-height:2.2;word-break:break-word;">{body}</div>'
    )
70
+
71
+
72
+ # ── Bar-chart data for Gradio BarPlot ────────────────────────────────────────
73
def build_bar_data(tokens, scores):
    """Build the two-column ("token", "value score") DataFrame for BarPlot.

    Each token label carries its position index so duplicated tokens remain
    distinct categories on the chart's x-axis.
    """
    import pandas as pd

    labels = [
        "{} [{}]".format(tok.replace("Ġ", "·"), idx)
        for idx, tok in enumerate(tokens)
    ]
    return pd.DataFrame({"token": labels, "value score": scores.tolist()})
77
 
 
 
 
 
 
 
78
 
79
+ # ── Main handler ─────────────────────────────────────────────────────────────
80
def analyze(system_prompt, user_message, assistant_reply):
    """Score the chat, then produce the heatmap HTML, bar data, and stats."""
    tokens, scores = get_value_scores(system_prompt, user_message, assistant_reply)

    # Summary line shown above the visualizations.
    stats_md = " | ".join(
        [
            f"**Tokens:** {len(tokens)}",
            f"**Min:** {scores.min():.4f}",
            f"**Max:** {scores.max():.4f}",
            f"**Mean:** {scores.mean():.4f}",
            f"**Std:** {scores.std():.4f}",
        ]
    )

    return tokens_to_html(tokens, scores), build_bar_data(tokens, scores), stats_md
95
 
96
 
97
# ── UI ───────────────────────────────────────────────────────────────────────
CSS = """
body { font-family: 'IBM Plex Mono', monospace; }
#title { text-align: center; margin-bottom: 0.5rem; }
#subtitle { text-align: center; color: #94a3b8; margin-top: 0; }
.gr-button-primary { background: #0ea5e9 !important; border: none !important; }
"""

with gr.Blocks(theme=gr.themes.Base(), css=CSS, title="Value Head Visualizer") as demo:

    gr.Markdown("# 🧠 GPT-2 Value Head Visualizer", elem_id="title")
    gr.Markdown(
        "Inspect per-token **value scores** (sigmoid-activated) from a "
        "`AutoModelForCausalLMWithValueHead` GPT-2 model.",
        elem_id="subtitle",
    )

    with gr.Row():
        # Left column: chat inputs and the trigger button.
        with gr.Column(scale=1):
            system_in = gr.Textbox(label="System prompt", placeholder="(optional)", lines=2)
            user_in = gr.Textbox(label="User message", value="How are you doing?", lines=3)
            asst_in = gr.Textbox(label="Assistant reply", value="I am good", lines=3)
            run_btn = gr.Button("▶ Analyze", variant="primary")

        # Right column: stats line, token heatmap, and bar chart.
        with gr.Column(scale=2):
            stats_out = gr.Markdown()
            heatmap_out = gr.HTML(label="Token heatmap (hover for exact score)")
            bar_out = gr.BarPlot(
                x="token",
                y="value score",
                title="Per-token value scores",
                tooltip=["token", "value score"],
                height=300,
                y_lim=[0, 1],
            )

    # Same handler wiring for the button click and the initial page load,
    # so the default example renders immediately.
    handler_inputs = [system_in, user_in, asst_in]
    handler_outputs = [heatmap_out, bar_out, stats_out]
    run_btn.click(fn=analyze, inputs=handler_inputs, outputs=handler_outputs)
    demo.load(fn=analyze, inputs=handler_inputs, outputs=handler_outputs)

if __name__ == "__main__":
    demo.launch()