Ric committed on
Commit ·
c7bcf11
1
Parent(s): 62958a5
fix: cap method chart at 100%, redesign efficiency scatter with smaller dots and quadrant shading
Browse files
app.py
CHANGED
|
@@ -479,7 +479,7 @@ def build_method_comparison(base_model: str) -> go.Figure:
|
|
| 479 |
title=f"All Methods Tested on {base_model}",
|
| 480 |
xaxis_title="Tool / Method",
|
| 481 |
yaxis_title="ASR (%)",
|
| 482 |
-
yaxis_range=[0,
|
| 483 |
plot_bgcolor="#0e1117",
|
| 484 |
paper_bgcolor="#0e1117",
|
| 485 |
font_color="#c4c4c4",
|
|
@@ -510,40 +510,61 @@ def build_efficiency_scatter() -> go.Figure:
|
|
| 510 |
|
| 511 |
df = pd.DataFrame(all_results)
|
| 512 |
|
| 513 |
-
fig =
|
| 514 |
-
|
| 515 |
-
|
| 516 |
-
|
| 517 |
-
|
| 518 |
-
|
| 519 |
-
|
| 520 |
-
|
| 521 |
-
|
| 522 |
-
|
| 523 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 524 |
|
| 525 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 526 |
fig.add_shape(type="line", x0=1.0, x1=1.0, y0=0, y1=100,
|
| 527 |
-
line=dict(color="#
|
| 528 |
fig.add_shape(type="line", x0=0, x1=12, y0=50, y1=50,
|
| 529 |
-
line=dict(color="#
|
| 530 |
-
|
| 531 |
-
|
| 532 |
-
fig.add_annotation(x=
|
| 533 |
-
showarrow=False, font=dict(color="#
|
| 534 |
-
fig.add_annotation(x=
|
| 535 |
-
showarrow=False, font=dict(color="#
|
|
|
|
|
|
|
| 536 |
|
| 537 |
fig.update_layout(
|
| 538 |
-
title="Abliteration Efficiency: ASR vs KL Divergence
|
| 539 |
-
xaxis_title="KL Divergence (lower = less damage)",
|
| 540 |
yaxis_title="ASR % (higher = more refusals removed)",
|
| 541 |
-
yaxis_range=[0,
|
|
|
|
| 542 |
plot_bgcolor="#0e1117",
|
| 543 |
paper_bgcolor="#0e1117",
|
| 544 |
font_color="#c4c4c4",
|
| 545 |
legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
|
| 546 |
margin=dict(t=70, b=40),
|
|
|
|
| 547 |
)
|
| 548 |
return fig
|
| 549 |
|
|
|
|
| 479 |
title=f"All Methods Tested on {base_model}",
|
| 480 |
xaxis_title="Tool / Method",
|
| 481 |
yaxis_title="ASR (%)",
|
| 482 |
+
yaxis_range=[0, 100],
|
| 483 |
plot_bgcolor="#0e1117",
|
| 484 |
paper_bgcolor="#0e1117",
|
| 485 |
font_color="#c4c4c4",
|
|
|
|
| 510 |
|
| 511 |
df = pd.DataFrame(all_results)
|
| 512 |
|
| 513 |
+
fig = go.Figure()
|
| 514 |
+
|
| 515 |
+
for source, color, symbol in [("Our Tools", "#e94560", "circle"), ("Community", "#95d5b2", "diamond")]:
|
| 516 |
+
subset = df[df["Source"] == source]
|
| 517 |
+
if subset.empty:
|
| 518 |
+
continue
|
| 519 |
+
fig.add_trace(go.Scatter(
|
| 520 |
+
x=subset["KL"],
|
| 521 |
+
y=subset["ASR (%)"],
|
| 522 |
+
mode="markers",
|
| 523 |
+
name=source,
|
| 524 |
+
marker=dict(
|
| 525 |
+
color=color,
|
| 526 |
+
size=10,
|
| 527 |
+
symbol=symbol,
|
| 528 |
+
line=dict(width=1, color="#222"),
|
| 529 |
+
),
|
| 530 |
+
text=subset["Label"],
|
| 531 |
+
hovertemplate="<b>%{text}</b><br>ASR: %{y:.0f}%<br>KL: %{x:.4f}<extra></extra>",
|
| 532 |
+
))
|
| 533 |
|
| 534 |
+
# Quadrant shading
|
| 535 |
+
fig.add_shape(type="rect", x0=0, x1=1.0, y0=50, y1=100,
|
| 536 |
+
fillcolor="rgba(149,213,178,0.06)", line=dict(width=0))
|
| 537 |
+
fig.add_shape(type="rect", x0=1.0, x1=12, y0=50, y1=100,
|
| 538 |
+
fillcolor="rgba(255,214,10,0.04)", line=dict(width=0))
|
| 539 |
+
fig.add_shape(type="rect", x0=0, x1=12, y0=0, y1=50,
|
| 540 |
+
fillcolor="rgba(233,69,96,0.04)", line=dict(width=0))
|
| 541 |
+
|
| 542 |
+
# Quadrant lines
|
| 543 |
fig.add_shape(type="line", x0=1.0, x1=1.0, y0=0, y1=100,
|
| 544 |
+
line=dict(color="#333", width=1, dash="dot"))
|
| 545 |
fig.add_shape(type="line", x0=0, x1=12, y0=50, y1=50,
|
| 546 |
+
line=dict(color="#333", width=1, dash="dot"))
|
| 547 |
+
|
| 548 |
+
# Quadrant labels
|
| 549 |
+
fig.add_annotation(x=0.3, y=97, text="Best: High ASR, Low KL",
|
| 550 |
+
showarrow=False, font=dict(color="#95d5b2", size=9))
|
| 551 |
+
fig.add_annotation(x=6, y=97, text="Effective but Damaged",
|
| 552 |
+
showarrow=False, font=dict(color="#ffd60a", size=9))
|
| 553 |
+
fig.add_annotation(x=6, y=5, text="Failed",
|
| 554 |
+
showarrow=False, font=dict(color="#666", size=9))
|
| 555 |
|
| 556 |
fig.update_layout(
|
| 557 |
+
title="Abliteration Efficiency: ASR vs KL Divergence",
|
| 558 |
+
xaxis_title="KL Divergence (lower = less damage to model)",
|
| 559 |
yaxis_title="ASR % (higher = more refusals removed)",
|
| 560 |
+
yaxis_range=[0, 102],
|
| 561 |
+
xaxis_range=[-0.2, max(df["KL"].max() * 1.1, 2)],
|
| 562 |
plot_bgcolor="#0e1117",
|
| 563 |
paper_bgcolor="#0e1117",
|
| 564 |
font_color="#c4c4c4",
|
| 565 |
legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
|
| 566 |
margin=dict(t=70, b=40),
|
| 567 |
+
hoverlabel=dict(bgcolor="#1a1a2e", font_size=12),
|
| 568 |
)
|
| 569 |
return fig
|
| 570 |
|