# NOTE(review): the three lines below were Hugging Face page-scrape residue
# (author avatar caption, commit message, commit hash), not Python source.
# They are commented out so the file parses; original text preserved:
# xianghe's picture
# update
# 96efaf9
from huggingface_hub import hf_hub_download
import gradio as gr
import pandas as pd
import numpy as np
# ๅ…จๅฑ€็ผ“ๅญ˜ๅ˜้‡
_cached_df = None
_cached_models = None
_cached_jailbreak_types = None
_cached_attack_methods = None
_cached_defense_methods = None
# Model icon URLs - ไฝฟ็”จๆททๅˆ CDN ็ญ–็•ฅ
MODEL_ICON_URLS = {
'claude': 'https://registry.npmmirror.com/@lobehub/icons-static-png/1.0.0/files/dark/claude-color.png',
'gpt': 'https://unpkg.com/@lobehub/icons-static-png@1.74.0/light/openai.png',
'gemini': 'https://registry.npmmirror.com/@lobehub/icons-static-png/1.0.0/files/dark/gemini-color.png',
'grok': 'https://registry.npmmirror.com/@lobehub/icons-static-png/1.0.0/files/dark/x-color.png',
'llama': 'https://registry.npmmirror.com/@lobehub/icons-static-png/1.0.0/files/dark/meta-color.png',
'qwen': 'https://registry.npmmirror.com/@lobehub/icons-static-png/1.0.0/files/dark/tongyi-color.png',
'deepseek': 'https://registry.npmmirror.com/@lobehub/icons-static-png/1.0.0/files/dark/deep-seek-color.png',
'glm': 'https://registry.npmmirror.com/@lobehub/icons-static-png/1.0.0/files/dark/zhipu-color.png',
'doubao': 'https://registry.npmmirror.com/@lobehub/icons-static-png/1.0.0/files/dark/doubao-color.png',
'kimi': 'https://unpkg.com/@lobehub/icons-static-png@1.74.0/light/moonshot.png',
'ernie': 'https://registry.npmmirror.com/@lobehub/icons-static-png/1.0.0/files/dark/wenxin-color.png',
'ds': 'https://registry.npmmirror.com/@lobehub/icons-static-png/1.0.0/files/dark/deep-seek-color.png',
'o3': 'https://unpkg.com/@lobehub/icons-static-png@1.74.0/light/openai.png',
'gemma': 'https://registry.npmmirror.com/@lobehub/icons-static-png/1.0.0/files/dark/gemma-color.png',
'phi': 'https://unpkg.com/@lobehub/icons-static-png@1.74.0/light/microsoft-color.png',
}
def get_model_icon_html(model_name):
    """Return an ``<img>`` tag for the vendor icon matching *model_name*.

    Matching is case-insensitive. Names beginning with ``ds-`` are treated
    as DeepSeek shorthand; otherwise the first MODEL_ICON_URLS key found as
    a substring of the name wins (dict insertion order breaks ties).
    Returns an empty string when no key matches.
    """
    lowered = model_name.lower()
    if lowered.startswith('ds-'):
        # Special-case the "DS-" prefix (DeepSeek abbreviation).
        url = MODEL_ICON_URLS.get('deepseek')
    else:
        # Keyword substring match against the icon table.
        url = next((u for k, u in MODEL_ICON_URLS.items() if k in lowered), None)
    if not url:
        return ''
    return (
        f'<img src="{url}" width="20" height="20" '
        'style="vertical-align: middle; margin-right: 8px;" '
        'onerror="this.style.display=\'none\'">'
    )
def format_model_name_with_icon(model_name):
    """Return *model_name* prefixed by its vendor-icon HTML (if any)."""
    return get_model_icon_html(model_name) + model_name
# Load CSV data with caching
def load_csv_data():
    """Load the benchmark CSV into a module-level cache and return it.

    Tries ``panda-bench.csv.gz`` first, then ``panda-bench.csv``; if neither
    is readable, builds a small in-memory dummy frame so the UI still
    renders during development.

    Returns:
        pd.DataFrame: the cached benchmark data (loaded at most once).
    """
    global _cached_df
    if _cached_df is None:
        import gzip
        try:
            # Try to load the compressed file first.  Catching OSError
            # (which covers both FileNotFoundError and gzip.BadGzipFile)
            # means a corrupt archive falls through to the uncompressed
            # file instead of crashing the app at startup.
            with gzip.open("panda-bench.csv.gz", 'rt', encoding='utf-8') as f:
                _cached_df = pd.read_csv(f)
        except OSError:
            try:
                # Fallback to the uncompressed file.
                _cached_df = pd.read_csv("panda-bench.csv")
            except OSError:
                # Last resort: dummy data so the leaderboard is testable
                # without the real dataset present.
                _cached_df = pd.DataFrame({
                    'model_name': ['claude-3-5-sonnet', 'gpt-4o', 'gemini-pro'],
                    'attack_method': ['DEV_MODE_V2', 'DEV_MODE_V2', 'DEV_MODE_V2'],
                    'jailbreak_type': ['Expert advice', 'Economic harm', 'Expert advice'],
                    'GCG': [21.27, 40, 35],
                    'defense_method': ['Paraphrase', 'SelfRemind', 'Paraphrase']
                })
        print(f"CSV loaded with {len(_cached_df)} rows")
    return _cached_df
def get_unique_models():
    """Return the sorted list of model names, computed once and cached."""
    global _cached_models
    if _cached_models is None:
        _cached_models = sorted(load_csv_data()['model_name'].unique())
    return _cached_models
def get_unique_jailbreak_types():
    """Return the sorted list of jailbreak types, computed once and cached."""
    global _cached_jailbreak_types
    if _cached_jailbreak_types is None:
        _cached_jailbreak_types = sorted(load_csv_data()['jailbreak_type'].unique())
    return _cached_jailbreak_types
def get_attack_methods():
    """Return the sorted list of attack methods, computed once and cached."""
    global _cached_attack_methods
    if _cached_attack_methods is None:
        _cached_attack_methods = sorted(load_csv_data()['attack_method'].unique())
    return _cached_attack_methods
def get_evaluation_methods():
    """Return the fixed list of judge/evaluation score columns in the CSV."""
    return [
        "GCG",
        "PAIR_gpt-4o-2024-11-20",
        "PAIR_Qwen_Qwen2.5-72B-Instruct",
        "PAIR_meta-llama_Llama-3.3-70B-Instruct",
    ]
def get_defense_methods():
    """Return the sorted list of defense methods, computed once and cached."""
    global _cached_defense_methods
    if _cached_defense_methods is None:
        _cached_defense_methods = sorted(load_csv_data()['defense_method'].unique())
    return _cached_defense_methods
def format_data(value):
    """Round a numeric score to 2 decimals; return "N/A" for NaN/non-numeric."""
    if not isinstance(value, (int, float)):
        return "N/A"
    if np.isnan(value):
        return "N/A"
    return round(value, 2)
def create_styled_dataframe(headers, data, sort_by_col=None):
    """Build a non-interactive Gradio Dataframe with rank medals and icons.

    When *sort_by_col* names a column, rows are sorted ascending by it
    (lower = better), a "Rank" column with medal emojis for the top three
    is prepended, and a "Model" column is rendered with a vendor icon.
    """
    if not data:
        # Nothing to style: emit an empty table with sane column types.
        return gr.Dataframe(headers=headers, value=data, interactive=False,
                            datatype=["html"] + ["number"] * (len(headers) - 1))

    table = pd.DataFrame(data, columns=headers)

    if sort_by_col and sort_by_col in table.columns:
        # Coerce to numeric so string scores sort correctly; lower is better.
        table[sort_by_col] = pd.to_numeric(table[sort_by_col], errors='coerce')
        table = table.sort_values(by=sort_by_col, ascending=True)
        table = table.reset_index(drop=True)
        # Prepend a Rank column, decorating the top three with medals.
        medals = {0: "🥇 1", 1: "🥈 2", 2: "🥉 3"}
        ranks = [medals.get(pos, str(pos + 1)) for pos in range(len(table))]
        table.insert(0, 'Rank', ranks)
        headers = ['Rank'] + headers

    # The model column sits right after Rank when ranking was applied.
    model_idx = 1 if 'Rank' in headers else 0
    if 'Model' in headers[model_idx]:
        table.iloc[:, model_idx] = table.iloc[:, model_idx].apply(format_model_name_with_icon)

    # Rank and Model columns carry HTML; everything else is numeric.
    if 'Rank' in headers:
        datatypes = ["html", "html"] + ["number"] * (len(headers) - 2)
    else:
        datatypes = ["html"] + ["number"] * (len(headers) - 1)

    return gr.Dataframe(
        headers=headers,
        value=table.values.tolist(),
        interactive=False,
        datatype=datatypes,
        wrap=True,
    )
# Model view: show performance across different attack methods for selected models
def filter_by_model(selected_models, selected_jailbreak_types):
    """Average each selected model's scores per evaluation method.

    Empty selections default to "everything". Returns a styled Gradio
    Dataframe sorted by the overall average (lower = better resistance).
    """
    df = load_csv_data()
    models = selected_models if selected_models else get_unique_models()
    jb_types = (selected_jailbreak_types if selected_jailbreak_types
                else get_unique_jailbreak_types())

    subset = df[df['model_name'].isin(models) & df['jailbreak_type'].isin(jb_types)]
    eval_methods = get_evaluation_methods()

    rows = []
    for model in models:
        per_model = subset[subset['model_name'] == model]
        if per_model.empty:
            continue
        row = [model]
        pooled = []  # every score for this model, across all judges
        for method in eval_methods:
            # Evaluation-method names double as CSV column names.
            if method in per_model.columns:
                scores = pd.to_numeric(per_model[method], errors='coerce').dropna()
            else:
                scores = pd.Series(dtype=float)
            if len(scores) > 0:
                row.append(format_data(scores.mean()))
                pooled.extend(scores.tolist())
            else:
                row.append(format_data(np.nan))
        row.append(format_data(np.mean(pooled) if pooled else np.nan))
        rows.append(row)

    headers = ["Model"] + [f"{method} ⬇️" for method in eval_methods] + ["Overall Avg ⬇️"]
    return create_styled_dataframe(headers, rows, sort_by_col="Overall Avg ⬇️")
# Attack view: show performance of different models for selected attack methods
def filter_by_attack(selected_attacks, selected_evaluation_methods):
    """Build the Attack-view table: one row per model, one column per attack.

    Args:
        selected_attacks: attack-method names to include (empty -> all).
        selected_evaluation_methods: judge column name(s); a bare string is
            wrapped in a list and an empty selection falls back to ["GCG"].

    Returns:
        A styled gr.Dataframe sorted by the per-model "AVG" column
        (lower = better resistance).
    """
    df = load_csv_data()
    if not selected_attacks:
        selected_attacks = get_attack_methods()
    # Normalize the judge selection into a non-empty list.
    if isinstance(selected_evaluation_methods, str):
        selected_evaluation_methods = [selected_evaluation_methods]
    elif not selected_evaluation_methods:
        selected_evaluation_methods = ["GCG"]
    filtered_df = df[df['attack_method'].isin(selected_attacks)]
    models = get_unique_models()
    # Identity mapping from judge name to its CSV column.
    eval_column_map = {
        "GCG": "GCG",
        "PAIR_gpt-4o-2024-11-20": "PAIR_gpt-4o-2024-11-20",
        "PAIR_Qwen_Qwen2.5-72B-Instruct": "PAIR_Qwen_Qwen2.5-72B-Instruct",
        "PAIR_meta-llama_Llama-3.3-70B-Instruct": "PAIR_meta-llama_Llama-3.3-70B-Instruct"
    }
    display_data = []
    for model in models:
        model_data = filtered_df[filtered_df['model_name'] == model]
        if model_data.empty:
            # Model has no rows for the selected attacks: omit it entirely.
            continue
        row = [model]
        all_attack_scores = []  # pooled scores across all attacks for the AVG column
        for attack in selected_attacks:
            attack_data = model_data[model_data['attack_method'] == attack]
            if attack_data.empty:
                row.append(format_data(np.nan))
                continue
            attack_scores = []
            for eval_method in selected_evaluation_methods:
                eval_column = eval_column_map.get(eval_method)
                if eval_column and eval_column in attack_data.columns:
                    scores = pd.to_numeric(attack_data[eval_column], errors='coerce').dropna()
                    attack_scores.extend(scores.tolist())
            avg_score = np.mean(attack_scores) if attack_scores else np.nan
            row.append(format_data(avg_score))
            if attack_scores:
                all_attack_scores.extend(attack_scores)
        overall_avg = np.mean(all_attack_scores) if all_attack_scores else np.nan
        # Insert the overall average just after the model name (column 1).
        if np.isnan(overall_avg):
            row.insert(1, None)
        else:
            row.insert(1, round(overall_avg, 2))
        display_data.append(row)
    eval_method_name = selected_evaluation_methods[0] if selected_evaluation_methods else "N/A"
    headers = [f"Model ({eval_method_name})", "AVG ⬇️"] + [f"{attack} ⬇️" for attack in selected_attacks]
    if display_data:
        # Re-coerce the display values to numeric so sorting works;
        # "N/A" strings become NaN via the empty-string coercion.
        df_result = pd.DataFrame(display_data, columns=headers)
        df_result["AVG ⬇️"] = pd.to_numeric(df_result["AVG ⬇️"], errors='coerce')
        for col in df_result.columns[2:]:
            df_result[col] = pd.to_numeric(df_result[col].astype(str).str.replace('N/A', ''), errors='coerce')
        display_data = df_result.values.tolist()
    return create_styled_dataframe(headers, display_data, sort_by_col="AVG ⬇️")
# Defense view: show performance of different defense methods
def filter_by_defense(selected_defenses, selected_evaluation_methods):
    """Build the Defense-view table: one row per model, one column per defense.

    Args:
        selected_defenses: defense-method names to include (empty -> all).
        selected_evaluation_methods: judge column name(s); a bare string is
            wrapped in a list and an empty selection falls back to ["GCG"].

    Returns:
        A styled gr.Dataframe sorted by the per-model "AVG" column
        (lower = better protection).
    """
    df = load_csv_data()
    if not selected_defenses:
        selected_defenses = get_defense_methods()
    # Normalize the judge selection into a non-empty list.
    if isinstance(selected_evaluation_methods, str):
        selected_evaluation_methods = [selected_evaluation_methods]
    elif not selected_evaluation_methods:
        selected_evaluation_methods = ["GCG"]
    filtered_df = df[df['defense_method'].isin(selected_defenses)]
    models = get_unique_models()
    # Identity mapping from judge name to its CSV column.
    eval_column_map = {
        "GCG": "GCG",
        "PAIR_gpt-4o-2024-11-20": "PAIR_gpt-4o-2024-11-20",
        "PAIR_Qwen_Qwen2.5-72B-Instruct": "PAIR_Qwen_Qwen2.5-72B-Instruct",
        "PAIR_meta-llama_Llama-3.3-70B-Instruct": "PAIR_meta-llama_Llama-3.3-70B-Instruct"
    }
    display_data = []
    for model in models:
        model_data = filtered_df[filtered_df['model_name'] == model]
        if model_data.empty:
            # Model has no rows for the selected defenses: omit it entirely.
            continue
        row = [model]
        all_defense_scores = []  # pooled scores across all defenses for the AVG column
        for defense in selected_defenses:
            defense_data = model_data[model_data['defense_method'] == defense]
            if defense_data.empty:
                row.append(format_data(np.nan))
                continue
            defense_scores = []
            for eval_method in selected_evaluation_methods:
                eval_column = eval_column_map.get(eval_method)
                if eval_column and eval_column in defense_data.columns:
                    scores = pd.to_numeric(defense_data[eval_column], errors='coerce').dropna()
                    defense_scores.extend(scores.tolist())
            avg_score = np.mean(defense_scores) if defense_scores else np.nan
            row.append(format_data(avg_score))
            if defense_scores:
                all_defense_scores.extend(defense_scores)
        overall_avg = np.mean(all_defense_scores) if all_defense_scores else np.nan
        # Insert the overall average just after the model name (column 1).
        if np.isnan(overall_avg):
            row.insert(1, None)
        else:
            row.insert(1, round(overall_avg, 2))
        display_data.append(row)
    eval_method_name = selected_evaluation_methods[0] if selected_evaluation_methods else "N/A"
    headers = [f"Model ({eval_method_name})", "AVG ⬇️"] + [f"{defense} ⬇️" for defense in selected_defenses]
    if display_data:
        # Re-coerce the display values to numeric so sorting works;
        # "N/A" strings become NaN via the empty-string coercion.
        df_result = pd.DataFrame(display_data, columns=headers)
        df_result["AVG ⬇️"] = pd.to_numeric(df_result["AVG ⬇️"], errors='coerce')
        for col in df_result.columns[2:]:
            df_result[col] = pd.to_numeric(df_result[col].astype(str).str.replace('N/A', ''), errors='coerce')
        display_data = df_result.values.tolist()
    return create_styled_dataframe(headers, display_data, sort_by_col="AVG ⬇️")
# Overview: comprehensive statistics
def filter_overview():
    """Aggregate average scores per jailbreak type for every judge column.

    Returns:
        tuple[list, list]: (headers, rows) — one row per jailbreak type,
        ending with the overall average across all judges (lower = better).
    """
    df = load_csv_data()
    methods = get_evaluation_methods()
    rows = []
    for jb_type in get_unique_jailbreak_types():
        chunk = df[df['jailbreak_type'] == jb_type]
        if chunk.empty:
            continue
        row = [jb_type]
        pooled = []  # every score for this type, across all judges
        for method in methods:
            # Judge names double as CSV column names.
            if method in chunk.columns:
                scores = pd.to_numeric(chunk[method], errors='coerce').dropna()
                avg = scores.mean() if len(scores) > 0 else np.nan
                pooled.extend(scores.tolist())
            else:
                avg = np.nan
            row.append(format_data(avg))
        row.append(format_data(np.mean(pooled) if pooled else np.nan))
        rows.append(row)
    headers = ["Jailbreak Type"] + [f"{method} Avg ⬇️" for method in methods] + ["Overall Avg ⬇️"]
    return headers, rows
# Gradio Interface with comprehensive CSS targeting
# Gradio interface. The CSS targets the dataframe's rendered <table> with
# several selector variants so the top-3 row highlighting survives across
# Gradio versions/themes.
with gr.Blocks(title="Jailbreak Attack Results Leaderboard", css="""
/* ็ฌฌไธ€ๅ - ๆต…้‡‘่‰ฒ่ƒŒๆ™ฏ */
table tbody tr:nth-child(1),
table tbody tr:nth-child(1) > *,
.dataframe tbody tr:nth-child(1),
.dataframe tbody tr:nth-child(1) > *,
div[data-testid="dataframe"] table tbody tr:nth-child(1),
div[data-testid="dataframe"] table tbody tr:nth-child(1) > * {
background-color: #FFF9E6 !important;
}
/* ็ฌฌไบŒๅ - ๆต…็ฐ่‰ฒ่ƒŒๆ™ฏ */
table tbody tr:nth-child(2),
table tbody tr:nth-child(2) > *,
.dataframe tbody tr:nth-child(2),
.dataframe tbody tr:nth-child(2) > *,
div[data-testid="dataframe"] table tbody tr:nth-child(2),
div[data-testid="dataframe"] table tbody tr:nth-child(2) > * {
background-color: #F5F5F5 !important;
}
/* ็ฌฌไธ‰ๅ - ๆต…ๆฉ™่‰ฒ่ƒŒๆ™ฏ */
table tbody tr:nth-child(3),
table tbody tr:nth-child(3) > *,
.dataframe tbody tr:nth-child(3),
.dataframe tbody tr:nth-child(3) > *,
div[data-testid="dataframe"] table tbody tr:nth-child(3),
div[data-testid="dataframe"] table tbody tr:nth-child(3) > * {
background-color: #FFF0E6 !important;
}
/* ๆ‚ฌๅœๆ•ˆๆžœ - ๅชๅฏน็ฌฌ4ๅๅŠไปฅๅŽ็”Ÿๆ•ˆ */
table tbody tr:nth-child(n+4):hover,
table tbody tr:nth-child(n+4):hover > *,
.dataframe tbody tr:nth-child(n+4):hover,
.dataframe tbody tr:nth-child(n+4):hover > * {
background-color: #E8F4F8 !important;
}
/* ๅ‰ไธ‰ๅๆ‚ฌๅœๆ—ถไฟๆŒๅŽŸ่ƒŒๆ™ฏ่‰ฒ */
table tbody tr:nth-child(1):hover,
table tbody tr:nth-child(1):hover > *,
.dataframe tbody tr:nth-child(1):hover,
.dataframe tbody tr:nth-child(1):hover > * {
background-color: #FFF9E6 !important;
}
table tbody tr:nth-child(2):hover,
table tbody tr:nth-child(2):hover > *,
.dataframe tbody tr:nth-child(2):hover,
.dataframe tbody tr:nth-child(2):hover > * {
background-color: #F5F5F5 !important;
}
table tbody tr:nth-child(3):hover,
table tbody tr:nth-child(3):hover > *,
.dataframe tbody tr:nth-child(3):hover,
.dataframe tbody tr:nth-child(3):hover > * {
background-color: #FFF0E6 !important;
}
""") as app:
    # Preload the CSV at build time so the first tab render is instant.
    print("Preloading data...")
    load_csv_data()
    print("Data preloaded successfully!")
    gr.Markdown(
        """
# 🛡️ Jailbreak Attack Results Leaderboard
Analyze model performance against different jailbreak attacks across various categories.
Lower scores indicate better resistance to jailbreak attempts.
**🥇 Gold = 1st Place | 🥈 Silver = 2nd Place | 🥉 Bronze = 3rd Place**
"""
    )
    with gr.Tabs():
        # ------------------------------------------------------------------
        # Static information tab.
        # ------------------------------------------------------------------
        with gr.Tab("ℹ️ Information"):
            gr.Markdown(
                """
## 📖 About This Leaderboard
This dashboard displays results from jailbreak attack experiments on various language models.
### 🏆 Ranking System:
- **🥇 1st Place**: Best performing model (lowest score) - Light gold background
- **🥈 2nd Place**: Second best performing model - Light gray background
- **🥉 3rd Place**: Third best performing model - Light orange background
### Usage:
- **Model View**: Compare how different models perform against various evaluation methods
- **Attack View**: Compare how different attacks perform against various models
- **Defense View**: Compare how different defense methods protect against various models
- **Jailbreak Type View**: Get overall statistics across all jailbreak types
### Model Icons:
Official logos from respective companies (mixed CDN strategy for optimal loading)
### Judgement Methods:
- **GCG**: Greedy Coordinate Gradient attack
- **PAIR_gpt-4o**: PAIR attack using GPT-4o
- **PAIR_Qwen**: PAIR attack using Qwen model
- **PAIR_meta-llama**: PAIR attack using Llama model
---
"""
            )
        # ------------------------------------------------------------------
        # Model view: filter by model + jailbreak type.
        # ------------------------------------------------------------------
        with gr.Tab("🤖 Model View"):
            gr.Markdown("### Compare how models perform against various evaluation methods")
            with gr.Row():
                model_select_all = gr.Button("✓ Select All Models", size="sm")
                model_clear_all = gr.Button("✗ Clear All Models", size="sm")
            model_checkbox = gr.CheckboxGroup(
                choices=get_unique_models(),
                label="📋 Select Models",
                value=get_unique_models()
            )
            with gr.Row():
                jailbreak_select_all = gr.Button("✓ Select All Jailbreak Types", size="sm")
                jailbreak_clear_all = gr.Button("✗ Clear All Jailbreak Types", size="sm")
            jailbreak_type_checkbox = gr.CheckboxGroup(
                choices=get_unique_jailbreak_types(),
                label="🎯 Select Jailbreak Types",
                value=get_unique_jailbreak_types()
            )
            model_table = gr.Dataframe(interactive=False)
            def update_model_view(models, jailbreak_types):
                """Refresh the model table from the current checkbox state."""
                return filter_by_model(models, jailbreak_types)
            # Select-all / clear-all button events.
            model_select_all.click(
                fn=lambda: get_unique_models(),
                outputs=model_checkbox
            )
            model_clear_all.click(
                fn=lambda: [],
                outputs=model_checkbox
            )
            jailbreak_select_all.click(
                fn=lambda: get_unique_jailbreak_types(),
                outputs=jailbreak_type_checkbox
            )
            jailbreak_clear_all.click(
                fn=lambda: [],
                outputs=jailbreak_type_checkbox
            )
            # Re-render the table whenever either filter changes.
            for component in [model_checkbox, jailbreak_type_checkbox]:
                component.change(
                    fn=update_model_view,
                    inputs=[model_checkbox, jailbreak_type_checkbox],
                    outputs=model_table
                )
            # Populate the table on initial page load.
            app.load(
                fn=update_model_view,
                inputs=[model_checkbox, jailbreak_type_checkbox],
                outputs=model_table
            )
        # ------------------------------------------------------------------
        # Attack view: filter by attack method + judge.
        # ------------------------------------------------------------------
        with gr.Tab("⚔️ Attack View"):
            gr.Markdown("### Compare attack methods across different models")
            with gr.Row():
                attack_select_all = gr.Button("✓ Select All Attacks", size="sm")
                attack_clear_all = gr.Button("✗ Clear All Attacks", size="sm")
            attack_checkbox = gr.CheckboxGroup(
                choices=get_attack_methods(),
                label="🎯 Select Attack Methods",
                value=get_attack_methods()
            )
            evaluation_method_radio = gr.Radio(
                choices=get_evaluation_methods(),
                label="📊 Select Evaluation Method",
                value="GCG"
            )
            attack_table = gr.Dataframe(interactive=False)
            def update_attack_view(attacks, eval_method):
                """Refresh the attack table; the single radio value becomes a list."""
                return filter_by_attack(attacks, [eval_method])
            # Select-all / clear-all button events.
            attack_select_all.click(
                fn=lambda: get_attack_methods(),
                outputs=attack_checkbox
            )
            attack_clear_all.click(
                fn=lambda: [],
                outputs=attack_checkbox
            )
            # Re-render the table whenever either control changes.
            for component in [attack_checkbox, evaluation_method_radio]:
                component.change(
                    fn=update_attack_view,
                    inputs=[attack_checkbox, evaluation_method_radio],
                    outputs=attack_table
                )
            # Populate the table on initial page load.
            app.load(
                fn=update_attack_view,
                inputs=[attack_checkbox, evaluation_method_radio],
                outputs=attack_table
            )
            # Static heatmap figures, one per judge.
            with gr.Row():
                with gr.Column():
                    gr.Markdown("### 📈 Attack Model Visualization (rule-based GCG judge) ")
                    gr.Image(
                        value="./figs/GCG_attack_model.jpg",
                        interactive=True
                    )
                with gr.Column():
                    gr.Markdown("### 📈 Attack Model Visualization (gpt-4o-based PAIR judge)")
                    gr.Image(
                        value="./figs/attack_model_heatmap.jpg",
                        interactive=True
                    )
                with gr.Column():
                    gr.Markdown("### 📈 Attack Model Visualization (Llama-3.3-70B-based PAIR judge) ")
                    gr.Image(
                        value="./figs/PAIR_llama_attack_model.jpg",
                        interactive=True
                    )
                with gr.Column():
                    gr.Markdown("### 📈 Attack Model Visualization (Qwen2.5-72B-based PAIR judge)")
                    gr.Image(
                        value="./figs/PAIR_qwen_attack_model.jpg",
                        interactive=True
                    )
        # ------------------------------------------------------------------
        # Defense view: filter by defense method + judge.
        # ------------------------------------------------------------------
        with gr.Tab("🛡️ Defense View"):
            gr.Markdown("### Compare defense methods against different attacks")
            with gr.Row():
                defense_select_all = gr.Button("✓ Select All Defenses", size="sm")
                defense_clear_all = gr.Button("✗ Clear All Defenses", size="sm")
            defense_checkbox = gr.CheckboxGroup(
                choices=get_defense_methods(),
                label="🛡️ Select Defense Methods",
                value=get_defense_methods()
            )
            evaluation_method_radio_defense = gr.Radio(
                choices=get_evaluation_methods(),
                label="📊 Select Evaluation Method",
                value="GCG"
            )
            defense_table = gr.Dataframe(interactive=False)
            def update_defense_view(defenses, eval_method):
                """Refresh the defense table; the single radio value becomes a list."""
                return filter_by_defense(defenses, [eval_method])
            # Select-all / clear-all button events.
            defense_select_all.click(
                fn=lambda: get_defense_methods(),
                outputs=defense_checkbox
            )
            defense_clear_all.click(
                fn=lambda: [],
                outputs=defense_checkbox
            )
            # Re-render the table whenever either control changes.
            for component in [defense_checkbox, evaluation_method_radio_defense]:
                component.change(
                    fn=update_defense_view,
                    inputs=[defense_checkbox, evaluation_method_radio_defense],
                    outputs=defense_table
                )
            # Populate the table on initial page load.
            app.load(
                fn=update_defense_view,
                inputs=[defense_checkbox, evaluation_method_radio_defense],
                outputs=defense_table
            )
            # Static heatmap figures, one per judge.
            with gr.Row():
                with gr.Column():
                    gr.Markdown("### 📈 Defense Model Visualization (rule-based GCG judge) ")
                    gr.Image(
                        value="./figs/GCG_defense_model.jpg",
                        interactive=True
                    )
                with gr.Column():
                    gr.Markdown("### 📈 Defense Model Visualization (gpt-4o-based PAIR judge)")
                    gr.Image(
                        value="./figs/defense_model_heatmap.jpg",
                        interactive=True
                    )
                with gr.Column():
                    gr.Markdown("### 📈 Defense Model Visualization (Llama-3.3-70B-based PAIR judge) ")
                    gr.Image(
                        value="./figs/PAIR_llama_defense_model.jpg",
                        interactive=True
                    )
                with gr.Column():
                    gr.Markdown("### 📈 Defense Model Visualization (Qwen2.5-72B-based PAIR judge)")
                    gr.Image(
                        value="./figs/PAIR_qwen_defense_model.jpg",
                        interactive=True
                    )
        # ------------------------------------------------------------------
        # Overview tab: aggregate statistics per jailbreak type.
        # ------------------------------------------------------------------
        with gr.Tab("📊 Jailbreak Type View"):
            gr.Markdown("### Comprehensive statistics across all dimensions")
            overview_table = gr.Dataframe(interactive=False)
            def update_overview():
                """Build the overview table from the aggregated statistics."""
                headers, data = filter_overview()
                return gr.Dataframe(headers=headers, value=data, interactive=False)
            # Populate the table on initial page load.
            app.load(
                fn=update_overview,
                outputs=overview_table
            )
if __name__ == "__main__":
    app.launch(share=False)