| from huggingface_hub import hf_hub_download |
| import gradio as gr |
| import pandas as pd |
| import numpy as np |
|
|
| |
# Module-level caches, populated lazily on first access so the benchmark CSV
# is read and scanned only once per process.
_cached_df = None               # full benchmark DataFrame (see load_csv_data)
_cached_models = None           # sorted unique 'model_name' values
_cached_jailbreak_types = None  # sorted unique 'jailbreak_type' values
_cached_attack_methods = None   # sorted unique 'attack_method' values
_cached_defense_methods = None  # sorted unique 'defense_method' values
|
|
| |
# Vendor icon URLs keyed by a lowercase substring of the model name.
# get_model_icon_html() picks the FIRST key (dict insertion order) that occurs
# as a substring of the lowercased model name, so more specific keys should
# precede generic ones. Mixed CDNs (npmmirror / unpkg) are intentional for
# availability across regions.
MODEL_ICON_URLS = {
    'claude': 'https://registry.npmmirror.com/@lobehub/icons-static-png/1.0.0/files/dark/claude-color.png',
    'gpt': 'https://unpkg.com/@lobehub/icons-static-png@1.74.0/light/openai.png',
    'gemini': 'https://registry.npmmirror.com/@lobehub/icons-static-png/1.0.0/files/dark/gemini-color.png',
    'grok': 'https://registry.npmmirror.com/@lobehub/icons-static-png/1.0.0/files/dark/x-color.png',
    'llama': 'https://registry.npmmirror.com/@lobehub/icons-static-png/1.0.0/files/dark/meta-color.png',
    'qwen': 'https://registry.npmmirror.com/@lobehub/icons-static-png/1.0.0/files/dark/tongyi-color.png',
    'deepseek': 'https://registry.npmmirror.com/@lobehub/icons-static-png/1.0.0/files/dark/deep-seek-color.png',
    'glm': 'https://registry.npmmirror.com/@lobehub/icons-static-png/1.0.0/files/dark/zhipu-color.png',
    'doubao': 'https://registry.npmmirror.com/@lobehub/icons-static-png/1.0.0/files/dark/doubao-color.png',
    'kimi': 'https://unpkg.com/@lobehub/icons-static-png@1.74.0/light/moonshot.png',
    'ernie': 'https://registry.npmmirror.com/@lobehub/icons-static-png/1.0.0/files/dark/wenxin-color.png',
    'ds': 'https://registry.npmmirror.com/@lobehub/icons-static-png/1.0.0/files/dark/deep-seek-color.png',
    'o3': 'https://unpkg.com/@lobehub/icons-static-png@1.74.0/light/openai.png',
    'gemma': 'https://registry.npmmirror.com/@lobehub/icons-static-png/1.0.0/files/dark/gemma-color.png',
    'phi': 'https://unpkg.com/@lobehub/icons-static-png@1.74.0/light/microsoft-color.png',
}
|
|
|
|
def get_model_icon_html(model_name):
    """Return an inline ``<img>`` tag for the vendor icon matching *model_name*.

    Matching is case-insensitive substring search over MODEL_ICON_URLS keys;
    returns an empty string when no key matches.
    """
    name = model_name.lower()

    # Models prefixed 'ds-' are DeepSeek variants regardless of other matches.
    if name.startswith('ds-'):
        icon_url = MODEL_ICON_URLS.get('deepseek')
    else:
        # First key (dict insertion order) found as a substring wins.
        icon_url = next(
            (url for key, url in MODEL_ICON_URLS.items() if key in name),
            None,
        )

    if not icon_url:
        return ''
    # onerror hides the image if the CDN is unreachable, so the table degrades
    # gracefully to plain text.
    return (
        f'<img src="{icon_url}" width="20" height="20" '
        f'style="vertical-align: middle; margin-right: 8px;" '
        f'onerror="this.style.display=\'none\'">'
    )
|
|
|
|
def format_model_name_with_icon(model_name):
    """Prefix *model_name* with its vendor icon HTML (empty prefix if none)."""
    return get_model_icon_html(model_name) + model_name
|
|
|
|
| |
def load_csv_data():
    """Load the benchmark CSV once and cache it in ``_cached_df``.

    Resolution order: gzipped file, then plain CSV, then a tiny in-memory
    demo frame so the UI still renders when no data file ships with the app.
    """
    global _cached_df
    if _cached_df is not None:
        return _cached_df

    import gzip

    try:
        with gzip.open("panda-bench.csv.gz", 'rt', encoding='utf-8') as fh:
            _cached_df = pd.read_csv(fh)
    except FileNotFoundError:
        try:
            _cached_df = pd.read_csv("panda-bench.csv")
        except FileNotFoundError:
            # Minimal sample rows matching the real schema.
            _cached_df = pd.DataFrame({
                'model_name': ['claude-3-5-sonnet', 'gpt-4o', 'gemini-pro'],
                'attack_method': ['DEV_MODE_V2', 'DEV_MODE_V2', 'DEV_MODE_V2'],
                'jailbreak_type': ['Expert advice', 'Economic harm', 'Expert advice'],
                'GCG': [21.27, 40, 35],
                'defense_method': ['Paraphrase', 'SelfRemind', 'Paraphrase'],
            })
    print(f"CSV loaded with {len(_cached_df)} rows")
    return _cached_df
|
|
|
|
def get_unique_models():
    """Sorted unique model names from the dataset, cached after first call."""
    global _cached_models
    if _cached_models is None:
        _cached_models = sorted(load_csv_data()['model_name'].unique())
    return _cached_models
|
|
|
|
def get_unique_jailbreak_types():
    """Sorted unique jailbreak types from the dataset, cached after first call."""
    global _cached_jailbreak_types
    if _cached_jailbreak_types is None:
        _cached_jailbreak_types = sorted(load_csv_data()['jailbreak_type'].unique())
    return _cached_jailbreak_types
|
|
|
|
def get_attack_methods():
    """Sorted unique attack methods from the dataset, cached after first call."""
    global _cached_attack_methods
    if _cached_attack_methods is None:
        _cached_attack_methods = sorted(load_csv_data()['attack_method'].unique())
    return _cached_attack_methods
|
|
|
|
def get_evaluation_methods():
    """Fixed list of judge/evaluation score columns exposed in the UI."""
    return [
        "GCG",
        "PAIR_gpt-4o-2024-11-20",
        "PAIR_Qwen_Qwen2.5-72B-Instruct",
        "PAIR_meta-llama_Llama-3.3-70B-Instruct",
    ]
|
|
|
|
def get_defense_methods():
    """Sorted unique defense methods from the dataset, cached after first call."""
    global _cached_defense_methods
    if _cached_defense_methods is None:
        _cached_defense_methods = sorted(load_csv_data()['defense_method'].unique())
    return _cached_defense_methods
|
|
|
|
def format_data(value):
    """Format a score cell for display.

    Returns the value rounded to 2 decimal places when it is a finite number,
    otherwise the string "N/A" (covers NaN, None, and non-numeric input).

    Fix: also accept NumPy scalar numbers (np.integer / np.floating) — pandas
    aggregations return np.int64/np.float64, and plain ``isinstance(value,
    (int, float))`` rejects np.int64, wrongly rendering valid scores as "N/A".
    """
    if isinstance(value, (int, float, np.integer, np.floating)) and not np.isnan(value):
        return round(value, 2)
    return "N/A"
|
|
|
|
def create_styled_dataframe(headers, data, sort_by_col=None):
    """Build a read-only Gradio Dataframe with rank medals for the top 3 rows.

    When *sort_by_col* is given and present, rows are sorted ascending on it
    (lower score = better rank; NaN sorts last). A 'Rank' column is prepended
    and the model column is decorated with an inline vendor icon.
    """
    if not data:
        # No rows to rank: emit a bare table (first column rendered as HTML).
        return gr.Dataframe(headers=headers, value=data, interactive=False,
                            datatype=["html"] + ["number"] * (len(headers) - 1))

    table = pd.DataFrame(data, columns=headers)

    # Sort ascending on the requested column; coercion turns "N/A" into NaN.
    if sort_by_col and sort_by_col in table.columns:
        table[sort_by_col] = pd.to_numeric(table[sort_by_col], errors='coerce')
        table = table.sort_values(by=sort_by_col, ascending=True).reset_index(drop=True)

    # Medal labels for the podium, plain numbers after that.
    medals = {0: "๐ฅ 1", 1: "๐ฅ 2", 2: "๐ฅ 3"}
    ranks = [medals.get(pos, str(pos + 1)) for pos in range(len(table))]

    table.insert(0, 'Rank', ranks)
    headers = ['Rank'] + headers

    # Decorate the model column (the one right after 'Rank') with its icon.
    model_col_idx = 1 if 'Rank' in headers else 0
    if 'Model' in headers[model_col_idx]:
        table.iloc[:, model_col_idx] = table.iloc[:, model_col_idx].apply(format_model_name_with_icon)

    # Rank and model columns carry HTML; everything else is numeric.
    if 'Rank' in headers:
        datatypes = ["html", "html"] + ["number"] * (len(headers) - 2)
    else:
        datatypes = ["html"] + ["number"] * (len(headers) - 1)

    return gr.Dataframe(
        headers=headers,
        value=table.values.tolist(),
        interactive=False,
        datatype=datatypes,
        wrap=True,
    )
|
|
|
|
| |
def filter_by_model(selected_models, selected_jailbreak_types):
    """Model View table: mean score per evaluation method for each model.

    Empty selections are treated as "select everything". Each row ends with
    an overall average pooled across every individual score, and the table is
    ranked ascending on that column (lower = more jailbreak-resistant).
    """
    df = load_csv_data()

    # Empty checkbox groups fall back to the full choice lists.
    selected_models = selected_models or get_unique_models()
    selected_jailbreak_types = selected_jailbreak_types or get_unique_jailbreak_types()

    subset = df[
        df['model_name'].isin(selected_models)
        & df['jailbreak_type'].isin(selected_jailbreak_types)
    ]

    eval_methods = get_evaluation_methods()
    rows = []

    for model in selected_models:
        per_model = subset[subset['model_name'] == model]
        if per_model.empty:
            # Model filtered out entirely (e.g. no rows for chosen types).
            continue

        row = [model]
        pooled = []  # every individual score, for the overall average

        for method in eval_methods:
            if method in per_model.columns:
                scores = pd.to_numeric(per_model[method], errors='coerce').dropna()
            else:
                scores = pd.Series(dtype=float)

            if scores.empty:
                row.append(format_data(np.nan))  # rendered as "N/A"
            else:
                row.append(format_data(scores.mean()))
                pooled.extend(scores.tolist())

        row.append(format_data(np.mean(pooled) if pooled else np.nan))
        rows.append(row)

    headers = ["Model"] + [f"{method} โฌ๏ธ" for method in eval_methods] + ["Overall Avg โฌ๏ธ"]
    return create_styled_dataframe(headers, rows, sort_by_col="Overall Avg โฌ๏ธ")
|
|
|
|
| |
def filter_by_attack(selected_attacks, selected_evaluation_methods):
    """Attack View table: one row per model, one column per attack method.

    Each cell is the mean score for that (model, attack) pair under the
    selected evaluation method(s); an AVG column pooled across all attacks is
    inserted right after the model name and used for ranking (ascending).
    """
    df = load_csv_data()

    # Empty selection means "all attacks".
    if not selected_attacks:
        selected_attacks = get_attack_methods()

    # The Radio widget passes a bare string; normalize to a list. Default to
    # the GCG judge when nothing is selected.
    if isinstance(selected_evaluation_methods, str):
        selected_evaluation_methods = [selected_evaluation_methods]
    elif not selected_evaluation_methods:
        selected_evaluation_methods = ["GCG"]

    filtered_df = df[df['attack_method'].isin(selected_attacks)]
    models = get_unique_models()

    # Evaluation-method name -> CSV column name (identity mapping today; kept
    # as a map so display names could diverge from column names later).
    eval_column_map = {
        "GCG": "GCG",
        "PAIR_gpt-4o-2024-11-20": "PAIR_gpt-4o-2024-11-20",
        "PAIR_Qwen_Qwen2.5-72B-Instruct": "PAIR_Qwen_Qwen2.5-72B-Instruct",
        "PAIR_meta-llama_Llama-3.3-70B-Instruct": "PAIR_meta-llama_Llama-3.3-70B-Instruct"
    }

    display_data = []

    for model in models:
        model_data = filtered_df[filtered_df['model_name'] == model]
        if model_data.empty:
            continue

        row = [model]
        all_attack_scores = []  # pooled scores across all attacks, for AVG

        for attack in selected_attacks:
            attack_data = model_data[model_data['attack_method'] == attack]

            if attack_data.empty:
                row.append(format_data(np.nan))  # rendered as "N/A"
                continue

            # Pool scores over every selected evaluation column.
            attack_scores = []
            for eval_method in selected_evaluation_methods:
                eval_column = eval_column_map.get(eval_method)
                if eval_column and eval_column in attack_data.columns:
                    scores = pd.to_numeric(attack_data[eval_column], errors='coerce').dropna()
                    attack_scores.extend(scores.tolist())

            avg_score = np.mean(attack_scores) if attack_scores else np.nan
            row.append(format_data(avg_score))

            if attack_scores:
                all_attack_scores.extend(attack_scores)

        overall_avg = np.mean(all_attack_scores) if all_attack_scores else np.nan

        # Insert the overall average right after the model name (column 1);
        # None (not "N/A") so the later numeric coercion keeps it NaN.
        if np.isnan(overall_avg):
            row.insert(1, None)
        else:
            row.insert(1, round(overall_avg, 2))

        display_data.append(row)

    eval_method_name = selected_evaluation_methods[0] if selected_evaluation_methods else "N/A"
    headers = [f"Model ({eval_method_name})", "AVG โฌ๏ธ"] + [f"{attack} โฌ๏ธ" for attack in selected_attacks]

    # Re-coerce every score column to numeric so the styled dataframe sorts
    # correctly: "N/A" strings become NaN instead of comparing as text.
    if display_data:
        df_result = pd.DataFrame(display_data, columns=headers)
        df_result["AVG โฌ๏ธ"] = pd.to_numeric(df_result["AVG โฌ๏ธ"], errors='coerce')

        for col in df_result.columns[2:]:
            df_result[col] = pd.to_numeric(df_result[col].astype(str).str.replace('N/A', ''), errors='coerce')

        display_data = df_result.values.tolist()

    return create_styled_dataframe(headers, display_data, sort_by_col="AVG โฌ๏ธ")
|
|
|
|
| |
def filter_by_defense(selected_defenses, selected_evaluation_methods):
    """Defense View table: one row per model, one column per defense method.

    Mirrors filter_by_attack but grouped on 'defense_method': each cell is the
    mean score for that (model, defense) pair under the selected evaluation
    method(s); a pooled AVG column follows the model name and drives ranking.
    """
    df = load_csv_data()

    # Empty selection means "all defenses".
    if not selected_defenses:
        selected_defenses = get_defense_methods()

    # The Radio widget passes a bare string; normalize to a list. Default to
    # the GCG judge when nothing is selected.
    if isinstance(selected_evaluation_methods, str):
        selected_evaluation_methods = [selected_evaluation_methods]
    elif not selected_evaluation_methods:
        selected_evaluation_methods = ["GCG"]

    filtered_df = df[df['defense_method'].isin(selected_defenses)]
    models = get_unique_models()

    # Evaluation-method name -> CSV column name (identity mapping today).
    eval_column_map = {
        "GCG": "GCG",
        "PAIR_gpt-4o-2024-11-20": "PAIR_gpt-4o-2024-11-20",
        "PAIR_Qwen_Qwen2.5-72B-Instruct": "PAIR_Qwen_Qwen2.5-72B-Instruct",
        "PAIR_meta-llama_Llama-3.3-70B-Instruct": "PAIR_meta-llama_Llama-3.3-70B-Instruct"
    }

    display_data = []

    for model in models:
        model_data = filtered_df[filtered_df['model_name'] == model]
        if model_data.empty:
            continue

        row = [model]
        all_defense_scores = []  # pooled scores across all defenses, for AVG

        for defense in selected_defenses:
            defense_data = model_data[model_data['defense_method'] == defense]

            if defense_data.empty:
                row.append(format_data(np.nan))  # rendered as "N/A"
                continue

            # Pool scores over every selected evaluation column.
            defense_scores = []
            for eval_method in selected_evaluation_methods:
                eval_column = eval_column_map.get(eval_method)
                if eval_column and eval_column in defense_data.columns:
                    scores = pd.to_numeric(defense_data[eval_column], errors='coerce').dropna()
                    defense_scores.extend(scores.tolist())

            avg_score = np.mean(defense_scores) if defense_scores else np.nan
            row.append(format_data(avg_score))

            if defense_scores:
                all_defense_scores.extend(defense_scores)

        overall_avg = np.mean(all_defense_scores) if all_defense_scores else np.nan

        # Insert the overall average right after the model name (column 1);
        # None (not "N/A") so the later numeric coercion keeps it NaN.
        if np.isnan(overall_avg):
            row.insert(1, None)
        else:
            row.insert(1, round(overall_avg, 2))

        display_data.append(row)

    eval_method_name = selected_evaluation_methods[0] if selected_evaluation_methods else "N/A"
    headers = [f"Model ({eval_method_name})", "AVG โฌ๏ธ"] + [f"{defense} โฌ๏ธ" for defense in selected_defenses]

    # Re-coerce every score column to numeric so the styled dataframe sorts
    # correctly: "N/A" strings become NaN instead of comparing as text.
    if display_data:
        df_result = pd.DataFrame(display_data, columns=headers)
        df_result["AVG โฌ๏ธ"] = pd.to_numeric(df_result["AVG โฌ๏ธ"], errors='coerce')

        for col in df_result.columns[2:]:
            df_result[col] = pd.to_numeric(df_result[col].astype(str).str.replace('N/A', ''), errors='coerce')

        display_data = df_result.values.tolist()

    return create_styled_dataframe(headers, display_data, sort_by_col="AVG โฌ๏ธ")
|
|
|
|
| |
def filter_overview():
    """Jailbreak Type View: mean score per evaluation method per type.

    Returns ``(headers, rows)`` rather than a widget; the caller wraps the
    result in a Gradio Dataframe itself.
    """
    df = load_csv_data()

    jailbreak_types = get_unique_jailbreak_types()
    eval_methods = get_evaluation_methods()
    rows = []

    for jb_type in jailbreak_types:
        subset = df[df['jailbreak_type'] == jb_type]
        if subset.empty:
            continue

        row = [jb_type]
        pooled = []  # every individual score, for the overall average

        for method in eval_methods:
            if method in subset.columns:
                scores = pd.to_numeric(subset[method], errors='coerce').dropna()
                avg = scores.mean() if len(scores) > 0 else np.nan
                pooled.extend(scores.tolist())
            else:
                avg = np.nan
            row.append(format_data(avg))

        row.append(format_data(np.mean(pooled) if pooled else np.nan))
        rows.append(row)

    headers = ["Jailbreak Type"] + [f"{method} Avg โฌ๏ธ" for method in eval_methods] + ["Overall Avg โฌ๏ธ"]
    return headers, rows
|
|
|
|
| |
# ---------------------------------------------------------------------------
# Gradio UI.
# The inline CSS pins background colors for the top-3 ranked rows (gold /
# gray / orange) — including on hover — and gives rows 4+ a light-blue hover
# highlight. The CSS comments are in Chinese (carried over from the original
# stylesheet); the mojibake characters are an encoding artifact left
# untouched because this is runtime string content.
# ---------------------------------------------------------------------------
with gr.Blocks(title="Jailbreak Attack Results Leaderboard", css="""
/* ็ฌฌไธๅ - ๆต
้่ฒ่ๆฏ */
table tbody tr:nth-child(1),
table tbody tr:nth-child(1) > *,
.dataframe tbody tr:nth-child(1),
.dataframe tbody tr:nth-child(1) > *,
div[data-testid="dataframe"] table tbody tr:nth-child(1),
div[data-testid="dataframe"] table tbody tr:nth-child(1) > * {
background-color: #FFF9E6 !important;
}

/* ็ฌฌไบๅ - ๆต
็ฐ่ฒ่ๆฏ */
table tbody tr:nth-child(2),
table tbody tr:nth-child(2) > *,
.dataframe tbody tr:nth-child(2),
.dataframe tbody tr:nth-child(2) > *,
div[data-testid="dataframe"] table tbody tr:nth-child(2),
div[data-testid="dataframe"] table tbody tr:nth-child(2) > * {
background-color: #F5F5F5 !important;
}

/* ็ฌฌไธๅ - ๆต
ๆฉ่ฒ่ๆฏ */
table tbody tr:nth-child(3),
table tbody tr:nth-child(3) > *,
.dataframe tbody tr:nth-child(3),
.dataframe tbody tr:nth-child(3) > *,
div[data-testid="dataframe"] table tbody tr:nth-child(3),
div[data-testid="dataframe"] table tbody tr:nth-child(3) > * {
background-color: #FFF0E6 !important;
}

/* ๆฌๅๆๆ - ๅชๅฏน็ฌฌ4ๅๅไปฅๅ็ๆ */
table tbody tr:nth-child(n+4):hover,
table tbody tr:nth-child(n+4):hover > *,
.dataframe tbody tr:nth-child(n+4):hover,
.dataframe tbody tr:nth-child(n+4):hover > * {
background-color: #E8F4F8 !important;
}

/* ๅไธๅๆฌๅๆถไฟๆๅ่ๆฏ่ฒ */
table tbody tr:nth-child(1):hover,
table tbody tr:nth-child(1):hover > *,
.dataframe tbody tr:nth-child(1):hover,
.dataframe tbody tr:nth-child(1):hover > * {
background-color: #FFF9E6 !important;
}

table tbody tr:nth-child(2):hover,
table tbody tr:nth-child(2):hover > *,
.dataframe tbody tr:nth-child(2):hover,
.dataframe tbody tr:nth-child(2):hover > * {
background-color: #F5F5F5 !important;
}

table tbody tr:nth-child(3):hover,
table tbody tr:nth-child(3):hover > *,
.dataframe tbody tr:nth-child(3):hover,
.dataframe tbody tr:nth-child(3):hover > * {
background-color: #FFF0E6 !important;
}
""") as app:
    # Warm the CSV cache at build time so the first tab render is instant.
    print("Preloading data...")
    load_csv_data()
    print("Data preloaded successfully!")

    gr.Markdown(
        """
        # ๐ก๏ธ Jailbreak Attack Results Leaderboard

        Analyze model performance against different jailbreak attacks across various categories.
        Lower scores indicate better resistance to jailbreak attempts.

        **๐ฅ Gold = 1st Place | ๐ฅ Silver = 2nd Place | ๐ฅ Bronze = 3rd Place**
        """
    )

    with gr.Tabs():
        # ------------------------------------------------------------------
        # Tab 1: static help / legend text.
        # ------------------------------------------------------------------
        with gr.Tab("โน๏ธ Information"):
            gr.Markdown(
                """
                ## ๐ About This Leaderboard

                This dashboard displays results from jailbreak attack experiments on various language models.

                ### ๐ Ranking System:
                - **๐ฅ 1st Place**: Best performing model (lowest score) - Light gold background
                - **๐ฅ 2nd Place**: Second best performing model - Light gray background
                - **๐ฅ 3rd Place**: Third best performing model - Light orange background

                ### Usage:
                - **Model View**: Compare how different models perform against various evaluation methods
                - **Attack View**: Compare how different attacks perform against various models
                - **Defense View**: Compare how different defense methods protect against various models
                - **Jailbreak Type View**: Get overall statistics across all jailbreak types

                ### Model Icons:
                Official logos from respective companies (mixed CDN strategy for optimal loading)

                ### Judgement Methods:
                - **GCG**: Greedy Coordinate Gradient attack
                - **PAIR_gpt-4o**: PAIR attack using GPT-4o
                - **PAIR_Qwen**: PAIR attack using Qwen model
                - **PAIR_meta-llama**: PAIR attack using Llama model
                ---
                """
            )

        # ------------------------------------------------------------------
        # Tab 2: Model View — filter by model + jailbreak type.
        # ------------------------------------------------------------------
        with gr.Tab("๐ค Model View"):
            gr.Markdown("### Compare how models perform against various evaluation methods")

            with gr.Row():
                model_select_all = gr.Button("โ Select All Models", size="sm")
                model_clear_all = gr.Button("โ Clear All Models", size="sm")

            model_checkbox = gr.CheckboxGroup(
                choices=get_unique_models(),
                label="๐ Select Models",
                value=get_unique_models()  # everything selected by default
            )

            with gr.Row():
                jailbreak_select_all = gr.Button("โ Select All Jailbreak Types", size="sm")
                jailbreak_clear_all = gr.Button("โ Clear All Jailbreak Types", size="sm")

            jailbreak_type_checkbox = gr.CheckboxGroup(
                choices=get_unique_jailbreak_types(),
                label="๐ฏ Select Jailbreak Types",
                value=get_unique_jailbreak_types()
            )

            model_table = gr.Dataframe(interactive=False)

            def update_model_view(models, jailbreak_types):
                """Recompute the Model View table from the current selections."""
                return filter_by_model(models, jailbreak_types)

            # Select-all / clear-all buttons just overwrite the checkbox value;
            # the .change handlers below then refresh the table.
            model_select_all.click(
                fn=lambda: get_unique_models(),
                outputs=model_checkbox
            )
            model_clear_all.click(
                fn=lambda: [],
                outputs=model_checkbox
            )
            jailbreak_select_all.click(
                fn=lambda: get_unique_jailbreak_types(),
                outputs=jailbreak_type_checkbox
            )
            jailbreak_clear_all.click(
                fn=lambda: [],
                outputs=jailbreak_type_checkbox
            )

            for component in [model_checkbox, jailbreak_type_checkbox]:
                component.change(
                    fn=update_model_view,
                    inputs=[model_checkbox, jailbreak_type_checkbox],
                    outputs=model_table
                )

            # Populate the table once when the app first loads.
            app.load(
                fn=update_model_view,
                inputs=[model_checkbox, jailbreak_type_checkbox],
                outputs=model_table
            )

        # ------------------------------------------------------------------
        # Tab 3: Attack View — filter by attack method + judge.
        # ------------------------------------------------------------------
        with gr.Tab("โ๏ธ Attack View"):
            gr.Markdown("### Compare attack methods across different models")

            with gr.Row():
                attack_select_all = gr.Button("โ Select All Attacks", size="sm")
                attack_clear_all = gr.Button("โ Clear All Attacks", size="sm")

            attack_checkbox = gr.CheckboxGroup(
                choices=get_attack_methods(),
                label="๐ฏ Select Attack Methods",
                value=get_attack_methods()
            )

            evaluation_method_radio = gr.Radio(
                choices=get_evaluation_methods(),
                label="๐ Select Evaluation Method",
                value="GCG"
            )

            attack_table = gr.Dataframe(interactive=False)

            def update_attack_view(attacks, eval_method):
                """Recompute the Attack View table (single judge wrapped in a list)."""
                return filter_by_attack(attacks, [eval_method])

            attack_select_all.click(
                fn=lambda: get_attack_methods(),
                outputs=attack_checkbox
            )
            attack_clear_all.click(
                fn=lambda: [],
                outputs=attack_checkbox
            )

            for component in [attack_checkbox, evaluation_method_radio]:
                component.change(
                    fn=update_attack_view,
                    inputs=[attack_checkbox, evaluation_method_radio],
                    outputs=attack_table
                )

            app.load(
                fn=update_attack_view,
                inputs=[attack_checkbox, evaluation_method_radio],
                outputs=attack_table
            )

            # Static heatmap figures, one per judge.
            with gr.Row():
                with gr.Column():
                    gr.Markdown("### ๐ Attack Model Visualization (rule-based GCG judge) ")
                    gr.Image(
                        value="./figs/GCG_attack_model.jpg",
                        interactive=True
                    )

                with gr.Column():
                    gr.Markdown("### ๐ Attack Model Visualization (gpt-4o-based PAIR judge)")
                    gr.Image(
                        value="./figs/attack_model_heatmap.jpg",
                        interactive=True
                    )

                with gr.Column():
                    gr.Markdown("### ๐ Attack Model Visualization (Llama-3.3-70B-based PAIR judge) ")
                    gr.Image(
                        value="./figs/PAIR_llama_attack_model.jpg",
                        interactive=True
                    )

                with gr.Column():
                    gr.Markdown("### ๐ Attack Model Visualization (Qwen2.5-72B-based PAIR judge)")
                    gr.Image(
                        value="./figs/PAIR_qwen_attack_model.jpg",
                        interactive=True
                    )

        # ------------------------------------------------------------------
        # Tab 4: Defense View — filter by defense method + judge.
        # ------------------------------------------------------------------
        with gr.Tab("๐ก๏ธ Defense View"):
            gr.Markdown("### Compare defense methods against different attacks")

            with gr.Row():
                defense_select_all = gr.Button("โ Select All Defenses", size="sm")
                defense_clear_all = gr.Button("โ Clear All Defenses", size="sm")

            defense_checkbox = gr.CheckboxGroup(
                choices=get_defense_methods(),
                label="๐ก๏ธ Select Defense Methods",
                value=get_defense_methods()
            )

            evaluation_method_radio_defense = gr.Radio(
                choices=get_evaluation_methods(),
                label="๐ Select Evaluation Method",
                value="GCG"
            )

            defense_table = gr.Dataframe(interactive=False)

            def update_defense_view(defenses, eval_method):
                """Recompute the Defense View table (single judge wrapped in a list)."""
                return filter_by_defense(defenses, [eval_method])

            defense_select_all.click(
                fn=lambda: get_defense_methods(),
                outputs=defense_checkbox
            )
            defense_clear_all.click(
                fn=lambda: [],
                outputs=defense_checkbox
            )

            for component in [defense_checkbox, evaluation_method_radio_defense]:
                component.change(
                    fn=update_defense_view,
                    inputs=[defense_checkbox, evaluation_method_radio_defense],
                    outputs=defense_table
                )

            app.load(
                fn=update_defense_view,
                inputs=[defense_checkbox, evaluation_method_radio_defense],
                outputs=defense_table
            )

            # Static heatmap figures, one per judge.
            with gr.Row():
                with gr.Column():
                    gr.Markdown("### ๐ Defense Model Visualization (rule-based GCG judge) ")
                    gr.Image(
                        value="./figs/GCG_defense_model.jpg",
                        interactive=True
                    )

                with gr.Column():
                    gr.Markdown("### ๐ Defense Model Visualization (gpt-4o-based PAIR judge)")
                    gr.Image(
                        value="./figs/defense_model_heatmap.jpg",
                        interactive=True
                    )

                with gr.Column():
                    gr.Markdown("### ๐ Defense Model Visualization (Llama-3.3-70B-based PAIR judge) ")
                    gr.Image(
                        value="./figs/PAIR_llama_defense_model.jpg",
                        interactive=True
                    )

                with gr.Column():
                    gr.Markdown("### ๐ Defense Model Visualization (Qwen2.5-72B-based PAIR judge)")
                    gr.Image(
                        value="./figs/PAIR_qwen_defense_model.jpg",
                        interactive=True
                    )

        # ------------------------------------------------------------------
        # Tab 5: aggregated statistics per jailbreak type (no filters).
        # ------------------------------------------------------------------
        with gr.Tab("๐ Jailbreak Type View"):
            gr.Markdown("### Comprehensive statistics across all dimensions")

            overview_table = gr.Dataframe(interactive=False)

            def update_overview():
                """Build the overview table from filter_overview()'s raw rows."""
                headers, data = filter_overview()
                return gr.Dataframe(headers=headers, value=data, interactive=False)

            app.load(
                fn=update_overview,
                outputs=overview_table
            )

# Launch locally only (no public Gradio share link) when run as a script.
if __name__ == "__main__":
    app.launch(share=False)