update
Browse files- app.py +139 -27
- figs/PAIR_llama_attack_model.jpg +3 -0
- figs/PAIR_llama_defense_model.jpg +3 -0
- figs/PAIR_qwen_attack_model.jpg +3 -0
- figs/PAIR_qwen_defense_model.jpg +3 -0
- figs/attack_model_heatmap.jpg +3 -0
- figs/defense_model_heatmap.jpg +3 -0
app.py
CHANGED
|
@@ -460,13 +460,35 @@ with gr.Blocks(title="Jailbreak Attack Results Leaderboard", css="""
|
|
| 460 |
background-color: #FFF0E6 !important;
|
| 461 |
}
|
| 462 |
|
| 463 |
-
/* Hover effect */
|
| 464 |
-
table tbody tr:hover,
|
| 465 |
-
table tbody tr:hover > *,
|
| 466 |
-
.dataframe tbody tr:hover,
|
| 467 |
-
.dataframe tbody tr:hover > * {
|
| 468 |
background-color: #E8F4F8 !important;
|
| 469 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 470 |
""") as app:
|
| 471 |
# Preload data
|
| 472 |
print("Preloading data...")
|
|
@@ -511,41 +533,27 @@ with gr.Blocks(title="Jailbreak Attack Results Leaderboard", css="""
|
|
| 511 |
- **PAIR_gpt-4o**: PAIR attack using GPT-4o
|
| 512 |
- **PAIR_Qwen**: PAIR attack using Qwen model
|
| 513 |
- **PAIR_meta-llama**: PAIR attack using Llama model
|
| 514 |
-
|
| 515 |
-
### Scoring:
|
| 516 |
-
Lower scores indicate better resistance to jailbreak attempts.
|
| 517 |
---
|
| 518 |
"""
|
| 519 |
)
|
| 520 |
|
| 521 |
-
with gr.Row():
|
| 522 |
-
with gr.Column():
|
| 523 |
-
gr.Markdown("### ๐ GCG Attack Model Visualization")
|
| 524 |
-
gr.Image(
|
| 525 |
-
value="./figs/GCG_attack_model.jpg",
|
| 526 |
-
label="GCG Attack Model",
|
| 527 |
-
show_label=False,
|
| 528 |
-
interactive=False
|
| 529 |
-
)
|
| 530 |
-
|
| 531 |
-
with gr.Column():
|
| 532 |
-
gr.Markdown("### ๐ GCG Defense Model Visualization")
|
| 533 |
-
gr.Image(
|
| 534 |
-
value="./figs/GCG_defense_model.jpg",
|
| 535 |
-
label="GCG Defense Model",
|
| 536 |
-
show_label=False,
|
| 537 |
-
interactive=False
|
| 538 |
-
)
|
| 539 |
-
|
| 540 |
with gr.Tab("๐ค Model View"):
|
| 541 |
gr.Markdown("### Compare how models perform against various evaluation methods")
|
| 542 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 543 |
model_checkbox = gr.CheckboxGroup(
|
| 544 |
choices=get_unique_models(),
|
| 545 |
label="๐ Select Models",
|
| 546 |
value=get_unique_models()
|
| 547 |
)
|
| 548 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 549 |
jailbreak_type_checkbox = gr.CheckboxGroup(
|
| 550 |
choices=get_unique_jailbreak_types(),
|
| 551 |
label="๐ฏ Select Jailbreak Types",
|
|
@@ -559,6 +567,24 @@ with gr.Blocks(title="Jailbreak Attack Results Leaderboard", css="""
|
|
| 559 |
return filter_by_model(models, jailbreak_types)
|
| 560 |
|
| 561 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 562 |
for component in [model_checkbox, jailbreak_type_checkbox]:
|
| 563 |
component.change(
|
| 564 |
fn=update_model_view,
|
|
@@ -575,6 +601,10 @@ with gr.Blocks(title="Jailbreak Attack Results Leaderboard", css="""
|
|
| 575 |
with gr.Tab("โ๏ธ Attack View"):
|
| 576 |
gr.Markdown("### Compare attack methods across different models")
|
| 577 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 578 |
attack_checkbox = gr.CheckboxGroup(
|
| 579 |
choices=get_attack_methods(),
|
| 580 |
label="๐ฏ Select Attack Methods",
|
|
@@ -594,6 +624,16 @@ with gr.Blocks(title="Jailbreak Attack Results Leaderboard", css="""
|
|
| 594 |
return filter_by_attack(attacks, [eval_method])
|
| 595 |
|
| 596 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 597 |
for component in [attack_checkbox, evaluation_method_radio]:
|
| 598 |
component.change(
|
| 599 |
fn=update_attack_view,
|
|
@@ -607,9 +647,42 @@ with gr.Blocks(title="Jailbreak Attack Results Leaderboard", css="""
|
|
| 607 |
outputs=attack_table
|
| 608 |
)
|
| 609 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 610 |
with gr.Tab("๐ก๏ธ Defense View"):
|
| 611 |
gr.Markdown("### Compare defense methods against different attacks")
|
| 612 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 613 |
defense_checkbox = gr.CheckboxGroup(
|
| 614 |
choices=get_defense_methods(),
|
| 615 |
label="๐ก๏ธ Select Defense Methods",
|
|
@@ -629,6 +702,16 @@ with gr.Blocks(title="Jailbreak Attack Results Leaderboard", css="""
|
|
| 629 |
return filter_by_defense(defenses, [eval_method])
|
| 630 |
|
| 631 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 632 |
for component in [defense_checkbox, evaluation_method_radio_defense]:
|
| 633 |
component.change(
|
| 634 |
fn=update_defense_view,
|
|
@@ -642,6 +725,35 @@ with gr.Blocks(title="Jailbreak Attack Results Leaderboard", css="""
|
|
| 642 |
outputs=defense_table
|
| 643 |
)
|
| 644 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 645 |
with gr.Tab("๐ Jailbreak Type View"):
|
| 646 |
gr.Markdown("### Comprehensive statistics across all dimensions")
|
| 647 |
|
|
|
|
| 460 |
background-color: #FFF0E6 !important;
|
| 461 |
}
|
| 462 |
|
| 463 |
+
/* Hover effect - only applies from the 4th row onward */
|
| 464 |
+
table tbody tr:nth-child(n+4):hover,
|
| 465 |
+
table tbody tr:nth-child(n+4):hover > *,
|
| 466 |
+
.dataframe tbody tr:nth-child(n+4):hover,
|
| 467 |
+
.dataframe tbody tr:nth-child(n+4):hover > * {
|
| 468 |
background-color: #E8F4F8 !important;
|
| 469 |
}
|
| 470 |
+
|
| 471 |
+
/* Top three rows keep their original background color on hover */
|
| 472 |
+
table tbody tr:nth-child(1):hover,
|
| 473 |
+
table tbody tr:nth-child(1):hover > *,
|
| 474 |
+
.dataframe tbody tr:nth-child(1):hover,
|
| 475 |
+
.dataframe tbody tr:nth-child(1):hover > * {
|
| 476 |
+
background-color: #FFF9E6 !important;
|
| 477 |
+
}
|
| 478 |
+
|
| 479 |
+
table tbody tr:nth-child(2):hover,
|
| 480 |
+
table tbody tr:nth-child(2):hover > *,
|
| 481 |
+
.dataframe tbody tr:nth-child(2):hover,
|
| 482 |
+
.dataframe tbody tr:nth-child(2):hover > * {
|
| 483 |
+
background-color: #F5F5F5 !important;
|
| 484 |
+
}
|
| 485 |
+
|
| 486 |
+
table tbody tr:nth-child(3):hover,
|
| 487 |
+
table tbody tr:nth-child(3):hover > *,
|
| 488 |
+
.dataframe tbody tr:nth-child(3):hover,
|
| 489 |
+
.dataframe tbody tr:nth-child(3):hover > * {
|
| 490 |
+
background-color: #FFF0E6 !important;
|
| 491 |
+
}
|
| 492 |
""") as app:
|
| 493 |
# Preload data
|
| 494 |
print("Preloading data...")
|
|
|
|
| 533 |
- **PAIR_gpt-4o**: PAIR attack using GPT-4o
|
| 534 |
- **PAIR_Qwen**: PAIR attack using Qwen model
|
| 535 |
- **PAIR_meta-llama**: PAIR attack using Llama model
|
|
|
|
|
|
|
|
|
|
| 536 |
---
|
| 537 |
"""
|
| 538 |
)
|
| 539 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 540 |
with gr.Tab("๐ค Model View"):
|
| 541 |
gr.Markdown("### Compare how models perform against various evaluation methods")
|
| 542 |
|
| 543 |
+
with gr.Row():
|
| 544 |
+
model_select_all = gr.Button("โ Select All Models", size="sm")
|
| 545 |
+
model_clear_all = gr.Button("โ Clear All Models", size="sm")
|
| 546 |
+
|
| 547 |
model_checkbox = gr.CheckboxGroup(
|
| 548 |
choices=get_unique_models(),
|
| 549 |
label="๐ Select Models",
|
| 550 |
value=get_unique_models()
|
| 551 |
)
|
| 552 |
|
| 553 |
+
with gr.Row():
|
| 554 |
+
jailbreak_select_all = gr.Button("โ Select All Jailbreak Types", size="sm")
|
| 555 |
+
jailbreak_clear_all = gr.Button("โ Clear All Jailbreak Types", size="sm")
|
| 556 |
+
|
| 557 |
jailbreak_type_checkbox = gr.CheckboxGroup(
|
| 558 |
choices=get_unique_jailbreak_types(),
|
| 559 |
label="๐ฏ Select Jailbreak Types",
|
|
|
|
| 567 |
return filter_by_model(models, jailbreak_types)
|
| 568 |
|
| 569 |
|
| 570 |
+
# Select-all / clear-all button events
|
| 571 |
+
model_select_all.click(
|
| 572 |
+
fn=lambda: get_unique_models(),
|
| 573 |
+
outputs=model_checkbox
|
| 574 |
+
)
|
| 575 |
+
model_clear_all.click(
|
| 576 |
+
fn=lambda: [],
|
| 577 |
+
outputs=model_checkbox
|
| 578 |
+
)
|
| 579 |
+
jailbreak_select_all.click(
|
| 580 |
+
fn=lambda: get_unique_jailbreak_types(),
|
| 581 |
+
outputs=jailbreak_type_checkbox
|
| 582 |
+
)
|
| 583 |
+
jailbreak_clear_all.click(
|
| 584 |
+
fn=lambda: [],
|
| 585 |
+
outputs=jailbreak_type_checkbox
|
| 586 |
+
)
|
| 587 |
+
|
| 588 |
for component in [model_checkbox, jailbreak_type_checkbox]:
|
| 589 |
component.change(
|
| 590 |
fn=update_model_view,
|
|
|
|
| 601 |
with gr.Tab("โ๏ธ Attack View"):
|
| 602 |
gr.Markdown("### Compare attack methods across different models")
|
| 603 |
|
| 604 |
+
with gr.Row():
|
| 605 |
+
attack_select_all = gr.Button("โ Select All Attacks", size="sm")
|
| 606 |
+
attack_clear_all = gr.Button("โ Clear All Attacks", size="sm")
|
| 607 |
+
|
| 608 |
attack_checkbox = gr.CheckboxGroup(
|
| 609 |
choices=get_attack_methods(),
|
| 610 |
label="๐ฏ Select Attack Methods",
|
|
|
|
| 624 |
return filter_by_attack(attacks, [eval_method])
|
| 625 |
|
| 626 |
|
| 627 |
+
# Select-all / clear-all button events
|
| 628 |
+
attack_select_all.click(
|
| 629 |
+
fn=lambda: get_attack_methods(),
|
| 630 |
+
outputs=attack_checkbox
|
| 631 |
+
)
|
| 632 |
+
attack_clear_all.click(
|
| 633 |
+
fn=lambda: [],
|
| 634 |
+
outputs=attack_checkbox
|
| 635 |
+
)
|
| 636 |
+
|
| 637 |
for component in [attack_checkbox, evaluation_method_radio]:
|
| 638 |
component.change(
|
| 639 |
fn=update_attack_view,
|
|
|
|
| 647 |
outputs=attack_table
|
| 648 |
)
|
| 649 |
|
| 650 |
+
with gr.Row():
|
| 651 |
+
with gr.Column():
|
| 652 |
+
gr.Markdown("### ๐ Attack Model Visualization (rule-based GCG judge) ")
|
| 653 |
+
gr.Image(
|
| 654 |
+
value="./figs/GCG_attack_model.jpg",
|
| 655 |
+
interactive=True
|
| 656 |
+
)
|
| 657 |
+
|
| 658 |
+
with gr.Column():
|
| 659 |
+
gr.Markdown("### ๐ Attack Model Visualization (gpt-4o-based PAIR judge)")
|
| 660 |
+
gr.Image(
|
| 661 |
+
value="./figs/attack_model_heatmap.jpg",
|
| 662 |
+
interactive=True
|
| 663 |
+
)
|
| 664 |
+
|
| 665 |
+
with gr.Column():
|
| 666 |
+
gr.Markdown("### ๐ Attack Model Visualization (Llama-3.3-70B-based PAIR judge) ")
|
| 667 |
+
gr.Image(
|
| 668 |
+
value="./figs/PAIR_llama_attack_model.jpg",
|
| 669 |
+
interactive=True
|
| 670 |
+
)
|
| 671 |
+
|
| 672 |
+
with gr.Column():
|
| 673 |
+
gr.Markdown("### ๐ Attack Model Visualization (Qwen2.5-72B-based PAIR judge)")
|
| 674 |
+
gr.Image(
|
| 675 |
+
value="./figs/PAIR_qwen_attack_model.jpg",
|
| 676 |
+
interactive=True
|
| 677 |
+
)
|
| 678 |
+
|
| 679 |
with gr.Tab("๐ก๏ธ Defense View"):
|
| 680 |
gr.Markdown("### Compare defense methods against different attacks")
|
| 681 |
|
| 682 |
+
with gr.Row():
|
| 683 |
+
defense_select_all = gr.Button("โ Select All Defenses", size="sm")
|
| 684 |
+
defense_clear_all = gr.Button("โ Clear All Defenses", size="sm")
|
| 685 |
+
|
| 686 |
defense_checkbox = gr.CheckboxGroup(
|
| 687 |
choices=get_defense_methods(),
|
| 688 |
label="๐ก๏ธ Select Defense Methods",
|
|
|
|
| 702 |
return filter_by_defense(defenses, [eval_method])
|
| 703 |
|
| 704 |
|
| 705 |
+
# Select-all / clear-all button events
|
| 706 |
+
defense_select_all.click(
|
| 707 |
+
fn=lambda: get_defense_methods(),
|
| 708 |
+
outputs=defense_checkbox
|
| 709 |
+
)
|
| 710 |
+
defense_clear_all.click(
|
| 711 |
+
fn=lambda: [],
|
| 712 |
+
outputs=defense_checkbox
|
| 713 |
+
)
|
| 714 |
+
|
| 715 |
for component in [defense_checkbox, evaluation_method_radio_defense]:
|
| 716 |
component.change(
|
| 717 |
fn=update_defense_view,
|
|
|
|
| 725 |
outputs=defense_table
|
| 726 |
)
|
| 727 |
|
| 728 |
+
with gr.Row():
|
| 729 |
+
with gr.Column():
|
| 730 |
+
gr.Markdown("### ๐ Defense Model Visualization (rule-based GCG judge) ")
|
| 731 |
+
gr.Image(
|
| 732 |
+
value="./figs/GCG_defense_model.jpg",
|
| 733 |
+
interactive=True
|
| 734 |
+
)
|
| 735 |
+
|
| 736 |
+
with gr.Column():
|
| 737 |
+
gr.Markdown("### ๐ Defense Model Visualization (gpt-4o-based PAIR judge)")
|
| 738 |
+
gr.Image(
|
| 739 |
+
value="./figs/defense_model_heatmap.jpg",
|
| 740 |
+
interactive=True
|
| 741 |
+
)
|
| 742 |
+
|
| 743 |
+
with gr.Column():
|
| 744 |
+
gr.Markdown("### ๐ Defense Model Visualization (Llama-3.3-70B-based PAIR judge) ")
|
| 745 |
+
gr.Image(
|
| 746 |
+
value="./figs/PAIR_llama_defense_model.jpg",
|
| 747 |
+
interactive=True
|
| 748 |
+
)
|
| 749 |
+
|
| 750 |
+
with gr.Column():
|
| 751 |
+
gr.Markdown("### ๐ Defense Model Visualization (Qwen2.5-72B-based PAIR judge)")
|
| 752 |
+
gr.Image(
|
| 753 |
+
value="./figs/PAIR_qwen_defense_model.jpg",
|
| 754 |
+
interactive=True
|
| 755 |
+
)
|
| 756 |
+
|
| 757 |
with gr.Tab("๐ Jailbreak Type View"):
|
| 758 |
gr.Markdown("### Comprehensive statistics across all dimensions")
|
| 759 |
|
figs/PAIR_llama_attack_model.jpg
ADDED
|
Git LFS Details
|
figs/PAIR_llama_defense_model.jpg
ADDED
|
Git LFS Details
|
figs/PAIR_qwen_attack_model.jpg
ADDED
|
Git LFS Details
|
figs/PAIR_qwen_defense_model.jpg
ADDED
|
Git LFS Details
|
figs/attack_model_heatmap.jpg
ADDED
|
Git LFS Details
|
figs/defense_model_heatmap.jpg
ADDED
|
Git LFS Details
|