xianghe committed on
Commit
96efaf9
·
1 Parent(s): fb82abb
app.py CHANGED
@@ -460,13 +460,35 @@ with gr.Blocks(title="Jailbreak Attack Results Leaderboard", css="""
460
  background-color: #FFF0E6 !important;
461
  }
462
 
463
- /* 悬停效果 */
464
- table tbody tr:hover,
465
- table tbody tr:hover > *,
466
- .dataframe tbody tr:hover,
467
- .dataframe tbody tr:hover > * {
468
  background-color: #E8F4F8 !important;
469
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
470
  """) as app:
471
  # 预加载数据
472
  print("Preloading data...")
@@ -511,41 +533,27 @@ with gr.Blocks(title="Jailbreak Attack Results Leaderboard", css="""
511
  - **PAIR_gpt-4o**: PAIR attack using GPT-4o
512
  - **PAIR_Qwen**: PAIR attack using Qwen model
513
  - **PAIR_meta-llama**: PAIR attack using Llama model
514
-
515
- ### Scoring:
516
- Lower scores indicate better resistance to jailbreak attempts.
517
  ---
518
  """
519
  )
520
 
521
- with gr.Row():
522
- with gr.Column():
523
- gr.Markdown("### 📈 GCG Attack Model Visualization")
524
- gr.Image(
525
- value="./figs/GCG_attack_model.jpg",
526
- label="GCG Attack Model",
527
- show_label=False,
528
- interactive=False
529
- )
530
-
531
- with gr.Column():
532
- gr.Markdown("### 📈 GCG Defense Model Visualization")
533
- gr.Image(
534
- value="./figs/GCG_defense_model.jpg",
535
- label="GCG Defense Model",
536
- show_label=False,
537
- interactive=False
538
- )
539
-
540
  with gr.Tab("🤖 Model View"):
541
  gr.Markdown("### Compare how models perform against various evaluation methods")
542
 
 
 
 
 
543
  model_checkbox = gr.CheckboxGroup(
544
  choices=get_unique_models(),
545
  label="📋 Select Models",
546
  value=get_unique_models()
547
  )
548
 
 
 
 
 
549
  jailbreak_type_checkbox = gr.CheckboxGroup(
550
  choices=get_unique_jailbreak_types(),
551
  label="🎯 Select Jailbreak Types",
@@ -559,6 +567,24 @@ with gr.Blocks(title="Jailbreak Attack Results Leaderboard", css="""
559
  return filter_by_model(models, jailbreak_types)
560
 
561
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
562
  for component in [model_checkbox, jailbreak_type_checkbox]:
563
  component.change(
564
  fn=update_model_view,
@@ -575,6 +601,10 @@ with gr.Blocks(title="Jailbreak Attack Results Leaderboard", css="""
575
  with gr.Tab("⚔️ Attack View"):
576
  gr.Markdown("### Compare attack methods across different models")
577
 
 
 
 
 
578
  attack_checkbox = gr.CheckboxGroup(
579
  choices=get_attack_methods(),
580
  label="🎯 Select Attack Methods",
@@ -594,6 +624,16 @@ with gr.Blocks(title="Jailbreak Attack Results Leaderboard", css="""
594
  return filter_by_attack(attacks, [eval_method])
595
 
596
 
 
 
 
 
 
 
 
 
 
 
597
  for component in [attack_checkbox, evaluation_method_radio]:
598
  component.change(
599
  fn=update_attack_view,
@@ -607,9 +647,42 @@ with gr.Blocks(title="Jailbreak Attack Results Leaderboard", css="""
607
  outputs=attack_table
608
  )
609
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
610
  with gr.Tab("🛡️ Defense View"):
611
  gr.Markdown("### Compare defense methods against different attacks")
612
 
 
 
 
 
613
  defense_checkbox = gr.CheckboxGroup(
614
  choices=get_defense_methods(),
615
  label="🛡️ Select Defense Methods",
@@ -629,6 +702,16 @@ with gr.Blocks(title="Jailbreak Attack Results Leaderboard", css="""
629
  return filter_by_defense(defenses, [eval_method])
630
 
631
 
 
 
 
 
 
 
 
 
 
 
632
  for component in [defense_checkbox, evaluation_method_radio_defense]:
633
  component.change(
634
  fn=update_defense_view,
@@ -642,6 +725,35 @@ with gr.Blocks(title="Jailbreak Attack Results Leaderboard", css="""
642
  outputs=defense_table
643
  )
644
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
645
  with gr.Tab("📊 Jailbreak Type View"):
646
  gr.Markdown("### Comprehensive statistics across all dimensions")
647
 
 
460
  background-color: #FFF0E6 !important;
461
  }
462
 
463
+ /* 悬停效果 - 只对第4名及以后生效 */
464
+ table tbody tr:nth-child(n+4):hover,
465
+ table tbody tr:nth-child(n+4):hover > *,
466
+ .dataframe tbody tr:nth-child(n+4):hover,
467
+ .dataframe tbody tr:nth-child(n+4):hover > * {
468
  background-color: #E8F4F8 !important;
469
  }
470
+
471
+ /* 前三名悬停时保持原背景色 */
472
+ table tbody tr:nth-child(1):hover,
473
+ table tbody tr:nth-child(1):hover > *,
474
+ .dataframe tbody tr:nth-child(1):hover,
475
+ .dataframe tbody tr:nth-child(1):hover > * {
476
+ background-color: #FFF9E6 !important;
477
+ }
478
+
479
+ table tbody tr:nth-child(2):hover,
480
+ table tbody tr:nth-child(2):hover > *,
481
+ .dataframe tbody tr:nth-child(2):hover,
482
+ .dataframe tbody tr:nth-child(2):hover > * {
483
+ background-color: #F5F5F5 !important;
484
+ }
485
+
486
+ table tbody tr:nth-child(3):hover,
487
+ table tbody tr:nth-child(3):hover > *,
488
+ .dataframe tbody tr:nth-child(3):hover,
489
+ .dataframe tbody tr:nth-child(3):hover > * {
490
+ background-color: #FFF0E6 !important;
491
+ }
492
  """) as app:
493
  # 预加载数据
494
  print("Preloading data...")
 
533
  - **PAIR_gpt-4o**: PAIR attack using GPT-4o
534
  - **PAIR_Qwen**: PAIR attack using Qwen model
535
  - **PAIR_meta-llama**: PAIR attack using Llama model
 
 
 
536
  ---
537
  """
538
  )
539
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
540
  with gr.Tab("🤖 Model View"):
541
  gr.Markdown("### Compare how models perform against various evaluation methods")
542
 
543
+ with gr.Row():
544
+ model_select_all = gr.Button("✓ Select All Models", size="sm")
545
+ model_clear_all = gr.Button("✗ Clear All Models", size="sm")
546
+
547
  model_checkbox = gr.CheckboxGroup(
548
  choices=get_unique_models(),
549
  label="📋 Select Models",
550
  value=get_unique_models()
551
  )
552
 
553
+ with gr.Row():
554
+ jailbreak_select_all = gr.Button("✓ Select All Jailbreak Types", size="sm")
555
+ jailbreak_clear_all = gr.Button("✗ Clear All Jailbreak Types", size="sm")
556
+
557
  jailbreak_type_checkbox = gr.CheckboxGroup(
558
  choices=get_unique_jailbreak_types(),
559
  label="🎯 Select Jailbreak Types",
 
567
  return filter_by_model(models, jailbreak_types)
568
 
569
 
570
+ # 全选/取消全选按钮事件
571
+ model_select_all.click(
572
+ fn=lambda: get_unique_models(),
573
+ outputs=model_checkbox
574
+ )
575
+ model_clear_all.click(
576
+ fn=lambda: [],
577
+ outputs=model_checkbox
578
+ )
579
+ jailbreak_select_all.click(
580
+ fn=lambda: get_unique_jailbreak_types(),
581
+ outputs=jailbreak_type_checkbox
582
+ )
583
+ jailbreak_clear_all.click(
584
+ fn=lambda: [],
585
+ outputs=jailbreak_type_checkbox
586
+ )
587
+
588
  for component in [model_checkbox, jailbreak_type_checkbox]:
589
  component.change(
590
  fn=update_model_view,
 
601
  with gr.Tab("⚔️ Attack View"):
602
  gr.Markdown("### Compare attack methods across different models")
603
 
604
+ with gr.Row():
605
+ attack_select_all = gr.Button("✓ Select All Attacks", size="sm")
606
+ attack_clear_all = gr.Button("✗ Clear All Attacks", size="sm")
607
+
608
  attack_checkbox = gr.CheckboxGroup(
609
  choices=get_attack_methods(),
610
  label="🎯 Select Attack Methods",
 
624
  return filter_by_attack(attacks, [eval_method])
625
 
626
 
627
+ # 全选/取消全选按钮事件
628
+ attack_select_all.click(
629
+ fn=lambda: get_attack_methods(),
630
+ outputs=attack_checkbox
631
+ )
632
+ attack_clear_all.click(
633
+ fn=lambda: [],
634
+ outputs=attack_checkbox
635
+ )
636
+
637
  for component in [attack_checkbox, evaluation_method_radio]:
638
  component.change(
639
  fn=update_attack_view,
 
647
  outputs=attack_table
648
  )
649
 
650
+ with gr.Row():
651
+ with gr.Column():
652
+ gr.Markdown("### 📈 Attack Model Visualization (rule-based GCG judge) ")
653
+ gr.Image(
654
+ value="./figs/GCG_attack_model.jpg",
655
+ interactive=True
656
+ )
657
+
658
+ with gr.Column():
659
+ gr.Markdown("### 📈 Attack Model Visualization (gpt-4o-based PAIR judge)")
660
+ gr.Image(
661
+ value="./figs/attack_model_heatmap.jpg",
662
+ interactive=True
663
+ )
664
+
665
+ with gr.Column():
666
+ gr.Markdown("### 📈 Attack Model Visualization (Llama-3.3-70B-based PAIR judge) ")
667
+ gr.Image(
668
+ value="./figs/PAIR_llama_attack_model.jpg",
669
+ interactive=True
670
+ )
671
+
672
+ with gr.Column():
673
+ gr.Markdown("### 📈 Attack Model Visualization (Qwen2.5-72B-based PAIR judge)")
674
+ gr.Image(
675
+ value="./figs/PAIR_qwen_attack_model.jpg",
676
+ interactive=True
677
+ )
678
+
679
  with gr.Tab("🛡️ Defense View"):
680
  gr.Markdown("### Compare defense methods against different attacks")
681
 
682
+ with gr.Row():
683
+ defense_select_all = gr.Button("✓ Select All Defenses", size="sm")
684
+ defense_clear_all = gr.Button("✗ Clear All Defenses", size="sm")
685
+
686
  defense_checkbox = gr.CheckboxGroup(
687
  choices=get_defense_methods(),
688
  label="🛡️ Select Defense Methods",
 
702
  return filter_by_defense(defenses, [eval_method])
703
 
704
 
705
+ # 全选/取消全选按钮事件
706
+ defense_select_all.click(
707
+ fn=lambda: get_defense_methods(),
708
+ outputs=defense_checkbox
709
+ )
710
+ defense_clear_all.click(
711
+ fn=lambda: [],
712
+ outputs=defense_checkbox
713
+ )
714
+
715
  for component in [defense_checkbox, evaluation_method_radio_defense]:
716
  component.change(
717
  fn=update_defense_view,
 
725
  outputs=defense_table
726
  )
727
 
728
+ with gr.Row():
729
+ with gr.Column():
730
+ gr.Markdown("### 📈 Defense Model Visualization (rule-based GCG judge) ")
731
+ gr.Image(
732
+ value="./figs/GCG_defense_model.jpg",
733
+ interactive=True
734
+ )
735
+
736
+ with gr.Column():
737
+ gr.Markdown("### 📈 Defense Model Visualization (gpt-4o-based PAIR judge)")
738
+ gr.Image(
739
+ value="./figs/defense_model_heatmap.jpg",
740
+ interactive=True
741
+ )
742
+
743
+ with gr.Column():
744
+ gr.Markdown("### 📈 Defense Model Visualization (Llama-3.3-70B-based PAIR judge) ")
745
+ gr.Image(
746
+ value="./figs/PAIR_llama_defense_model.jpg",
747
+ interactive=True
748
+ )
749
+
750
+ with gr.Column():
751
+ gr.Markdown("### 📈 Defense Model Visualization (Qwen2.5-72B-based PAIR judge)")
752
+ gr.Image(
753
+ value="./figs/PAIR_qwen_defense_model.jpg",
754
+ interactive=True
755
+ )
756
+
757
  with gr.Tab("📊 Jailbreak Type View"):
758
  gr.Markdown("### Comprehensive statistics across all dimensions")
759
 
figs/PAIR_llama_attack_model.jpg ADDED

Git LFS Details

  • SHA256: c7dbbc7abb2b2e9ff33f41280f112dc0aa256b8b8626ebc9ccda486a5dd901c8
  • Pointer size: 132 Bytes
  • Size of remote file: 1.95 MB
figs/PAIR_llama_defense_model.jpg ADDED

Git LFS Details

  • SHA256: d5d244faaf3c834d36f0b42b26f290f08a18591c46e785a40ef50822258adb1e
  • Pointer size: 132 Bytes
  • Size of remote file: 1.26 MB
figs/PAIR_qwen_attack_model.jpg ADDED

Git LFS Details

  • SHA256: f733d64d056d5e1f2187e33a3cac9cd7e0488f09d61ebd39483d4cabcb616624
  • Pointer size: 132 Bytes
  • Size of remote file: 2.09 MB
figs/PAIR_qwen_defense_model.jpg ADDED

Git LFS Details

  • SHA256: 2eea61f9d9c10f89f5af3ef406aa18b4834dc9bb4bc956d10690fc25e5412233
  • Pointer size: 132 Bytes
  • Size of remote file: 1.3 MB
figs/attack_model_heatmap.jpg ADDED

Git LFS Details

  • SHA256: d3eda0667ae9b93dc1c933810d57e28ac8dbf430eda291fc1f2252e2f5e16a24
  • Pointer size: 132 Bytes
  • Size of remote file: 2.15 MB
figs/defense_model_heatmap.jpg ADDED

Git LFS Details

  • SHA256: 41aa29009f5cc122b10bda979b29f222486658f64971795e7f7ce743a1d50c55
  • Pointer size: 132 Bytes
  • Size of remote file: 1.36 MB