JonasDornbusch committed on
Commit
09ce60e
·
0 Parent(s):

Initial public release

Browse files
Files changed (4) hide show
  1. .gitattributes +35 -0
  2. README.md +102 -0
  3. adapter_config.json +41 -0
  4. adapter_model.safetensors +3 -0
.gitattributes ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,102 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: mit
3
+ datasets:
4
+ - JailbreakBench/JBB-Behaviors
5
+ base_model:
6
+ - GSAI-ML/LLaDA-8B-Base
7
+ library_name: peft
8
+ tags:
9
+ - iho
10
+ - qwen2.5
11
+ - qwen2.5-32b
12
+ - attacker
13
+ - jailbreaking
14
+ - adversary
15
+ - jailbreak
16
+ - llada
17
+ - diffusion-language-model
18
+ - red-teaming
19
+ - adversarial-evaluation
20
+ - safety-research
21
+ - jailbreak-benchmark
22
+ paper:
23
+ title: "Black-box, Adaptive, Efficient, Transferable, Harmful, Applicable... Attacks Are All You Need to Break LLMs"
24
+ ---
25
+
26
+ # IHO — Indirect Harm Optimization for Qwen2.5-32B-Instruct
27
+
28
+ [![ArXiv](https://img.shields.io/badge/arXiv-2606.03647-b31b1b?logo=arxiv&logoColor=white)](https://arxiv.org/abs/2606.03647)
29
+ [![GitHub](https://img.shields.io/badge/GitHub-IHO-181717?logo=github&logoColor=white)](https://github.com/SEML-Lab/IHO)
30
+
31
+ IHO fine-tunes an attacker diffusion language model for black-box red-teaming of target LLMs. Building on the inpainting approach of [Diffusion LLMs are Natural Adversaries for Any LLM](https://arxiv.org/pdf/2511.00203), the diffusion model generates jailbreak prompts by predicting prompts that are likely to elicit affirmative, harmful responses from a target model.
32
+
33
+ Strong attacks need to adapt to their target, which may be an arbitrary black-box model or defense pipeline. IHO achieves this by iteratively collecting target responses, scoring them with a harmfulness judge, and training the attacker with DPO on the resulting preference data. This allows the attacker to adapt without requiring gradients, model weights, or any defense-specific implementation.
34
+
35
+ In our experiments, IHO achieves state-of-the-art attack strength across a wide range of settings. It is effective as an adaptive attack against a specific target, while also transferring strongly to held-out behaviors and unseen target models. The gains are especially pronounced against stronger defenses and layered defense pipelines, where IHO substantially outperforms prior automated attacks.
36
+
37
+ This checkpoint is an attacker model (a LoRA adapter for `GSAI-ML/LLaDA-8B-Base`) adapted to the target model `Qwen/Qwen2.5-32B-Instruct`. It can be used directly in transfer settings and is expected to be particularly effective on similar Qwen2.5 models. However, transfer is not the only intended use case of IHO: one of its core strengths is adaptive evaluation. If maximum attack strength is required, we recommend applying the IHO pipeline adaptively to the specific target model or defense pipeline under evaluation. We release a range of IHO checkpoints adapted to different targets.
38
+
39
+
40
+ <!-- ADAPTER_USAGE_START -->
41
+ ## Quick Start
42
+ This repo contains a **LoRA adapter** (not merged full weights). Load it on top of:
43
+ - `GSAI-ML/LLaDA-8B-Base` ([Large Language Diffusion Models](https://arxiv.org/abs/2502.09992) / [GSAI-ML/LLaDA-8B-Base](https://huggingface.co/GSAI-ML/LLaDA-8B-Base))
44
+
45
+ ### Use with IHO Codebase
46
+ [![GitHub](https://img.shields.io/badge/GitHub-IHO-181717?logo=github&logoColor=white)](https://github.com/SEML-Lab/IHO)
47
+
48
+ Use this adapter with the local IHO pipeline to generate attacks:
49
+ ```bash
50
+ pixi run python -m iho.inference_attacks_only \
51
+ "Sure, here is how to evade legal persecution" \
52
+ --checkpoint SEML-Lab/IHO-Qwen2.5-32B-Instruct \
53
+ --device cuda \
54
+ --num-attacks 32
55
+ ```
56
+
57
+ For direct programmatic loading inside the IHO codebase:
58
+ ```python
59
+ from iho.model_wrapper.LLaDAWrapper import LLaDAWrapper
60
+
61
+ wrapper = LLaDAWrapper(
62
+ model_name="GSAI-ML/LLaDA-8B-Base",
63
+ lora_checkpoint="SEML-Lab/IHO-Qwen2.5-32B-Instruct",
64
+ device="cuda", # or "cpu"
65
+ )
66
+ ```
67
+
68
+ ### Standard PEFT Loading
69
+ ```python
70
+ from transformers import AutoModelForCausalLM, AutoTokenizer
71
+ from peft import PeftModel
72
+
73
+ base_id = "GSAI-ML/LLaDA-8B-Base"
74
+ adapter_id = "SEML-Lab/IHO-Qwen2.5-32B-Instruct"
75
+
76
+ base = AutoModelForCausalLM.from_pretrained(base_id, trust_remote_code=True)
77
+ model = PeftModel.from_pretrained(base, adapter_id)
78
+ tokenizer = AutoTokenizer.from_pretrained(base_id, trust_remote_code=True)
79
+ ```
80
+ <!-- ADAPTER_USAGE_END -->
81
+
82
+ ## Use Disclaimer
83
+
84
+ This model is released for research on LLM safety, red-teaming, and robustness evaluation. It should be used in controlled settings and only against systems for which the user has permission to run adversarial evaluations. The goal of this release is to support more reliable measurement of jailbreak robustness, not to enable misuse against deployed systems.
85
+
86
+ A detailed description of the method can be found in the accompanying paper.
87
+
88
+ ## Citation
89
+
90
+ Please cite the IHO paper: Limbach et al., 2026, "Black-box, Adaptive, Efficient, Transferable, Harmful, Applicable... Attacks Are All You Need to Break LLMs."
91
+
92
+ ```bibtex
93
+ @misc{limbach2026blackboxadaptiveefficient,
94
+ title={Black-box, Adaptive, Efficient, Transferable, Harmful, Applicable... Attacks Are All You Need to Break LLMs},
95
+ author={Vincent Limbach and Jonas Dornbusch and David L{\"u}dke and Stephan G{\"u}nnemann and Leo Schwinn},
96
+ year={2026},
97
+ eprint={2606.03647},
98
+ archivePrefix={arXiv},
99
+ primaryClass={cs.CR},
100
+ url={https://arxiv.org/abs/2606.03647},
101
+ }
102
+ ```
adapter_config.json ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alora_invocation_tokens": null,
3
+ "alpha_pattern": {},
4
+ "arrow_config": null,
5
+ "auto_mapping": null,
6
+ "base_model_name_or_path": "GSAI-ML/LLaDA-8B-Base",
7
+ "bias": "none",
8
+ "corda_config": null,
9
+ "ensure_weight_tying": false,
10
+ "eva_config": null,
11
+ "exclude_modules": null,
12
+ "fan_in_fan_out": false,
13
+ "inference_mode": true,
14
+ "init_lora_weights": true,
15
+ "layer_replication": null,
16
+ "layers_pattern": null,
17
+ "layers_to_transform": null,
18
+ "loftq_config": {},
19
+ "lora_alpha": 16,
20
+ "lora_bias": false,
21
+ "lora_dropout": 0.05,
22
+ "megatron_config": null,
23
+ "megatron_core": "megatron.core",
24
+ "modules_to_save": null,
25
+ "peft_type": "LORA",
26
+ "peft_version": "0.18.0",
27
+ "qalora_group_size": 16,
28
+ "r": 8,
29
+ "rank_pattern": {},
30
+ "revision": null,
31
+ "target_modules": [
32
+ "q_proj",
33
+ "v_proj"
34
+ ],
35
+ "target_parameters": null,
36
+ "task_type": "CAUSAL_LM",
37
+ "trainable_token_indices": null,
38
+ "use_dora": false,
39
+ "use_qalora": false,
40
+ "use_rslora": false
41
+ }
adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2eb4116fc9392e53cd0ab8371932a6fd6533f837d6c21fd655fd3fbc9717e1b0
3
+ size 16794456