OnAnOrange committed
Commit cb2d4b3 · verified · 1 Parent(s): 1cd2923

Upload folder using huggingface_hub
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,298 @@
+ ---
+ license: apache-2.0
+ ---
+
+ <center> <div style="text-align: center;"> <img src="https://raw.githubusercontent.com/ZHZisZZ/dllm/main/assets/logo.gif" width="400" />
+ </div> </center>
+
+ # Qwen2.5-Coder-0.5B-Instruct-diffusion-v1.1
+
+ Qwen2.5-Coder-0.5B-Instruct-diffusion-v1.1 is a diffusion-based language model created by transforming the autoregressive backbone [Qwen2.5-Coder-0.5B-Instruct](https://huggingface.co/Qwen/Qwen2.5-Coder-0.5B-Instruct) into a diffusion architecture and fine-tuning it with block diffusion techniques within the [dLLM](https://github.com/ZHZisZZ/dllm) framework.
+
+ ## Model Overview
+
+ Qwen2.5-Coder-0.5B-Instruct-diffusion-v1.1 has the following features:
+
+ <!-- - **Architecture**: Transformer encoder with 8192-token context -->
+ - **Training Objective**: [Block Discrete Denoising Diffusion Language Models (BD3-LMs)](https://arxiv.org/pdf/2503.09573)
+ - **Framework**: [dLLM](https://github.com/ZHZisZZ/dllm)
+ - **Base Model**: [Qwen2.5-Coder-0.5B-Instruct](https://huggingface.co/Qwen/Qwen2.5-Coder-0.5B-Instruct)
+ - **Datasets**: [opc-sft-stage1](https://huggingface.co/datasets/OpenCoder-LLM/opc-sft-stage1) and [opc-sft-stage2](https://huggingface.co/datasets/OpenCoder-LLM/opc-sft-stage2)
+
+ For training details, see the [W&B report](https://wandb.ai/asap-zzhou/dllm/reports/dLLM-Tiny-A2D--VmlldzoxNTI2NTEzOA).
+
+ ## Installation
+
+ ```shell
+ pip install torch transformers accelerate
+ ```
+
+ ## Quick Start
+
+ ```python
+ import math
+ import copy
+
+ import torch
+ import torch.nn.functional as F
+ from transformers import AutoTokenizer, AutoModelForMaskedLM
+
+
+ def add_gumbel_noise(logits, temperature):
+     # Gumbel-style perturbation in exp space; temperature == 0 means greedy argmax.
+     if temperature == 0:
+         return logits
+     logits = logits.to(torch.float64)
+     noise = torch.rand_like(logits, dtype=torch.float64)
+     g = (-torch.log(noise)) ** temperature
+     return logits.exp() / g
+
+
+ def get_num_transfer_tokens(mask_index, steps):
+     # Split the masked-token budget evenly over the denoising steps,
+     # handing the remainder to the first few steps.
+     mask_num = mask_index.sum(dim=1, keepdim=True)
+     base = mask_num // steps
+     rem = mask_num % steps
+     out = torch.zeros(mask_num.size(0), steps, device=mask_index.device, dtype=torch.long) + base
+     for i in range(mask_num.size(0)):
+         out[i, : rem[i]] += 1
+     return out
+
+
+ def build_staircase_attention_mask(x, block_size, pad_id):
+     # Block-causal ("staircase") mask: each token attends to all tokens in its
+     # own block and in earlier blocks; padding is excluded on both sides.
+     B, T = x.shape
+     device = x.device
+
+     valid = x != pad_id
+     pos_raw = torch.cumsum(valid.long(), dim=-1)
+     position_ids = torch.where(valid, pos_raw - 1, torch.zeros_like(pos_raw)).long()
+
+     col = torch.arange(T, device=device)
+     block_ids = (col // block_size).view(1, T).expand(B, T)
+     block_ids = torch.where(valid, block_ids, torch.full_like(block_ids, -1))
+
+     q = block_ids.view(B, 1, T, 1)
+     k = block_ids.view(B, 1, 1, T)
+     attn = (k <= q) & (q >= 0) & (k >= 0)
+
+     return attn, position_ids
+
+
+ def diffusion_step_block(logits, x_block, mask_block, num_transfer, temperature, remasking):
+     # One denoising step within a block: sample candidates for the masked
+     # positions, then commit the `num_transfer` most confident ones.
+     B, L, _ = logits.shape
+     if not mask_block.any():
+         return x_block
+
+     noisy = add_gumbel_noise(logits, temperature)
+     x0 = noisy.argmax(dim=-1)
+
+     if remasking == "low_confidence":
+         p = F.softmax(logits, dim=-1)
+         conf = p.gather(-1, x0.unsqueeze(-1)).squeeze(-1)
+     elif remasking == "random":
+         conf = torch.rand((B, L), device=logits.device)
+     else:
+         raise ValueError(remasking)
+
+     x0 = torch.where(mask_block, x0, x_block)
+     neg_inf = torch.full_like(conf, -float("inf"))
+     conf = torch.where(mask_block, conf, neg_inf)
+
+     commit = torch.zeros_like(x_block, dtype=torch.bool)
+     for i in range(B):
+         k = int(num_transfer[i].item())
+         if k > 0:
+             valid = (conf[i] > -float("inf")).sum().item()
+             k = min(k, valid)
+             _, idx = torch.topk(conf[i], k)
+             commit[i, idx] = True
+
+     out = x_block.clone()
+     out[commit] = x0[commit]
+     return out
+
+
+ @torch.no_grad()
+ def generate(
+     model,
+     tokenizer,
+     prompt,
+     steps=128,
+     max_new_tokens=128,
+     block_size=32,
+     temperature=0.0,
+     cfg_scale=0.0,
+     remasking="low_confidence",
+ ):
+     device = model.device
+     mask_id = tokenizer.mask_token_id
+     pad_id = tokenizer.pad_token_id
+
+     x = prompt.to(device).long()  # expected shape: (batch, prompt_len)
+     B = x.size(0)
+
+     num_blocks = math.ceil(max_new_tokens / block_size)
+     steps_per_block = math.ceil(steps / num_blocks)
+     generated = 0
+
+     while generated < max_new_tokens:
+         T_prefix = x.size(1)
+         offset = T_prefix % block_size
+         room = block_size if offset == 0 else block_size - offset
+         cur_len = min(room, max_new_tokens - generated)
+         if cur_len <= 0:
+             break
+
+         attn_pfx, pos_pfx = build_staircase_attention_mask(x, block_size, pad_id)
+
+         # Cache the prefix once per block; every denoising step below reuses it.
+         out = model(x, attention_mask=attn_pfx, position_ids=pos_pfx, use_cache=True)
+         cond_past = out.past_key_values
+
+         if cfg_scale > 0:
+             un_x = x.clone()
+             un_x[:] = mask_id
+             out_un = model(un_x, attention_mask=attn_pfx, position_ids=pos_pfx, use_cache=True)
+             uncond_past = out_un.past_key_values
+         else:
+             uncond_past = None
+
+         block = torch.full((B, cur_len), mask_id, device=device, dtype=torch.long)
+         x = torch.cat([x, block], dim=1)
+         T_total = x.size(1)
+
+         block_mask = x[:, -cur_len:] == mask_id
+         num_transfer = get_num_transfer_tokens(block_mask, steps_per_block)
+         eff_steps = num_transfer.size(1)
+
+         full_attn, full_pos = build_staircase_attention_mask(x, block_size, pad_id)
+         attn_blk = full_attn[:, :, T_prefix:T_total, :]
+         pos_blk = full_pos[:, T_prefix:T_total]
+
+         for t in range(eff_steps):
+             x_blk = x[:, T_prefix:T_total]
+             m_blk = x_blk == mask_id
+
+             cond_logits = model(
+                 x_blk, attention_mask=attn_blk, position_ids=pos_blk,
+                 past_key_values=copy.deepcopy(cond_past), use_cache=False
+             ).logits
+
+             logits = cond_logits
+             if cfg_scale > 0:
+                 # Classifier-free guidance: extrapolate away from the unconditional logits.
+                 un_logits = model(
+                     x_blk, attention_mask=attn_blk, position_ids=pos_blk,
+                     past_key_values=copy.deepcopy(uncond_past), use_cache=False
+                 ).logits
+                 logits = un_logits + (cfg_scale + 1.0) * (cond_logits - un_logits)
+
+             x_blk_new = diffusion_step_block(
+                 logits, x_blk, m_blk, num_transfer[:, t], temperature, remasking
+             )
+             x[:, T_prefix:T_total] = x_blk_new
+
+             if (x_blk_new == tokenizer.eos_token_id).any():
+                 break
+
+         generated += cur_len
+
+     return x
+
+
+ device = "cuda"
+ model = AutoModelForMaskedLM.from_pretrained("dllm-collection/Qwen2.5-Coder-0.5B-Instruct-diffusion-v1.1", dtype=torch.bfloat16, trust_remote_code=True).to(device).eval()
+ tokenizer = AutoTokenizer.from_pretrained("dllm-collection/Qwen2.5-Coder-0.5B-Instruct-diffusion-v1.1", trust_remote_code=True)
+
+ question = "Lily can run 12 kilometers per hour for 4 hours. After that, she runs 6 kilometers per hour. How many kilometers can she run in 8 hours?"
+ messages = [
+     {"role": "system", "content": "You are a helpful AI assistant."},
+     {"role": "user", "content": question}
+ ]
+ prompt = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
+
+ input_ids = tokenizer(prompt)["input_ids"]
+ input_ids = torch.tensor(input_ids).to(device).unsqueeze(0)
+ text = generate(model, tokenizer, input_ids, steps=256, max_new_tokens=256, block_size=32, temperature=0.0, cfg_scale=0.0, remasking="low_confidence")
+ print(tokenizer.batch_decode(text[:, input_ids.shape[1]:], skip_special_tokens=False)[0])
+ ```
+
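+ Because the script decodes with `skip_special_tokens=False`, the raw output can contain special tokens such as `<|mask|>` or `<|im_end|>`. Below is a minimal cleanup sketch, reusing `text`, `input_ids`, and `tokenizer` from the script above; this helper is not part of the original example:
+
+ ```python
+ # Hypothetical post-processing (not in the original script): truncate at the
+ # first EOS token, then decode without special tokens.
+ out_ids = text[0, input_ids.shape[1]:]
+ eos_pos = (out_ids == tokenizer.eos_token_id).nonzero()
+ if len(eos_pos) > 0:
+     out_ids = out_ids[: eos_pos[0].item()]
+ print(tokenizer.decode(out_ids, skip_special_tokens=True))
+ ```
+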
+ ## Generation Parameters
+
+ | Parameter        | Description                                                                                          | Default          |
+ | ---------------- | ---------------------------------------------------------------------------------------------------- | ---------------- |
+ | `max_new_tokens` | Number of tokens to generate                                                                           | 256              |
+ | `steps`          | Number of diffusion denoising iterations                                                               | 256              |
+ | `temperature`    | Sampling temperature; set to `0.0` for deterministic generation                                        | 0.0              |
+ | `block_size`     | Token block size used during iterative denoising                                                       | 32               |
+ | `cfg_scale`      | Classifier-free guidance scale; higher values push generations more strongly toward the prompt         | 0.0              |
+ | `remasking`      | Criterion for choosing which masked tokens to commit at each denoising step (`random` or `low_confidence`) | `low_confidence` |
+
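+ For illustration, a stochastic run with classifier-free guidance could look like the sketch below, reusing `generate`, `model`, `tokenizer`, and `input_ids` from the Quick Start; the parameter values here are illustrative, not tuned recommendations:
+
+ ```python
+ # Stochastic decoding sketch: Gumbel temperature > 0, random remasking, mild CFG.
+ text = generate(
+     model, tokenizer, input_ids,
+     steps=256, max_new_tokens=256, block_size=32,
+     temperature=0.7, cfg_scale=1.5, remasking="random",
+ )
+ print(tokenizer.batch_decode(text[:, input_ids.shape[1]:], skip_special_tokens=True)[0])
+ ```
+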
+ ## Command-Line Interface
+
+ Use the GitHub repo's demo script [examples/a2d/bm3lm/chat.py](https://github.com/ZHZisZZ/dllm/blob/main/examples/a2d/bm3lm/chat.py) for visualized generation:
+
+ ```shell
+ python -u examples/a2d/bm3lm/chat.py \
+     --model_name_or_path dllm-collection/Qwen2.5-Coder-0.5B-Instruct-diffusion-v1.1 \
+     --chat True
+ ```
+
+ ## Evaluation
+
+ <table style="border-collapse: collapse; width: 60%; text-align: center;">
+   <thead>
+     <tr style="border-bottom: 3px solid #333;">
+       <th style="padding: 8px; min-width: 320px; text-align: left;">Model</th>
+       <th style="padding: 8px;">HumanEval</th>
+       <th style="padding: 8px;">MBPP</th>
+     </tr>
+   </thead>
+
+   <!-- Diffusion model v1.1 highlighted -->
+   <tr style="background-color: #e8f2ff;">
+     <td style="padding: 8px;"><a href="https://huggingface.co/dllm-collection/Qwen2.5-Coder-0.5B-Instruct-diffusion-v1.1"><code>Qwen2.5-Coder-0.5B-Instruct-diffusion-v1.1</code></a> (evaluated)</td>
+     <td>41.5</td><td>33.6</td>
+   </tr>
+
+   <!-- Diffusion model v0.1 highlighted -->
+   <tr style="background-color: #e8f2ff;">
+     <td style="padding: 8px;"><a href="https://huggingface.co/dllm-collection/Qwen2.5-Coder-0.5B-Instruct-diffusion-v0.1"><code>Qwen2.5-Coder-0.5B-Instruct-diffusion-v0.1</code></a> (evaluated)</td>
+     <td>28.1</td><td>23.0</td>
+   </tr>
+
+   <tr style="background-color: #e8f2ff;">
+     <td style="padding: 8px;"><a href="https://huggingface.co/fredzzp/open-dcoder-0.5B"><code>open-dcoder-0.5B</code></a> (reported)</td>
+     <td>20.8</td><td>35.2</td>
+   </tr>
+   <tr>
+     <td colspan="3" style="padding: 0; border-top: 3px double #666;"></td>
+   </tr>
+
+   <tr>
+     <td style="padding: 8px;"><a href="https://huggingface.co/Qwen/Qwen2.5-Coder-0.5B-Instruct"><code>Qwen2.5-Coder-0.5B-Instruct</code></a> (reported)</td>
+     <td>28.0</td><td>52.9</td>
+   </tr>
+
+ </table>
+
+ To automatically evaluate Qwen2.5-Coder-0.5B-Instruct-diffusion-v1.1 on all benchmarks, run:
+
+ ```shell
+ bash examples/a2d/eval_bm3lm.sh \
+     --model_name_or_path dllm-collection/Qwen2.5-Coder-0.5B-Instruct-diffusion-v1.1
+ ```
+
+ ## Citation
+
+ If you use Qwen2.5-Coder-0.5B-Instruct-diffusion-v1.1 or dLLM, please cite:
+
+ ```bibtex
+ @misc{dllm,
+     author = {Zhanhui Zhou and Lingjie Chen and Hanghang Tong and Dawn Song},
+     title = {dLLM: Simple Diffusion Language Modeling},
+     year = {2025},
+     howpublished = {\url{https://github.com/ZHZisZZ/dllm}},
+ }
+ ```
added_tokens.json ADDED
@@ -0,0 +1,25 @@
+ {
+   "</tool_call>": 151658,
+   "<tool_call>": 151657,
+   "<|box_end|>": 151649,
+   "<|box_start|>": 151648,
+   "<|endoftext|>": 151643,
+   "<|file_sep|>": 151664,
+   "<|fim_middle|>": 151660,
+   "<|fim_pad|>": 151662,
+   "<|fim_prefix|>": 151659,
+   "<|fim_suffix|>": 151661,
+   "<|im_end|>": 151645,
+   "<|im_start|>": 151644,
+   "<|image_pad|>": 151655,
+   "<|mask|>": 151665,
+   "<|object_ref_end|>": 151647,
+   "<|object_ref_start|>": 151646,
+   "<|quad_end|>": 151651,
+   "<|quad_start|>": 151650,
+   "<|repo_name|>": 151663,
+   "<|video_pad|>": 151656,
+   "<|vision_end|>": 151653,
+   "<|vision_pad|>": 151654,
+   "<|vision_start|>": 151652
+ }
chat_template.jinja ADDED
@@ -0,0 +1,54 @@
+ {%- if tools %}
+     {{- '<|im_start|>system\n' }}
+     {%- if messages[0]['role'] == 'system' %}
+         {{- messages[0]['content'] }}
+     {%- else %}
+         {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}
+     {%- endif %}
+     {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+     {%- for tool in tools %}
+         {{- "\n" }}
+         {{- tool | tojson }}
+     {%- endfor %}
+     {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+ {%- else %}
+     {%- if messages[0]['role'] == 'system' %}
+         {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}
+     {%- else %}
+         {{- '<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n' }}
+     {%- endif %}
+ {%- endif %}
+ {%- for message in messages %}
+     {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %}
+         {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+     {%- elif message.role == "assistant" %}
+         {{- '<|im_start|>' + message.role }}
+         {%- if message.content %}
+             {{- '\n' + message.content }}
+         {%- endif %}
+         {%- for tool_call in message.tool_calls %}
+             {%- if tool_call.function is defined %}
+                 {%- set tool_call = tool_call.function %}
+             {%- endif %}
+             {{- '\n<tool_call>\n{"name": "' }}
+             {{- tool_call.name }}
+             {{- '", "arguments": ' }}
+             {{- tool_call.arguments | tojson }}
+             {{- '}\n</tool_call>' }}
+         {%- endfor %}
+         {{- '<|im_end|>\n' }}
+     {%- elif message.role == "tool" %}
+         {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %}
+             {{- '<|im_start|>user' }}
+         {%- endif %}
+         {{- '\n<tool_response>\n' }}
+         {{- message.content }}
+         {{- '\n</tool_response>' }}
+         {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+             {{- '<|im_end|>\n' }}
+         {%- endif %}
+     {%- endif %}
+ {%- endfor %}
+ {%- if add_generation_prompt %}
+     {{- '<|im_start|>assistant\n' }}
+ {%- endif %}
config.json ADDED
@@ -0,0 +1,60 @@
+ {
+   "architectures": [
+     "A2DQwen2LMHeadModel"
+   ],
+   "auto_map": {
+     "AutoConfig": "modeling_qwen2.A2DQwen2Config",
+     "AutoModel": "modeling_qwen2.A2DQwen2Model",
+     "AutoModelForMaskedLM": "modeling_qwen2.A2DQwen2LMHeadModel"
+   },
+   "attention_dropout": 0.0,
+   "bos_token_id": 151643,
+   "dtype": "bfloat16",
+   "eos_token_id": 151645,
+   "hidden_act": "silu",
+   "hidden_size": 896,
+   "initializer_range": 0.02,
+   "intermediate_size": 4864,
+   "layer_types": [
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention"
+   ],
+   "max_position_embeddings": 32768,
+   "max_window_layers": 24,
+   "model_type": "a2d-qwen2",
+   "num_attention_heads": 14,
+   "num_hidden_layers": 24,
+   "num_key_value_heads": 2,
+   "pad_token_id": 151643,
+   "rms_norm_eps": 1e-06,
+   "rope_scaling": null,
+   "rope_theta": 1000000.0,
+   "sliding_window": null,
+   "tie_word_embeddings": true,
+   "transformers_version": "4.57.0",
+   "use_cache": true,
+   "use_sliding_window": false,
+   "vocab_size": 151936
+ }
generation_config.json ADDED
@@ -0,0 +1,9 @@
+ {
+   "_from_model_config": true,
+   "bos_token_id": 151643,
+   "eos_token_id": [
+     151645
+   ],
+   "pad_token_id": 151643,
+   "transformers_version": "4.57.0"
+ }
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1724b7f2e845ab597ad34defbfa61073551f7c8333f769a4558a0536849517be
+ size 1260367448
modeling_qwen2.py ADDED
@@ -0,0 +1,171 @@
+ from typing import Optional
+
+ import torch
+ from torch import nn
+
+ import transformers
+ from transformers.cache_utils import Cache, DynamicCache
+ from transformers.modeling_outputs import BaseModelOutputWithPast
+ from transformers.processing_utils import Unpack
+ from transformers.utils import TransformersKwargs
+ from transformers.modeling_attn_mask_utils import _prepare_4d_attention_mask
+
+ if transformers.utils.is_torch_flex_attn_available():
+     from torch.nn.attention.flex_attention import _DEFAULT_SPARSE_BLOCK_SIZE as flex_default_block_size
+     from torch.nn.attention.flex_attention import BlockMask, create_block_mask
+ else:
+     # Register a fake type to avoid crashing for annotations and `isinstance` checks
+     BlockMask = torch.Tensor
+
+
+ class A2DQwen2Config(transformers.Qwen2Config):
+     model_type = "a2d-qwen2"  # <- NEW model_type
+
+
+ class A2DQwen2Model(transformers.Qwen2Model):
+
+     def forward(
+         self,
+         input_ids: Optional[torch.LongTensor] = None,
+         attention_mask: Optional[torch.Tensor] = None,
+         position_ids: Optional[torch.LongTensor] = None,
+         past_key_values: Optional[Cache] = None,
+         inputs_embeds: Optional[torch.FloatTensor] = None,
+         use_cache: Optional[bool] = None,
+         cache_position: Optional[torch.LongTensor] = None,
+         **kwargs: Unpack[TransformersKwargs],
+     ) -> BaseModelOutputWithPast:
+         if (input_ids is None) ^ (inputs_embeds is not None):
+             raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
+
+         if inputs_embeds is None:
+             inputs_embeds = self.embed_tokens(input_ids)
+
+         if use_cache and past_key_values is None:
+             past_key_values = DynamicCache(config=self.config)
+
+         if cache_position is None:
+             past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
+             cache_position = torch.arange(
+                 past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
+             )
+
+         if position_ids is None:
+             position_ids = cache_position.unsqueeze(0)
+
+         """
+         # -------------------------------------------------------------
+         # ORIGINAL CODE (causal mask)
+         # -------------------------------------------------------------
+         # It may already have been prepared by e.g. `generate`
+         if not isinstance(causal_mask_mapping := attention_mask, dict):
+             # Prepare mask arguments
+             mask_kwargs = {
+                 "config": self.config,
+                 "input_embeds": inputs_embeds,
+                 "attention_mask": attention_mask,
+                 "cache_position": cache_position,
+                 "past_key_values": past_key_values,
+                 "position_ids": position_ids,
+             }
+             # Create the masks
+             causal_mask_mapping = {
+                 "full_attention": create_causal_mask(**mask_kwargs),
+             }
+             # The sliding window alternating layers are not always activated depending on the config
+             if self.has_sliding_layers:
+                 causal_mask_mapping["sliding_attention"] = create_sliding_window_causal_mask(**mask_kwargs)
+         # -------------------------------------------------------------
+         # ORIGINAL CODE (causal mask)
+         # -------------------------------------------------------------
+         """
+         # -------------------------------------------------------------
+         # NEW CODE (bidirectional, padding-only mask)
+         # -------------------------------------------------------------
+         if not isinstance(causal_mask_mapping := attention_mask, dict):
+             # 1) If no mask is provided → treat all tokens as valid (no padding)
+             if attention_mask is None:
+                 attention_mask = torch.ones(
+                     inputs_embeds.shape[:2],
+                     device=inputs_embeds.device,
+                     dtype=torch.long
+                 )
+
+             # 2) If mask is not already a 4D attention mask → convert it
+             if not (
+                 isinstance(attention_mask, BlockMask)
+                 or (isinstance(attention_mask, torch.Tensor) and attention_mask.ndim == 4)
+             ):
+                 attention_mask = _prepare_4d_attention_mask(attention_mask, self.dtype)
+
+             # 3) Build causal mask mapping used by the attention layers
+             causal_mask_mapping = {"full_attention": attention_mask}
+
+             # Sliding-window layers share the same non-causal mask
+             if self.has_sliding_layers:
+                 causal_mask_mapping["sliding_attention"] = attention_mask
+         # -------------------------------------------------------------
+         # NEW CODE (bidirectional, padding-only mask)
+         # -------------------------------------------------------------
+
+         hidden_states = inputs_embeds
+
+         # create position embeddings to be shared across the decoder layers
+         position_embeddings = self.rotary_emb(hidden_states, position_ids)
+
+         for decoder_layer in self.layers[: self.config.num_hidden_layers]:
+             hidden_states = decoder_layer(
+                 hidden_states,
+                 attention_mask=causal_mask_mapping[decoder_layer.attention_type],
+                 position_ids=position_ids,
+                 past_key_values=past_key_values,
+                 use_cache=use_cache,
+                 cache_position=cache_position,
+                 position_embeddings=position_embeddings,
+                 **kwargs,
+             )
+
+         hidden_states = self.norm(hidden_states)
+         return BaseModelOutputWithPast(
+             last_hidden_state=hidden_states,
+             past_key_values=past_key_values if use_cache else None,
+         )
+
+
+ class A2DQwen2LMHeadModel(transformers.Qwen2ForCausalLM):
+     config: A2DQwen2Config
+
+     def __init__(self, config):
+         transformers.Qwen2PreTrainedModel.__init__(self, config)
+         self.model = A2DQwen2Model(config)
+         self.vocab_size = config.vocab_size
+         self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+
+         # Initialize weights and apply final processing
+         self.post_init()
+
+
+ transformers.AutoConfig.register("a2d-qwen2", A2DQwen2Config)
+ transformers.AutoModel.register(A2DQwen2Config, A2DQwen2LMHeadModel)
+ transformers.AutoModelForMaskedLM.register(A2DQwen2Config, A2DQwen2LMHeadModel)
+
+
+ if __name__ == "__main__":
+     import dllm
+     import torch
+     from transformers import AutoModel
+
+     # Load a config from a local path (either a directory containing config.json, or the file itself)
+     config_path = dllm.utils.resolve_with_base_env(
+         "Qwen/Qwen2.5-0.5B", "BASE_MODELS_DIR"
+     )
+     config = A2DQwen2Config.from_pretrained(config_path)
+     if hasattr(config, "auto_map"):
+         delattr(config, "auto_map")
+     if hasattr(config, "architectures"):
+         delattr(config, "architectures")
+
+     torch.set_default_device("cuda")
+     model = A2DQwen2LMHeadModel(config)
+     model.save_pretrained("models-tmp/a2d-qwen2")
+     auto_model = AutoModel.from_pretrained("models-tmp/a2d-qwen2")
+
special_tokens_map.json ADDED
@@ -0,0 +1,39 @@
+ {
+   "additional_special_tokens": [
+     "<|im_start|>",
+     "<|im_end|>",
+     "<|object_ref_start|>",
+     "<|object_ref_end|>",
+     "<|box_start|>",
+     "<|box_end|>",
+     "<|quad_start|>",
+     "<|quad_end|>",
+     "<|vision_start|>",
+     "<|vision_end|>",
+     "<|vision_pad|>",
+     "<|image_pad|>",
+     "<|video_pad|>"
+   ],
+   "bos_token": "<|endoftext|>",
+   "eos_token": {
+     "content": "<|im_end|>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "mask_token": {
+     "content": "<|mask|>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "pad_token": {
+     "content": "<|endoftext|>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   }
+ }
tokenizer.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a59820ad3f728fff77cf7e4188532fc45e5f80cd0299cde28046bd2b51c64bdf
+ size 11422081
tokenizer_config.json ADDED
@@ -0,0 +1,217 @@
+ {
+   "add_bos_token": false,
+   "add_prefix_space": false,
+   "added_tokens_decoder": {
+     "151643": {
+       "content": "<|endoftext|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151644": {
+       "content": "<|im_start|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151645": {
+       "content": "<|im_end|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151646": {
+       "content": "<|object_ref_start|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151647": {
+       "content": "<|object_ref_end|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151648": {
+       "content": "<|box_start|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151649": {
+       "content": "<|box_end|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151650": {
+       "content": "<|quad_start|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151651": {
+       "content": "<|quad_end|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151652": {
+       "content": "<|vision_start|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151653": {
+       "content": "<|vision_end|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151654": {
+       "content": "<|vision_pad|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151655": {
+       "content": "<|image_pad|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151656": {
+       "content": "<|video_pad|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151657": {
+       "content": "<tool_call>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "151658": {
+       "content": "</tool_call>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "151659": {
+       "content": "<|fim_prefix|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "151660": {
+       "content": "<|fim_middle|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "151661": {
+       "content": "<|fim_suffix|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "151662": {
+       "content": "<|fim_pad|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "151663": {
+       "content": "<|repo_name|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "151664": {
+       "content": "<|file_sep|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "151665": {
+       "content": "<|mask|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     }
+   },
+   "additional_special_tokens": [
+     "<|im_start|>",
+     "<|im_end|>",
+     "<|object_ref_start|>",
+     "<|object_ref_end|>",
+     "<|box_start|>",
+     "<|box_end|>",
+     "<|quad_start|>",
+     "<|quad_end|>",
+     "<|vision_start|>",
+     "<|vision_end|>",
+     "<|vision_pad|>",
+     "<|image_pad|>",
+     "<|video_pad|>"
+   ],
+   "bos_token": "<|endoftext|>",
+   "clean_up_tokenization_spaces": false,
+   "eos_token": "<|im_end|>",
+   "errors": "replace",
+   "extra_special_tokens": {},
+   "mask_token": "<|mask|>",
+   "model_max_length": 32768,
+   "pad_token": "<|endoftext|>",
+   "padding_side": "right",
+   "split_special_tokens": false,
+   "tokenizer_class": "Qwen2Tokenizer",
+   "unk_token": null
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9097661c1514bf9514858f758273a38b09545a96d1619951d90b456d240e3ddc
+ size 6840
vocab.json ADDED
The diff for this file is too large to render. See raw diff