Clean up rope params; ensure transformers 4.55/5.0 compatibility

#1
by abhgarg - opened
.gitattributes CHANGED
@@ -34,8 +34,3 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
  tokenizer.json filter=lfs diff=lfs merge=lfs -text
37
- assets/demo.gif filter=lfs diff=lfs merge=lfs -text
38
- assets/demo.mp4 filter=lfs diff=lfs merge=lfs -text
39
- assets/result_acc.png filter=lfs diff=lfs merge=lfs -text
40
- assets/result_efficiency.png filter=lfs diff=lfs merge=lfs -text
41
- assets/teaser.png filter=lfs diff=lfs merge=lfs -text
 
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
  tokenizer.json filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
README.md CHANGED
@@ -1,74 +1,19 @@
1
  ---
2
  library_name: transformers
3
- license: other
4
- license_name: nvidia-nemotron-open-model-license
5
- license_link: >-
6
- https://www.nvidia.com/en-us/agreements/enterprise-software/nvidia-nemotron-open-model-license/
7
- pipeline_tag: text-generation
8
- tags:
9
- - nvidia
10
- - pytorch
11
  ---
12
 
13
- # Nemotron-Labs-Diffusion-3B-Base
14
 
 
15
 
16
- <div align="center" style="line-height: 1;">
17
- <a href="https://d1qx31qr3h6wln.cloudfront.net/publications/Nemotron_Diffusion_Tech_Report_v1.pdf?VersionId=db8_EMO8B.vmU26.jr7Le9pN3MqcUDNL" target="_blank" style="margin: 2px;">
18
- <img alt="Chat" src="https://img.shields.io/badge/📝Paper-Read Now!-536af5?color=76B900&logoColor=white" style="display: inline-block; vertical-align: middle;"/>
19
- </a>
20
- <a href="https://huggingface.co/collections/nvidia/nemotron-labs-diffusion" target="_blank" style="margin: 2px;">
21
- <img alt="Nemotron-Labs-Diffusion Model Family" src="https://img.shields.io/badge/%F0%9F%A4%97-Nemotron--Labs--Diffusion_Model_Family-76B900" style="display: inline-block; vertical-align: middle;"/>
22
- </a>
23
- <a href="https://www.nvidia.com/en-us/agreements/enterprise-software/nvidia-nemotron-open-model-license/" style="margin: 2px;">
24
- <img alt="License" src="https://img.shields.io/badge/License-NVIDIA Open Model License-f5de53?&color=f5de53" style="display: inline-block; vertical-align: middle;"/>
25
- </a>
26
- </div>
27
 
 
28
 
29
- [![Demo](./assets/demo.gif)](./assets/demo.mp4)
30
 
31
-
32
- ## Model Overview
33
-
34
- Nemotron-Labs-Diffusion is a tri-mode language model that supports both AR decoding and diffusion-based parallel decoding by simply switching the attention pattern of the same model during inference. The synergy between these two modes enables a third mode, called self-speculation: the same model performs diffusion-based parallel drafting and AR verification with shared KV cache, achieving high acceptance lengths and decoding efficiency. The seamless mode switching by simply changing attention patterns enables high efficiency at different concurrency levels in varying deployment scenarios with one single model.
35
-
36
- <div align="center">
37
- <img src="./assets/teaser.png" alt="An illustration of Tri-Mode LMs" width="500">
38
- </div>
39
-
40
-
41
- ## Highlights
42
-
43
- - SOTA 3B, 8B, 14B dense LM family (base, instruct, and vision-language variants) supporting AR, diffusion, and self-speculation with the focus on decode efficiency.
44
- - Generation moved from a memory-bound regime toward a compute-bound regime. Model weights are loaded once and reused to compute multiple tokens during generation.
45
- - Self-speculation uses diffusion for drafting and AR for verification, providing a stronger alternative to MTP approaches:
46
- * 3x higher acceptance length and 2.2x speed-up vs. Qwen3-8B-Eagle3 in SGLang.
47
- * 5.9× tokens per forward over Qwen3-8B (no MTP) with the same accuracy.
48
- - Real-device speed-up across platforms:
49
- * DGX Spark (8B, concurrency 1): 2.7x faster with 112 tok/sec vs. 41.8 tok/sec AR using w4a16.
50
- * GB200 (8B, concurrency 1): 3.3x faster with 850 tok/sec vs. 253 tok/sec AR and 360 tok/sec Eagle3. Custom CUDA kernels boost to 1015 tok/sec (4x).
51
- - Diffusion speedup-of-light analysis shows that throughput can be further doubled (vs. current best) for a single user with better sampling - future research.
52
-
53
-
54
- <div align="center">
55
- <img src="./assets/result_acc.png" alt="Accuracy Results" width="800">
56
- </div>
57
-
58
- <div align="center">
59
- <img src="./assets/result_efficiency.png" alt="Efficiency Results" width="800">
60
- </div>
61
-
62
-
63
- ## License/Terms of Use
64
-
65
- Use of this model is governed by the [NVIDIA Nemotron Open Model License](https://www.nvidia.com/en-us/agreements/enterprise-software/nvidia-nemotron-open-model-license/).
66
-
67
-
68
- ## Environment
69
-
70
- ```bash
71
- transformers>=5.0.0
72
  ```
73
 
74
  ## Chat with Our Model
@@ -78,83 +23,18 @@ transformers>=5.0.0
78
  from transformers import AutoModel, AutoTokenizer
79
  import torch
80
 
81
- repo_name = "nvidia/Nemotron-Labs-Diffusion-3B-Base"
82
 
83
  tokenizer = AutoTokenizer.from_pretrained(repo_name, trust_remote_code=True)
84
  model = AutoModel.from_pretrained(repo_name, trust_remote_code=True)
85
  model = model.cuda().to(torch.bfloat16)
86
 
87
- history = []
88
-
89
  user_input = input("User: ").strip()
90
- history.append({"role": "user", "content": user_input})
91
-
92
- prompt = tokenizer.apply_chat_template(history, tokenize=False, add_generation_prompt=True)
93
- prompt_ids = tokenizer(prompt, return_tensors='pt').input_ids.to(device='cuda')
94
-
95
- ## Chat in AR Mode
96
- out_ids, nfe = model.ar_generate(inputs.input_ids, max_new_tokens=512)
97
 
98
- ## Chat in dLM Mode
99
- out_ids, nfe = model.generate(prompt_ids, max_new_tokens=512, block_length=32, threshold=0.9, eos_token_id=tokenizer.eos_token_id)
100
-
101
- ## Chat in Linear Self-Speculation Mode
102
- out_ids, nfe = model.linear_spec_generate(prompt_ids, max_new_tokens=512, block_length=32, eos_token_id=tokenizer.eos_token_id)
103
 
104
  tokenized_out = tokenizer.batch_decode(out_ids[:, prompt_ids.shape[1]:], skip_special_tokens=True)[0]
105
  print(f"Model: {tokenized_out}")
106
  print(f"[Num Function Eval (NFE)={nfe}]")
107
- ```
108
-
109
-
110
-
111
- ## Inference with Linear Self-Speculation + LoRA-enhanced Drafter
112
-
113
- An optional LoRA adapter can be applied to the diffusion drafter in the linear self-speculation mode to further increase the acceptance length:
114
-
115
-
116
- ```python
117
- import torch
118
- from transformers import AutoModel, AutoTokenizer
119
- from peft import PeftModel
120
-
121
- repo = "nvidia/Nemotron-Labs-Diffusion-3B-Base"
122
- tokenizer = AutoTokenizer.from_pretrained(repo, trust_remote_code=True)
123
- model = AutoModel.from_pretrained(repo, trust_remote_code=True)
124
- model = model.cuda().to(torch.bfloat16)
125
-
126
- # Attach the linear_spec LoRA adapter.
127
- model = PeftModel.from_pretrained(model, repo, subfolder="linear_spec_lora").eval()
128
- # Unwrap so we can call linear_spec_generate directly (it toggles LoRA internally).
129
- base = model.model
130
-
131
- history = [{"role": "user", "content": "Solve: What is 15% of 240?"}]
132
- prompt = tokenizer.apply_chat_template(history, tokenize=False, add_generation_prompt=True)
133
- prompt_ids = tokenizer(prompt, return_tensors="pt").input_ids.cuda()
134
-
135
- out_ids, nfe = base.linear_spec_generate(
136
- prompt_ids, max_new_tokens=512, block_length=32,
137
- eos_token_id=tokenizer.eos_token_id,
138
- )
139
- print(tokenizer.decode(out_ids[0, prompt_ids.shape[1]:], skip_special_tokens=True))
140
- print(f"[NFE={nfe}]")
141
- ```
142
-
143
-
144
- ## Ethical Considerations
145
- NVIDIA believes Trustworthy AI is a shared responsibility and we have established policies and practices to enable development for a wide array of AI applications. When downloaded or used in accordance with our terms of service, developers should work with their internal model team to ensure this model meets requirements for the relevant industry and use case and addresses unforeseen product misuse. For more detailed information on ethical considerations for this model, please see the [bias](./model_cards/bias.md), [explainability](./model_cards/explainability.md), [safety & security](./model_cards/safety.md), and [privacy](./model_cards/privacy.md) subcards.
146
-
147
- Please report model quality, risk, security vulnerabilities or NVIDIA AI Concerns [here](https://www.nvidia.com/en-us/support/submit-security-vulnerability/).
148
-
149
-
150
- ## Citations
151
-
152
- ```bibtex
153
- @techreport{fu2026nemotronlabsdiffusion,
154
- title = {Nemotron-Labs-Diffusion: A Tri-Mode Language Model Unifying Autoregressive, Diffusion, and Self-Speculation Decoding},
155
- author = {Yonggan Fu and Lexington Whalen and Abhinav Garg and Chengyue Wu and Maksim Khadkevich and Nicolai Oswald and Enze Xie and Daniel Egert and Sharath Turuvekere Sreenivas and Shizhe Diao and Chenhan Yu and Ye Yu and Weijia Chen and Sajad Norouzi and Shiyi Lan and Ligeng Zhu and Jin Wang and Jindong Jiang and Morteza Mardani and Mehran Maghoumi and Song Han and Ante Jukic and Nima Tajbakhsh and Jan Kautz and Pavlo Molchanov},
156
- institution = {NVIDIA},
157
- year = {2026},
158
- note = {Technical report}
159
- }
160
- ```
 
1
  ---
2
  library_name: transformers
3
+ tags: []
 
 
 
 
 
 
 
4
  ---
5
 
6
+ # Nemotron-Diffusion-Exp-Ministral-3B
7
 
8
+ Developed by the [DLER team](https://nv-dler.github.io/) @ NVR; this model will be actively updated. Contact Yonggan Fu and Pavlo Molchanov with any questions.
9
 
 
 
 
 
 
 
 
 
 
 
 
10
 
11
+ # Environment
12
 
13
+ Docker path: `/lustre/fsw/portfolios/nvr/users/yongganf/docker/megatron_py25_dllm_ministral.sqsh` on CW-DFW. Apply for interactive nodes with the following command:
14
 
15
+ ```
16
+ srun -A {account} --partition interactive --time 4:00:00 --gpus 8 --container-image /lustre/fsw/portfolios/nvr/users/yongganf/docker/megatron_py25_dllm_ministral.sqsh --container-mounts=$HOME:/home,/lustre:/lustre --pty bash
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17
  ```
18
 
19
  ## Chat with Our Model
 
23
  from transformers import AutoModel, AutoTokenizer
24
  import torch
25
 
26
+ repo_name = "nvidia/Nemotron-Diffusion-Exp-Ministral-3B"
27
 
28
  tokenizer = AutoTokenizer.from_pretrained(repo_name, trust_remote_code=True)
29
  model = AutoModel.from_pretrained(repo_name, trust_remote_code=True)
30
  model = model.cuda().to(torch.bfloat16)
31
 
 
 
32
  user_input = input("User: ").strip()
 
 
 
 
 
 
 
33
 
34
+ prompt_ids = tokenizer(user_input,return_tensors='pt').input_ids.to(device='cuda')
35
+ out_ids, nfe = model.generate(prompt_ids, max_new_tokens=128, steps=128, block_length=32, shift_logits=False, causal_context=True, threshold=0.9)
 
 
 
36
 
37
  tokenized_out = tokenizer.batch_decode(out_ids[:, prompt_ids.shape[1]:], skip_special_tokens=True)[0]
38
  print(f"Model: {tokenized_out}")
39
  print(f"[Num Function Eval (NFE)={nfe}]")
40
+ ```
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
assets/demo.gif DELETED

Git LFS Details

  • SHA256: 0d09264e272ac0f82dee36417f6a16511287ec1f8dee3b5dba3da222d791fd2c
  • Pointer size: 132 Bytes
  • Size of remote file: 8.25 MB
assets/demo.mp4 DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:666d8785ac4af75931d9c677757c4ef9945bf114d07f1c4e2ebb7b893ac39006
3
- size 9454873
 
 
 
 
assets/result_acc.png DELETED

Git LFS Details

  • SHA256: 992aa22ca9eca3d0bddbcd9f49837e2a9f377bbc0f7545563b129a50b3811448
  • Pointer size: 131 Bytes
  • Size of remote file: 405 kB
assets/result_efficiency.png DELETED

Git LFS Details

  • SHA256: 4f6161912e2aa703e0ef1bdccbb85039529b97e759d6247c33afa2a209806ede
  • Pointer size: 131 Bytes
  • Size of remote file: 801 kB
assets/teaser.png DELETED

Git LFS Details

  • SHA256: 6c94aa7b0c6cf8fb739724d0c1ce45749c76443c592eeab94d7cbb9083c6c6b1
  • Pointer size: 131 Bytes
  • Size of remote file: 581 kB
chat_utils.py ADDED
@@ -0,0 +1,267 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import torch
3
+ import torch.nn.functional as F
4
+
5
+
6
def add_gumbel_noise(logits, temperature):
    """Perturb ``logits`` for Gumbel-max style categorical sampling.

    When ``temperature`` is 0 the input tensor is returned unchanged, so a
    subsequent ``argmax`` is plain greedy decoding. Otherwise the logits are
    promoted to float64 and ``exp(logits)`` is divided by
    ``(-log(u)) ** temperature``, where ``u`` is uniform noise drawn with
    ``torch.rand_like``.

    float64 is used on purpose: according to arXiv:2409.02908, low-precision
    Gumbel-max improves perplexity for masked diffusion models but reduces
    generation quality.
    """
    if temperature == 0:
        return logits

    logits64 = logits.to(torch.float64)
    uniform = torch.rand_like(logits64, dtype=torch.float64)
    # -log(u) is exponentially distributed; raising it to `temperature`
    # controls how strongly the noise reshuffles the ranking.
    scale = (-torch.log(uniform)) ** temperature
    return torch.exp(logits64) / scale
18
+
19
+
20
def get_transfer_index(logits, temperature, remasking, mask_index, x, num_transfer_tokens, threshold=None, neg_entropy=False):
    """Choose which masked positions get committed in this denoising step.

    Args:
        logits: Per-position vocabulary logits; softmax/argmax are taken over
            the last dimension.
        temperature: Forwarded to ``add_gumbel_noise``; 0 means greedy argmax.
        remasking: Scoring rule for candidate tokens. One of
            ``'low_confidence'`` (probability of the sampled token),
            ``'top_p_margin'`` (top-1 minus top-2 probability, min-max
            normalized over the masked positions of each row) or ``'random'``
            (uniform random scores).
        mask_index: Boolean tensor marking positions that are still masked.
        x: Current tokens; kept wherever ``mask_index`` is False.
        num_transfer_tokens: Per-row number of positions to commit. Ignored
            when ``threshold`` is given (it is replaced by the per-row count
            of masked positions).
        threshold: If not None, every masked position is a candidate; the
            highest-scoring one is always committed and the others only if
            their score is not below ``threshold``.
        neg_entropy: If True, positions are ranked by the negative entropy of
            the softmax distribution instead of the ``remasking`` score.

    Returns:
        ``(x0, transfer_index)``: ``x0`` holds the sampled token at masked
        positions and ``x`` elsewhere; ``transfer_index`` is a boolean tensor
        (same shape as ``x0``) flagging the positions to commit.

    Raises:
        NotImplementedError: If ``remasking`` is not a supported rule.
    """
    candidates = torch.argmax(add_gumbel_noise(logits, temperature=temperature), dim=-1)

    if remasking == 'low_confidence':
        probs = F.softmax(logits, dim=-1)
        # Probability assigned to the token that was actually sampled.
        token_score = probs.gather(-1, candidates.unsqueeze(-1)).squeeze(-1)
    elif remasking == 'top_p_margin':
        probs = F.softmax(logits, dim=-1)
        best_two = torch.topk(probs, k=2, dim=-1).values
        margin = best_two[..., 0] - best_two[..., 1]

        # Min/max of the margin restricted to masked positions of each row;
        # unmasked positions are neutralized with +/-inf.
        row_min = torch.where(
            mask_index, margin, torch.full_like(margin, float('inf'))
        ).amin(dim=1, keepdim=True)
        row_max = torch.where(
            mask_index, margin, torch.full_like(margin, float('-inf'))
        ).amax(dim=1, keepdim=True)
        spread = row_max - row_min
        has_spread = spread > 0

        # Masked positions: min-max normalized margin, or 1 when every masked
        # margin in the row is identical. Unmasked positions stay at 0.
        token_score = torch.zeros_like(margin)
        token_score = torch.where(
            mask_index & has_spread,
            (margin - row_min) / (spread + 1e-12),
            token_score,
        )
        token_score = torch.where(
            mask_index & (~has_spread),
            torch.ones_like(token_score),
            token_score,
        )
    elif remasking == 'random':
        token_score = torch.rand((candidates.shape[0], candidates.shape[1]), device=candidates.device)
    else:
        raise NotImplementedError(remasking)

    if neg_entropy:
        probs = F.softmax(logits, dim=-1)
        # sum(p * log p) is the negative entropy; the epsilon guards log(0).
        confidence_scores = torch.sum(probs * torch.log(probs + 1e-10), dim=-1)
    else:
        confidence_scores = token_score

    x0 = torch.where(mask_index, candidates, x)
    # Already-decoded positions must never win the top-k below.
    confidence = torch.where(mask_index, confidence_scores, -np.inf)

    transfer_index = torch.zeros_like(x0, dtype=torch.bool, device=x0.device)
    use_threshold = threshold is not None
    if use_threshold:
        num_transfer_tokens = mask_index.sum(dim=1, keepdim=True)

    for row, row_conf in enumerate(confidence):
        _, picked = torch.topk(row_conf, k=num_transfer_tokens[row])
        transfer_index[row, picked] = True
        if use_threshold:
            # Keep the best candidate unconditionally; drop the remaining
            # ones whose confidence falls below the threshold.
            rest = picked[1:]
            rejected = rest[row_conf[rest] < threshold]
            transfer_index[row, rejected] = False
    return x0, transfer_index
89
+
90
+
91
def get_num_transfer_tokens(mask_index, steps: int):
    """Split each row's masked-token count as evenly as possible over ``steps``.

    Every step receives ``count // steps`` tokens and the first
    ``count % steps`` steps receive one extra, so the row sums back to the
    number of True entries in that row of ``mask_index``.

    Returns:
        Tensor of shape ``(mask_index.size(0), steps)`` on the device of
        ``mask_index``.
    """
    masked_per_row = mask_index.sum(dim=1, keepdim=True)
    quotient = masked_per_row // steps
    leftover = masked_per_row % steps

    schedule = torch.zeros(
        masked_per_row.size(0), steps, device=mask_index.device, dtype=torch.int64
    ) + quotient
    # Steps whose index is below the remainder take one extra token.
    step_ids = torch.arange(steps, device=mask_index.device)
    return schedule + (step_ids < leftover).to(schedule.dtype)
99
+
100
+
101
def _set_diffusion_lm(model_module, enabled):
    """Set ``diffusion_lm`` on every encoder attention layer that defines it."""
    for layer in model_module.encoder.layers:
        if hasattr(layer.self_attn, 'diffusion_lm'):
            layer.self_attn.diffusion_lm = enabled


def _context_forward(model, tokens, causal_context, **model_kwargs):
    """Run a cache-building forward pass, optionally with causal attention.

    With ``causal_context`` the ``diffusion_lm`` flag of the attention layers
    is switched off for the duration of the call and switched back on
    afterwards, even if the forward pass raises.
    """
    if not causal_context:
        return model(tokens, use_cache=True, use_causal_mask=causal_context, **model_kwargs)

    model_module = model.module if hasattr(model, "module") else model
    _set_diffusion_lm(model_module, False)
    try:
        return model(tokens, use_cache=True, use_causal_mask=causal_context, **model_kwargs)
    finally:
        _set_diffusion_lm(model_module, True)


@torch.no_grad()
def generate_with_prefix_cache_block_diff(
    model,
    prompt,
    steps=128,
    gen_length=128,
    block_length=128,
    temperature=0.,
    remasking='low_confidence',
    mask_id=126336,
    threshold=None,
    factor=None,
    shift_logits=False,
    neg_entropy=False,
    causal_context=False,
    eos_token_id=None,
):
    """Block-wise masked-diffusion generation with a prefix KV cache.

    The prompt is encoded once to build the KV cache. Generation then proceeds
    block by block: a block of ``mask_id`` tokens is appended, iteratively
    denoised with the cached context, and finally run through the model once
    more to extend the cache.

    Args:
        model: Callable accepting ``(tokens, past_key_values=..., use_cache=...,
            use_causal_mask=...)`` and returning an object with ``.logits`` and
            ``.past_key_values``.
        prompt: Prompt token ids; dimension 0 is the batch, dimension 1 the
            sequence.
        steps: Total number of denoising steps; must be divisible by the
            number of blocks.
        gen_length: Number of tokens to generate; must be divisible by
            ``block_length``.
        block_length: Number of tokens denoised jointly per block.
        temperature, remasking, threshold, neg_entropy: Forwarded to
            ``get_transfer_index``.
        mask_id: Token id used for not-yet-decoded positions.
        factor: Unused; kept for interface compatibility.
        shift_logits: If True ("dream style"), position ``t`` is predicted by
            the logits at position ``t - 1``; the first position of a block
            uses the last logits of the preceding context.
        causal_context: If True, cache-building forward passes use a causal
            mask and temporarily disable the layers' ``diffusion_lm`` flag.
        eos_token_id: If given, enables early stopping on EOS.

    Returns:
        ``(tokens, nfe)``: the prompt followed by generated tokens, and the
        number of denoising forward passes. When every row contains an EOS,
        the sequence is truncated right after the latest first-EOS position
        across the batch.
    """
    dream_style = shift_logits
    x_accum = prompt.clone()

    assert gen_length % block_length == 0, "gen_length must be a multiple of block_length"
    num_blocks = gen_length // block_length

    assert steps % num_blocks == 0, "steps must be a multiple of the number of blocks"
    steps_per_block = steps // num_blocks

    nfe = 0

    # Compute the KV cache for the prompt once.
    output = _context_forward(model, prompt, causal_context)
    past_key_values = output.past_key_values

    # Dream style needs the "next token" logits of the context to predict the
    # first position of the upcoming block.
    next_logits_context = output.logits[:, -1:, :] if dream_style else None

    for num_block in range(num_blocks):
        # Append a fresh, fully masked block.
        mask_block = torch.ones(
            (prompt.shape[0], block_length),
            dtype=prompt.dtype,
            device=prompt.device
        ) * mask_id
        x_accum = torch.cat([x_accum, mask_block], dim=1)
        current_block_start = prompt.size(1) + num_block * block_length
        block_slice = slice(current_block_start, current_block_start + block_length)

        # Per-step token budget for this block.
        num_transfer_tokens = get_num_transfer_tokens(
            x_accum[:, block_slice] == mask_id, steps_per_block
        )

        for i in range(steps_per_block):
            mask_block_idx = (x_accum[:, block_slice] == mask_id)
            if mask_block_idx.sum() == 0:
                break

            nfe += 1

            # Forward only the current noisy block on top of the cached context.
            logits_block = model(
                x_accum[:, block_slice],
                past_key_values=past_key_values,
                use_cache=False
            ).logits

            if not dream_style:
                logits_use = logits_block
            elif block_length == 1:
                logits_use = next_logits_context
            else:
                # Shift right by one: prepend the context's next-token logits
                # and drop the block's last position.
                logits_use = torch.cat(
                    [next_logits_context, logits_block[:, :-1, :]],
                    dim=1
                )

            x0, transfer_idx = get_transfer_index(
                logits_use, temperature, remasking, mask_block_idx,
                x_accum[:, block_slice],
                num_transfer_tokens=num_transfer_tokens[:, i],
                threshold=threshold, neg_entropy=neg_entropy
            )
            cur = x_accum[:, block_slice].clone()
            cur[transfer_idx] = x0[transfer_idx]
            x_accum[:, block_slice] = cur

            if eos_token_id is not None:
                block_tokens = x_accum[:, block_slice]
                eos_mask = (block_tokens == eos_token_id)
                any_eos = eos_mask.any(dim=1)
                if any_eos.any():
                    still_masked = (block_tokens == mask_id)
                    after_eos = eos_mask.cumsum(dim=1).bool()
                    mask_before = still_masked & ~after_eos
                    # A row is finished once everything before its first EOS
                    # is decoded, or once it has no masks left at all. Stop
                    # only when *every* row is finished; stopping as soon as
                    # one row is done would leave mask tokens in the others.
                    row_done = (any_eos & ~mask_before.any(dim=1)) | ~still_masked.any(dim=1)
                    if row_done.all():
                        break

        # Block finished: extend the KV cache with it.
        output = _context_forward(
            model, x_accum[:, block_slice], causal_context,
            past_key_values=past_key_values,
        )
        past_key_values = output.past_key_values

        if dream_style and num_block < num_blocks - 1:
            # Refresh the context's next-token logits for the next block.
            next_logits_context = output.logits[:, -1:, :]

        if eos_token_id is not None:
            gen_so_far = x_accum[:, prompt.size(1):]
            is_eos = (gen_so_far == eos_token_id)
            has_eos = is_eos.any(dim=1)
            if has_eos.all():
                # argmax over 0/1 values yields the first EOS of each row.
                first_eos_pos = is_eos.to(torch.int64).argmax(dim=1)
                max_eos = first_eos_pos.max().item()
                return x_accum[:, : prompt.size(1) + max_eos + 1], nfe

    return x_accum, nfe
config.json CHANGED
@@ -1,21 +1,30 @@
1
  {
 
 
 
 
2
  "ar_loss_weight": 1.0,
3
  "architectures": [
4
- "NemotronLabsDiffusionModel"
5
  ],
6
  "attention_bias": false,
7
  "attention_dropout": 0.0,
8
  "attn_implementation": "sdpa",
9
  "auto_map": {
10
- "AutoConfig": "configuration_nemotron_labs_diffusion.NemotronLabsDiffusionConfig",
11
- "AutoModel": "modeling_nemotron_labs_diffusion.NemotronLabsDiffusionModel"
12
  },
13
  "block_size": 32,
14
  "bos_token_id": 1,
 
 
15
  "dlm_loss_weight": null,
16
  "dlm_paradigm": "bidirectional",
 
17
  "dp_varying_mask_ratio": false,
 
18
  "eos_token_id": 2,
 
19
  "head_dim": 128,
20
  "hidden_act": "silu",
21
  "hidden_size": 3072,
@@ -24,10 +33,16 @@
24
  "mask_token_id": 100,
25
  "max_position_embeddings": 4096,
26
  "mlp_bias": false,
27
- "model_type": "nemotron_labs_diffusion",
 
 
28
  "num_attention_heads": 32,
 
29
  "num_hidden_layers": 26,
30
  "num_key_value_heads": 8,
 
 
 
31
  "rms_norm_eps": 1e-05,
32
  "rope_parameters": {
33
  "beta_fast": 32.0,
@@ -42,6 +57,7 @@
42
  },
43
  "sliding_window": null,
44
  "tie_word_embeddings": false,
 
45
  "torch_dtype": "bfloat16",
46
  "transformers_version": "5.0.0",
47
  "use_cache": false,
 
1
  {
2
+ "ada_dlm_loss_ratio": null,
3
+ "ada_perm_ratio_global": null,
4
+ "ada_perm_ratio_per_block": null,
5
+ "adaptive_mask_rate": false,
6
  "ar_loss_weight": 1.0,
7
  "architectures": [
8
+ "MinistralDiffEncoderModel"
9
  ],
10
  "attention_bias": false,
11
  "attention_dropout": 0.0,
12
  "attn_implementation": "sdpa",
13
  "auto_map": {
14
+ "AutoConfig": "configuration_ministral_dlm.MinistralDLMConfig",
15
+ "AutoModel": "modeling_ministral_dlm.MinistralDiffEncoderModel"
16
  },
17
  "block_size": 32,
18
  "bos_token_id": 1,
19
+ "diff_loss_weight": 1,
20
+ "dlm_arch": "encoder",
21
  "dlm_loss_weight": null,
22
  "dlm_paradigm": "bidirectional",
23
+ "dlm_type": "llada",
24
  "dp_varying_mask_ratio": false,
25
+ "enforce_mask": false,
26
  "eos_token_id": 2,
27
+ "global_loss_avg": false,
28
  "head_dim": 128,
29
  "hidden_act": "silu",
30
  "hidden_size": 3072,
 
33
  "mask_token_id": 100,
34
  "max_position_embeddings": 4096,
35
  "mlp_bias": false,
36
+ "model_type": "ministral_dlm",
37
+ "multi_sampling": null,
38
+ "num_ar_layers": 0,
39
  "num_attention_heads": 32,
40
+ "num_diffusion_layers": 0,
41
  "num_hidden_layers": 26,
42
  "num_key_value_heads": 8,
43
+ "num_skip_loss_tokens": 0,
44
+ "prefix_ratio": 0.8,
45
+ "random_length_prob": 0,
46
  "rms_norm_eps": 1e-05,
47
  "rope_parameters": {
48
  "beta_fast": 32.0,
 
57
  },
58
  "sliding_window": null,
59
  "tie_word_embeddings": false,
60
+ "tok_mask_half_life_ratio": null,
61
  "torch_dtype": "bfloat16",
62
  "transformers_version": "5.0.0",
63
  "use_cache": false,
configuration_nemotron_labs_diffusion.py → configuration_ministral_dlm.py RENAMED
@@ -12,7 +12,7 @@
12
  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
  # See the License for the specific language governing permissions and
14
  # limitations under the License.
15
- """Nemotron-Labs Diffusion model configuration"""
16
 
17
  from transformers.configuration_utils import PretrainedConfig
18
  from transformers.modeling_rope_utils import rope_config_validation
@@ -22,10 +22,10 @@ from transformers.utils import logging
22
  logger = logging.get_logger(__name__)
23
 
24
 
25
- class NemotronLabsDiffusionConfig(PretrainedConfig):
26
  r"""
27
- This is the configuration class to store the configuration of a [`NemotronLabsDiffusionModel`] for diffusion language models.
28
- It is used to instantiate a NemotronLabsDiffusionModel according to the specified arguments, defining the model architecture.
29
 
30
  Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
31
  documentation from [`PretrainedConfig`] for more information.
@@ -72,19 +72,49 @@ class NemotronLabsDiffusionConfig(PretrainedConfig):
72
  Sliding window attention size.
73
  mask_token_id (`int`, *optional*, defaults to -1):
74
  Token ID for masking in diffusion.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
75
  dlm_paradigm (`str`, *optional*, defaults to 'bidirectional'):
76
- Paradigm for diffusion ('bidirectional', 'autoregressive', 'block_diff').
 
 
77
  block_size (`int`, *optional*, defaults to 32):
78
  Block size for block diffusion paradigms.
 
 
 
 
 
 
 
 
79
  dlm_loss_weight (`float`, *optional*):
80
  Weight for diffusion LM loss.
81
  ar_loss_weight (`float`, *optional*, defaults to 1.0):
82
- Weight for autoregressive loss in block_diff paradigm. Use 10000 to only use AR loss.
 
 
83
  dp_varying_mask_ratio (`bool`, *optional*, defaults to False):
84
  Whether to use varying mask ratio for each DP rank during sampling.
 
 
 
 
85
  """
86
 
87
- model_type = "nemotron_labs_diffusion"
88
  keys_to_ignore_at_inference = ["past_key_values"]
89
 
90
  # Default tensor parallel plan for base model `Ministral`
@@ -129,11 +159,27 @@ class NemotronLabsDiffusionConfig(PretrainedConfig):
129
  sliding_window=None,
130
  attn_implementation="sdpa",
131
  mask_token_id=-1,
 
 
 
 
 
 
 
132
  dlm_paradigm='bidirectional',
 
133
  block_size=32,
 
 
 
 
134
  dlm_loss_weight=None,
135
  ar_loss_weight=1.0,
 
136
  dp_varying_mask_ratio=False,
 
 
 
137
  **kwargs,
138
  ):
139
  self.vocab_size = vocab_size
@@ -168,11 +214,27 @@ class NemotronLabsDiffusionConfig(PretrainedConfig):
168
  self.attn_implementation = attn_implementation
169
 
170
  self.mask_token_id = mask_token_id
 
 
 
 
 
 
 
171
  self.dlm_paradigm = dlm_paradigm
 
172
  self.block_size = block_size
 
 
 
 
173
  self.dlm_loss_weight = dlm_loss_weight
174
  self.ar_loss_weight = ar_loss_weight
 
175
  self.dp_varying_mask_ratio = dp_varying_mask_ratio
 
 
 
176
  super().__init__(
177
  pad_token_id=pad_token_id,
178
  bos_token_id=bos_token_id,
@@ -182,5 +244,5 @@ class NemotronLabsDiffusionConfig(PretrainedConfig):
182
  )
183
 
184
 
185
- __all__ = ["NemotronLabsDiffusionConfig"]
186
 
 
12
  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
  # See the License for the specific language governing permissions and
14
  # limitations under the License.
15
+ """Ministral DLM model configuration"""
16
 
17
  from transformers.configuration_utils import PretrainedConfig
18
  from transformers.modeling_rope_utils import rope_config_validation
 
22
  logger = logging.get_logger(__name__)
23
 
24
 
25
+ class MinistralDLMConfig(PretrainedConfig):
26
  r"""
27
+ This is the configuration class to store the configuration of a [`Ministral3Model`] for diffusion language models.
28
+ It is used to instantiate a Ministral model according to the specified arguments, defining the model architecture.
29
 
30
  Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
31
  documentation from [`PretrainedConfig`] for more information.
 
72
  Sliding window attention size.
73
  mask_token_id (`int`, *optional*, defaults to -1):
74
  Token ID for masking in diffusion.
75
+ dlm_type (`str`, *optional*, defaults to 'llada'):
76
+ Type of diffusion language model ('llada', 'dream').
77
+ random_length_prob (`float`, *optional*):
78
+ Probability of using random lengths during training.
79
+ num_ar_layers (`int`, *optional*, defaults to 0):
80
+ Number of autoregressive layers.
81
+ num_diffusion_layers (`int`, *optional*, defaults to 0):
82
+ Number of diffusion layers.
83
+ diff_loss_weight (`float`, *optional*, defaults to 1):
84
+ Weight for diffusion loss.
85
+ enforce_mask (`bool`, *optional*, defaults to False):
86
+ Whether to enforce masking.
87
+ prefix_ratio (`float`, *optional*, defaults to 0.8):
88
+ Ratio for prefix in prefix_bidirectional mode.
89
  dlm_paradigm (`str`, *optional*, defaults to 'bidirectional'):
90
+ Paradigm for diffusion ('bidirectional', 'autoregressive', 'prefix_bidirectional', 'efficient_block_diff', 'block_diff', 'sbd_block_diff').
91
+ dlm_arch (`str`, *optional*, defaults to 'encoder'):
92
+ Architecture type ('encoder', 'encoder_decoder').
93
  block_size (`int`, *optional*, defaults to 32):
94
  Block size for block diffusion paradigms.
95
+ tok_mask_half_life_ratio (`float`, *optional*):
96
+ Half-life ratio for token masking.
97
+ adaptive_mask_rate (`bool`, *optional*, defaults to False):
98
+ Whether to use adaptive mask rate.
99
+ multi_sampling (`int`, *optional*):
100
+ Number of samples for multi-sampling.
101
+ num_skip_loss_tokens (`int`, *optional*, defaults to 0):
102
+ Number of tokens to skip in loss calculation.
103
  dlm_loss_weight (`float`, *optional*):
104
  Weight for diffusion LM loss.
105
  ar_loss_weight (`float`, *optional*, defaults to 1.0):
106
+ Weight for autoregressive loss in sbd_block_diff paradigm. Use 10000 to only use AR loss.
107
+ global_loss_avg (`bool`, *optional*, defaults to False):
108
+ Whether to use global loss average.
109
  dp_varying_mask_ratio (`bool`, *optional*, defaults to False):
110
  Whether to use varying mask ratio for each DP rank during sampling.
111
+ ada_perm_ratio_per_block (`float`, *optional*):
112
+ Adaptive permutation ratio for each block.
113
+ ada_perm_ratio_global (`float`, *optional*):
114
+ Adaptive permutation ratio for global.
115
  """
116
 
117
+ model_type = "ministral_dlm"
118
  keys_to_ignore_at_inference = ["past_key_values"]
119
 
120
  # Default tensor parallel plan for base model `Ministral`
 
159
  sliding_window=None,
160
  attn_implementation="sdpa",
161
  mask_token_id=-1,
162
+ dlm_type='llada',
163
+ random_length_prob=None,
164
+ num_ar_layers=0,
165
+ num_diffusion_layers=0,
166
+ diff_loss_weight=1,
167
+ enforce_mask=False,
168
+ prefix_ratio=0.8,
169
  dlm_paradigm='bidirectional',
170
+ dlm_arch='encoder',
171
  block_size=32,
172
+ tok_mask_half_life_ratio=None,
173
+ adaptive_mask_rate=False,
174
+ multi_sampling=None,
175
+ num_skip_loss_tokens=0,
176
  dlm_loss_weight=None,
177
  ar_loss_weight=1.0,
178
+ global_loss_avg=False,
179
  dp_varying_mask_ratio=False,
180
+ ada_perm_ratio_per_block=None,
181
+ ada_perm_ratio_global=None,
182
+ ada_dlm_loss_ratio=None,
183
  **kwargs,
184
  ):
185
  self.vocab_size = vocab_size
 
214
  self.attn_implementation = attn_implementation
215
 
216
  self.mask_token_id = mask_token_id
217
+ self.dlm_type = dlm_type
218
+ self.random_length_prob = random_length_prob
219
+ self.num_ar_layers = num_ar_layers
220
+ self.num_diffusion_layers = num_diffusion_layers
221
+ self.diff_loss_weight = diff_loss_weight
222
+ self.enforce_mask = enforce_mask
223
+ self.prefix_ratio = prefix_ratio
224
  self.dlm_paradigm = dlm_paradigm
225
+ self.dlm_arch = dlm_arch
226
  self.block_size = block_size
227
+ self.tok_mask_half_life_ratio = tok_mask_half_life_ratio
228
+ self.adaptive_mask_rate = adaptive_mask_rate
229
+ self.multi_sampling = multi_sampling
230
+ self.num_skip_loss_tokens = num_skip_loss_tokens
231
  self.dlm_loss_weight = dlm_loss_weight
232
  self.ar_loss_weight = ar_loss_weight
233
+ self.global_loss_avg = global_loss_avg
234
  self.dp_varying_mask_ratio = dp_varying_mask_ratio
235
+ self.ada_perm_ratio_per_block = ada_perm_ratio_per_block
236
+ self.ada_perm_ratio_global = ada_perm_ratio_global
237
+ self.ada_dlm_loss_ratio = ada_dlm_loss_ratio
238
  super().__init__(
239
  pad_token_id=pad_token_id,
240
  bos_token_id=bos_token_id,
 
244
  )
245
 
246
 
247
+ __all__ = ["MinistralDLMConfig"]
248
 
generation_config.json CHANGED
@@ -2,6 +2,6 @@
2
  "_from_model_config": true,
3
  "bos_token_id": 1,
4
  "eos_token_id": 2,
5
- "transformers_version": "5.0.0",
6
  "use_cache": false
7
  }
 
2
  "_from_model_config": true,
3
  "bos_token_id": 1,
4
  "eos_token_id": 2,
5
+ "transformers_version": "4.55.4",
6
  "use_cache": false
7
  }
model_cards/bias.md DELETED
@@ -1,4 +0,0 @@
1
- Field | Response
2
- :---------------------------------------------------------------------------------------------------|:---------------
3
- Participation considerations from adversely impacted groups [protected classes](https://www.senate.ca.gov/content/protected-classes) in model design and testing: | [None]
4
- Measures taken to mitigate against unwanted bias: | [None]
 
 
 
 
 
model_cards/explainability.md DELETED
@@ -1,13 +0,0 @@
1
- Field | Response
2
- :------------------------------------------------------------------------------------------------------|:---------------------------------------------------------------------------------
3
- Intended Task/Domain: | Text generation
4
- Model Type: | Transformer
5
- Intended Users: | Generative AI creators working with conversational AI models.
6
- Output: | Text (Responds to posed question, Stateful - remembers previous answers)
7
- Describe how the model works: | Text input is encoded into tokens and passed into a transformer-based language model, which returns a text response.
8
- Name the adversely impacted groups this has been tested to deliver comparable outcomes regardless of: | Not Applicable
9
- Technical Limitations & Mitigation: | The model cannot perform long-horizon reasoning and tool calling.
10
- Verified to have met prescribed NVIDIA quality standards: | Yes
11
- Performance Metrics: | Accuracy, Latency, Throughput
12
- Potential Known Risks: | In some instances, the model may think too long and struggle to derive final answers. The model's output can generate all forms of text, including what may be considered toxic, offensive, or indecent.
13
- Licensing: | nvidia-open-model-license.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
model_cards/privacy.md DELETED
@@ -1,11 +0,0 @@
1
- Field | Response
2
- :----------------------------------------------------------------------------------------------------------------------------------|:-----------------------------------------------
3
- Generatable or reverse engineerable personal data? | [No]
4
- Personal data used to create this model? | [No]
5
- Was consent obtained for any personal data used? | [Not Applicable]
6
- How often is dataset reviewed? | [During dataset creation, model training, evaluation, and the prerelease phase.]
7
- Was data from user interactions with the AI model (e.g. user input and prompts) used to train the model? | [Yes]
8
- Is there provenance for all datasets used in training? | Yes
9
- Does data labeling (annotation, metadata) comply with privacy laws? | Yes
10
- Is data compliant with data subject requests for data correction or removal, if such a request was made? | Not Applicable.
11
- Applicable Privacy Policy | https://www.nvidia.com/en-us/about-nvidia/privacy-policy/
 
 
 
 
 
 
 
 
 
 
 
 
model_cards/safety.md DELETED
@@ -1,6 +0,0 @@
1
- Field | Response
2
- :---------------------------------------------------|:----------------------------------
3
- Model Application Field(s): | [Media & Entertainment].
4
- Describe the life critical impact (if present). | Not Applicable
5
- Model and dataset restrictions: | The principle of least privilege (PoLP) is applied, limiting access for dataset generation and model development. Restrictions on dataset access are enforced during training, and dataset license constraints are adhered to.
6
- Use Case Restrictions: | Abide by nvidia-open-model-license.
 
 
 
 
 
 
 
modeling_ministral.py CHANGED
@@ -25,7 +25,7 @@ from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
25
  from transformers.processing_utils import Unpack
26
  from transformers.utils import TransformersKwargs, auto_docstring, can_return_tuple
27
  # from transformers.utils.generic import maybe_autocast
28
- from .configuration_nemotron_labs_diffusion import NemotronLabsDiffusionConfig
29
 
30
  #ALL_MASK_ATTENTION_FUNCTIONS._global_mapping['sdpa'] = sdpa_mask_older_torch
31
 
@@ -110,7 +110,7 @@ def _get_llama_4_attn_scale(positions_ids: torch.Tensor, beta: float, max_positi
110
  class Ministral3Attention(nn.Module):
111
  """Multi-headed attention from 'Attention Is All You Need' paper"""
112
 
113
- def __init__(self, config: NemotronLabsDiffusionConfig, layer_idx: int):
114
  super().__init__()
115
  self.config = config
116
  self.layer_idx = layer_idx
@@ -234,7 +234,7 @@ class Ministral3RMSNorm(nn.Module):
234
 
235
 
236
  class Ministral3DecoderLayer(GradientCheckpointingLayer):
237
- def __init__(self, config: NemotronLabsDiffusionConfig, layer_idx: int):
238
  super().__init__()
239
  self.hidden_size = config.hidden_size
240
 
@@ -284,7 +284,7 @@ class Ministral3DecoderLayer(GradientCheckpointingLayer):
284
 
285
  @auto_docstring
286
  class Ministral3PreTrainedModel(PreTrainedModel):
287
- config: NemotronLabsDiffusionConfig
288
  base_model_prefix = "model"
289
  supports_gradient_checkpointing = True
290
  _no_split_modules = ["Ministral3DecoderLayer"]
@@ -304,7 +304,7 @@ class Ministral3PreTrainedModel(PreTrainedModel):
304
  class Ministral3RotaryEmbedding(nn.Module):
305
  inv_freq: torch.Tensor # fix linting for `register_buffer`
306
 
307
- def __init__(self, config: NemotronLabsDiffusionConfig, device=None):
308
  super().__init__()
309
  self.max_seq_len_cached = config.max_position_embeddings
310
  self.original_max_seq_len = config.max_position_embeddings
@@ -323,7 +323,7 @@ class Ministral3RotaryEmbedding(nn.Module):
323
 
324
  @staticmethod
325
  def compute_default_rope_parameters(
326
- config: Optional[NemotronLabsDiffusionConfig] = None,
327
  device: Optional["torch.device"] = None,
328
  seq_len: Optional[int] = None,
329
  ) -> tuple["torch.Tensor", float]:
@@ -370,7 +370,7 @@ class Ministral3RotaryEmbedding(nn.Module):
370
 
371
  @auto_docstring
372
  class Ministral3Model(Ministral3PreTrainedModel):
373
- def __init__(self, config: NemotronLabsDiffusionConfig):
374
  super().__init__(config)
375
  self.padding_idx = config.pad_token_id
376
  self.vocab_size = config.vocab_size
@@ -453,7 +453,99 @@ class Ministral3Model(Ministral3PreTrainedModel):
453
  )
454
 
455
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
456
  __all__ = [
 
 
457
  "Ministral3Model",
458
  "Ministral3PreTrainedModel",
 
 
459
  ]
 
25
  from transformers.processing_utils import Unpack
26
  from transformers.utils import TransformersKwargs, auto_docstring, can_return_tuple
27
  # from transformers.utils.generic import maybe_autocast
28
+ from .configuration_ministral_dlm import MinistralDLMConfig
29
 
30
  #ALL_MASK_ATTENTION_FUNCTIONS._global_mapping['sdpa'] = sdpa_mask_older_torch
31
 
 
110
  class Ministral3Attention(nn.Module):
111
  """Multi-headed attention from 'Attention Is All You Need' paper"""
112
 
113
+ def __init__(self, config: MinistralDLMConfig, layer_idx: int):
114
  super().__init__()
115
  self.config = config
116
  self.layer_idx = layer_idx
 
234
 
235
 
236
  class Ministral3DecoderLayer(GradientCheckpointingLayer):
237
+ def __init__(self, config: MinistralDLMConfig, layer_idx: int):
238
  super().__init__()
239
  self.hidden_size = config.hidden_size
240
 
 
284
 
285
  @auto_docstring
286
  class Ministral3PreTrainedModel(PreTrainedModel):
287
+ config: MinistralDLMConfig
288
  base_model_prefix = "model"
289
  supports_gradient_checkpointing = True
290
  _no_split_modules = ["Ministral3DecoderLayer"]
 
304
  class Ministral3RotaryEmbedding(nn.Module):
305
  inv_freq: torch.Tensor # fix linting for `register_buffer`
306
 
307
+ def __init__(self, config: MinistralDLMConfig, device=None):
308
  super().__init__()
309
  self.max_seq_len_cached = config.max_position_embeddings
310
  self.original_max_seq_len = config.max_position_embeddings
 
323
 
324
  @staticmethod
325
  def compute_default_rope_parameters(
326
+ config: Optional[MinistralDLMConfig] = None,
327
  device: Optional["torch.device"] = None,
328
  seq_len: Optional[int] = None,
329
  ) -> tuple["torch.Tensor", float]:
 
370
 
371
  @auto_docstring
372
  class Ministral3Model(Ministral3PreTrainedModel):
373
+ def __init__(self, config: MinistralDLMConfig):
374
  super().__init__(config)
375
  self.padding_idx = config.pad_token_id
376
  self.vocab_size = config.vocab_size
 
453
  )
454
 
455
 
456
@auto_docstring
class Ministral3ForCausalLM(Ministral3PreTrainedModel, GenerationMixin):
    # Declares `lm_head.weight` as tied to the input embedding matrix; the
    # actual tying is performed by the `PreTrainedModel` machinery.
    _tied_weights_keys = {"lm_head.weight": "model.embed_tokens.weight"}
    # Tensor-parallel / pipeline-parallel placement hints for the LM head.
    _tp_plan = {"lm_head": "colwise_rep"}
    _pp_plan = {"lm_head": (["hidden_states"], ["logits"])}

    def __init__(self, config):
        super().__init__(config)
        # Backbone first, then the vocabulary projection on top of it.
        self.model = Ministral3Model(config)
        self.vocab_size = config.vocab_size
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Cache] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        logits_to_keep: Union[int, torch.Tensor] = 0,
        **kwargs: Unpack[TransformersKwargs],
    ) -> CausalLMOutputWithPast:
        r"""
        Example:

        ```python
        >>> from transformers import AutoTokenizer, Ministral3ForCausalLM

        >>> model = Ministral3ForCausalLM.from_pretrained("meta-ministral3/Ministral3-2-7b-hf")
        >>> tokenizer = AutoTokenizer.from_pretrained("meta-ministral3/Ministral3-2-7b-hf")

        >>> prompt = "Hey, are you conscious? Can you talk to me?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
        ```"""
        # Run the backbone; everything except `labels` / `logits_to_keep` is forwarded as-is.
        backbone_out: BaseModelOutputWithPast = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            cache_position=cache_position,
            **kwargs,
        )
        last_hidden = backbone_out.last_hidden_state

        # Project only the requested positions: an int keeps the trailing
        # `logits_to_keep` positions (0 -> slice(0, None), i.e. all of them),
        # anything else is used directly as the sequence index.
        if isinstance(logits_to_keep, int):
            keep = slice(-logits_to_keep, None)
        else:
            keep = logits_to_keep
        logits = self.lm_head(last_hidden[:, keep, :])

        if labels is None:
            loss = None
        else:
            loss = self.loss_function(logits=logits, labels=labels, vocab_size=self.config.vocab_size, **kwargs)

        return CausalLMOutputWithPast(
            loss=loss,
            logits=logits,
            past_key_values=backbone_out.past_key_values,
            hidden_states=backbone_out.hidden_states,
            attentions=backbone_out.attentions,
        )
530
+
531
+
532
class Ministral3ForTokenClassification(GenericForTokenClassification, Ministral3PreTrainedModel):
    """Token-classification variant; adds nothing on top of its two base classes."""
534
+
535
+
536
class Ministral3ForSequenceClassification(GenericForSequenceClassification, Ministral3PreTrainedModel):
    """Sequence-classification variant; adds nothing on top of its two base classes."""
538
+
539
+
540
class Ministral3ForQuestionAnswering(GenericForQuestionAnswering, Ministral3PreTrainedModel):
    """Question-answering variant; adds nothing on top of its two base classes."""
542
+
543
+
544
  __all__ = [
545
+ "Ministral3ForCausalLM",
546
+ "Ministral3ForQuestionAnswering",
547
  "Ministral3Model",
548
  "Ministral3PreTrainedModel",
549
+ "Ministral3ForSequenceClassification",
550
+ "Ministral3ForTokenClassification",
551
  ]
modeling_ministral_dlm.py ADDED
@@ -0,0 +1,1112 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import copy
2
+ from dataclasses import dataclass
3
+ from typing import Callable, Optional, Tuple, Union
4
+ import random
5
+ import os
6
+ import sys
7
+ import json
8
+ import numpy as np
9
+
10
+ import torch
11
+ import torch.nn.functional as F
12
+ from torch import nn
13
+ from transformers.modeling_outputs import CausalLMOutputWithPast, BaseModelOutput
14
+ from transformers.utils import ModelOutput
15
+
16
+ from torch.nn.attention.flex_attention import BlockMask, flex_attention, create_block_mask, or_masks
17
+
18
+ from transformers.modeling_flash_attention_utils import FlashAttentionKwargs
19
+
20
+ from transformers.processing_utils import Unpack
21
+
22
+ from transformers.cache_utils import Cache, DynamicCache
23
+
24
+ from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
25
+
26
+ from transformers.generation import GenerationMixin
27
+
28
+ import math
29
+
30
+ from .chat_utils import generate_with_prefix_cache_block_diff
31
+ from .modeling_ministral import Ministral3Model, Ministral3PreTrainedModel, Ministral3Attention, apply_rotary_pos_emb, repeat_kv, _get_llama_4_attn_scale
32
+ from .configuration_ministral_dlm import MinistralDLMConfig
33
+
34
+
35
@dataclass
class MinistralDiffOutputWithPast(ModelOutput):
    """Output container for the Ministral diffusion LM; every field is optional.

    Annotations use ``Optional[...]`` rather than ``X | None``: dataclass field
    annotations are evaluated when the class is defined, and ``SomeClass | None``
    raises ``TypeError`` on Python 3.9.
    """

    # Scalar training loss, when one was computed.
    loss: Optional[torch.FloatTensor] = None
    # Vocabulary logits (presumably from the diffusion head -- confirm against the model's forward).
    logits: Optional[torch.FloatTensor] = None
    # Second set of logits; by its name, from a causal/autoregressive pass -- confirm against the model's forward.
    causal_logits: Optional[torch.FloatTensor] = None
    # Key/value cache object passed through for incremental decoding.
    past_key_values: Optional[Cache] = None
    # Tuple of hidden-state tensors, when returned.
    hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None
    # Tuple of attention tensors, when returned.
    attentions: Optional[tuple[torch.FloatTensor, ...]] = None
43
+
44
+
45
+ # @torch.compile(dynamic=True, mode="reduce-overhead")
46
+ # @torch.compile(mode="default")
47
+ # @torch.compile(fullgraph=True, mode="reduce-overhead", dynamic=False)
48
+ @torch.compile(fullgraph=True, mode="max-autotune-no-cudagraphs", dynamic=False)
49
+ def fused_flex_attention(q, k, v, block_mask=None):
50
+ return flex_attention(q, k, v, block_mask=block_mask)
51
+
52
+
53
def _crop_dynamic_cache(past_key_values: "DynamicCache", max_length: int):
    """Truncate every layer of a cache to its first ``max_length`` positions.

    Uses the cache's own ``crop`` method when it has one; otherwise slices the
    legacy ``key_cache`` / ``value_cache`` lists along dim 2 and updates
    ``_seen_tokens`` by hand.
    """
    # Preferred path: let the cache implementation crop itself.
    if hasattr(past_key_values, 'crop'):
        past_key_values.crop(max_length)
        return

    # Fallback for cache objects exposing per-layer tensor lists.
    num_layers = len(past_key_values)
    for idx in range(num_layers):
        layer_keys = past_key_values.key_cache[idx]
        layer_values = past_key_values.value_cache[idx]
        past_key_values.key_cache[idx] = layer_keys[:, :, :max_length]
        past_key_values.value_cache[idx] = layer_values[:, :, :max_length]
    past_key_values._seen_tokens = max_length
62
+
63
+
64
def _extract_draft_kv_cache(past_key_values: "DynamicCache", clean_len: int, block_length: int):
    """Thin the cache after quadratic decoding, in place.

    For every layer, keeps the first ``clean_len`` positions plus every
    ``(block_length + 1)``-th position after them (the first token of each
    draft block), then records ``clean_len + block_length`` as ``_seen_tokens``.
    Works with caches exposing ``layers[i].keys/values`` as well as the older
    ``key_cache`` / ``value_cache`` lists.
    """
    stride = block_length + 1
    has_layers = hasattr(past_key_values, 'layers')

    def _clean_plus_block_heads(states):
        # dim 2 is the sequence axis of the cached tensors.
        clean_part = states[:, :, :clean_len]
        draft_heads = states[:, :, clean_len::stride]
        return torch.cat([clean_part, draft_heads], dim=2)

    for idx in range(len(past_key_values)):
        if has_layers:
            layer = past_key_values.layers[idx]
            thinned_k = _clean_plus_block_heads(layer.keys)
            thinned_v = _clean_plus_block_heads(layer.values)
            layer.keys = thinned_k
            layer.values = thinned_v
        else:
            thinned_k = _clean_plus_block_heads(past_key_values.key_cache[idx])
            thinned_v = _clean_plus_block_heads(past_key_values.value_cache[idx])
            past_key_values.key_cache[idx] = thinned_k
            past_key_values.value_cache[idx] = thinned_v

    past_key_values._seen_tokens = clean_len + block_length
87
+
88
+
89
+ # with reference to https://github.com/pytorch-labs/attention-gym/blob/main/examples/flex_attn.ipynb
90
class MinistralFlexAttention(Ministral3Attention):
    """Ministral3 attention that runs through ``flex_attention`` with a ``BlockMask``.

    The mask layout is chosen by ``self.mode`` (initially ``config.dlm_paradigm``):
    ``'bidirectional'``, ``'autoregressive'``, ``'block_diff'`` or
    ``'sbd_block_diff'``. When ``config.tidar_inference_mode`` is set
    (``'quadratic'`` or ``'default'``), ``forward`` builds a dedicated inference
    mask instead.
    """

    _KNOWN_MODES = ('bidirectional', 'autoregressive', 'block_diff', 'sbd_block_diff')

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

        self.block_size_orig = self.config.block_size

        if self.config.dlm_paradigm not in self._KNOWN_MODES:
            raise ValueError(f"Unknown attention mode: {self.config.dlm_paradigm}")

        # All masks start empty and are resolved in forward(), where the
        # sequence length is known. (Building one here is impossible:
        # compute_block_mask() requires q_len.) Defining every slot up front
        # also keeps forward() working after set_attention_mode() switches to
        # a mode other than the configured one.
        self.bidirectional_mask = None
        self.autoregressive_mask = None
        self.block_diff_mask = None
        self.sbd_block_diff_mask = None

        self.block_size = self.block_size_orig
        self.mode = self.config.dlm_paradigm
        # block_length -> BlockMask cache for quadratic decoding.
        self._quadratic_block_mask = {}

        # NOTE: process-wide side effect -- raises the torch._dynamo recompile
        # cache limit for everything in this process.
        import torch._dynamo.config as dcfg
        dcfg.cache_size_limit = 512

    def _get_sbd_inference_quadratic_decoding_block_mask(self, block_length: int):
        """Return (building and caching on first use) the quadratic-decoding mask.

        The mask covers ``draft_len = block_length * (block_length + 1)`` queries
        against ``draft_len + config.max_position_embeddings`` keys, laid out as
        [draft tokens | clean tokens] (forward() reorders the KV states to match).
        """
        if block_length not in self._quadratic_block_mask:
            draft_len = block_length * (block_length + 1)

            def quadratic(b, h, q_idx, kv_idx):
                # Draft keys that open a block, visible to queries at or after them.
                first_clean = torch.logical_and(
                    kv_idx % (block_length + 1) == 0,
                    kv_idx < draft_len,
                )
                first_clean = torch.logical_and(first_clean, q_idx >= kv_idx)
                # Within-block attention for every draft query except the block opener.
                block_q = q_idx // (block_length + 1)
                block_kv = kv_idx // (block_length + 1)
                same_block = torch.logical_and(block_q == block_kv, q_idx < draft_len)
                same_block_except_first = torch.logical_and(
                    same_block,
                    q_idx % (block_length + 1) != 0,
                )
                draft_part = torch.logical_or(first_clean, same_block_except_first)
                # Everything past the draft region is always visible.
                clean_part = kv_idx >= draft_len
                return torch.logical_or(draft_part, clean_part)

            block_mask = create_block_mask(
                quadratic,
                B=None,
                H=None,
                Q_LEN=draft_len,
                KV_LEN=draft_len + self.config.max_position_embeddings,
                device="cuda",
            )

            self._quadratic_block_mask[block_length] = block_mask

        return self._quadratic_block_mask[block_length]

    def set_attention_mode(self, mode, block_size=None):
        """Switch the mask layout used by forward().

        Args:
            mode: one of the known attention modes.
            block_size: block size for the block-diffusion modes. ``None`` falls
                back to the configured block size instead of leaving
                ``self.block_size`` unset, which would make the block-diffusion
                modes fail when their mask is built.
        """
        self.mode = mode
        self.block_size = self.block_size_orig if block_size is None else block_size

    def compute_block_mask(self, mode, q_len, block_size=None):
        """Build a square ``BlockMask`` of size ``q_len`` for ``mode``.

        Args:
            mode: one of the known attention modes.
            q_len: query (and key/value) length of the mask.
            block_size: required for ``'block_diff'`` and ``'sbd_block_diff'``.

        Raises:
            ValueError: unknown ``mode``, or a block-diffusion mode without
                ``block_size``.
        """

        def bidirectional_mask(b, h, q, kv):
            # Always true, but expressed on the index tensors so the result is a tensor.
            return (q >= kv) | (q < kv)

        def autoregressive_mask(b, h, q, kv):
            return (q >= kv)

        def block_diff_mask(block_size, b, h, q_idx, kv_idx, n):
            """
            Constructs the specialized block diffusion attention mask for training
            composed of three masks:
            - **Block Diagonal Mask (M_BD)**: Self-attention within noised blocks
            - **Offset Block Causal Mask (M_OBC)**: Cross-attention for conditional context
            - **Block Causal Mask (M_BC)**: Attention to update x0
            Args:
                block_size: Defines the block structure.
                b, h: Batch and head indices (ignored for mask logic).
                q_idx, kv_idx: Query and Key indices.
                n: Index where the x0 half starts; positions below n are xt.
            Returns:
                A boolean attention mask.
            """

            # Indicate whether token belongs to xt or x0
            x0_flag_q = (q_idx >= n)
            x0_flag_kv = (kv_idx >= n)

            # Compute block indices
            block_q = torch.where(x0_flag_q == 1,
                                  (q_idx - n) // block_size,
                                  q_idx // block_size)
            block_kv = torch.where(x0_flag_kv == 1,
                                   (kv_idx - n) // block_size,
                                   kv_idx // block_size)

            # **1. Block Diagonal Mask (M_BD) **
            block_diagonal = (block_q == block_kv) & (x0_flag_q == x0_flag_kv)

            # **2. Offset Block-Causal Mask (M_OBC) **
            offset_block_causal = (
                (block_q > block_kv)
                & (x0_flag_kv == 1)
                & (x0_flag_q == 0)
            )

            # **3. Block-Causal Mask (M_BC) **
            block_causal = (block_q >= block_kv) & (x0_flag_kv == 1) & (x0_flag_q == 1)

            # **4. Combine Masks **
            return block_diagonal | offset_block_causal | block_causal

        def sbd_block_diff_mask(block_size, b, h, q_idx, kv_idx, n):
            """Like block_diff_mask, except the x0 half is token-level causal and
            the block-diagonal term applies to the xt half only."""
            x0_flag_q = (q_idx >= n)
            x0_flag_kv = (kv_idx >= n)

            # Compute block indices
            block_q = torch.where(x0_flag_q == 1,
                                  (q_idx - n) // block_size,
                                  q_idx // block_size)
            block_kv = torch.where(x0_flag_kv == 1,
                                   (kv_idx - n) // block_size,
                                   kv_idx // block_size)

            # **1. Block Diagonal Mask (M_BD) **
            block_diagonal = (block_q == block_kv) & (x0_flag_kv == 0) & (x0_flag_q == 0)

            # **2. Offset Block-Causal Mask (M_OBC) **
            offset_block_causal = (
                (block_q > block_kv)
                & (x0_flag_kv == 1)
                & (x0_flag_q == 0)
            )

            # **3. Fully Causal Mask (M_BC) **
            fully_causal = (q_idx >= kv_idx) & (x0_flag_kv == 1) & (x0_flag_q == 1)

            # **4. Combine Masks **
            return block_diagonal | offset_block_causal | fully_causal

        if mode == 'bidirectional':
            attn_mask = bidirectional_mask
        elif mode == 'autoregressive':
            attn_mask = autoregressive_mask
        elif mode in ('block_diff', 'sbd_block_diff'):
            if block_size is None:
                raise ValueError(f"block_size is required for attention mode: {mode}")
            mask_fn = block_diff_mask if mode == 'block_diff' else sbd_block_diff_mask
            # The sequence is [xt | x0]; the x0 half starts at q_len // 2.
            attn_mask = lambda b, h, q, kv: mask_fn(block_size, b, h, q, kv, q_len // 2)
        else:
            raise ValueError(f"Unknown attention mode: {mode}")

        block_mask = create_block_mask(
            attn_mask, B=None, H=None, Q_LEN=q_len, KV_LEN=q_len
        )

        return block_mask

    def _select_block_mask(self, q_len):
        """Return the stored mask for the current mode, or build one for ``q_len``.

        NOTE(review): freshly built masks are not stored back on the module
        (same as before this change), so a mask is rebuilt on every call unless
        one of the ``*_mask`` attributes was populated externally.
        """
        mode = self.mode
        if mode not in self._KNOWN_MODES:
            raise ValueError(f"Unknown attention mode: {self.mode}")

        stored = getattr(self, f"{mode}_mask", None)
        stale = stored is None or q_len != stored.shape[-2]

        if mode in ('block_diff', 'sbd_block_diff'):
            if stale or self.block_size != self.block_size_orig:
                return self.compute_block_mask(mode=mode, block_size=self.block_size, q_len=q_len)
            return stored

        if stale:
            return self.compute_block_mask(mode=mode, q_len=q_len)
        return stored

    def _project_attention(self, query_states, key_states, value_states, block_mask, input_shape, attention_fn):
        """Expand KV heads, run ``attention_fn`` and apply the output projection."""
        key_states = repeat_kv(key_states, self.num_key_value_groups)
        value_states = repeat_kv(value_states, self.num_key_value_groups)
        attn_output = attention_fn(query_states, key_states, value_states, block_mask=block_mask)
        attn_output = attn_output.transpose(1, 2).reshape(*input_shape, -1).contiguous()
        return self.o_proj(attn_output)

    def forward(
        self,
        hidden_states: torch.Tensor,
        position_embeddings: Tuple[torch.Tensor, torch.Tensor],
        attention_mask: Optional[torch.Tensor],
        past_key_values: Optional[Cache] = None,
        cache_position: Optional[torch.LongTensor] = None,
        is_training: bool = True,
        **kwargs: Unpack[FlashAttentionKwargs],
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
        """Compute attention with the block mask matching the current mode.

        Args:
            hidden_states: input of shape (batch, seq, hidden).
            position_embeddings: ``(cos, sin)`` rotary tables.
            attention_mask: accepted for interface compatibility; not used here
                (masking comes from the ``BlockMask``).
            past_key_values: optional KV cache, updated in place when given.
            cache_position: positions passed to the cache update and to the
                llama-4 style query scaling.
            is_training: when true in a block-diffusion mode, the sequence is
                treated as two halves and RoPE is applied to each half separately.

        Returns:
            ``(attn_output, None)`` -- attention weights are never returned.

        Raises:
            ValueError: unknown attention mode or ``tidar_inference_mode``, or a
                tidar mode without a block length in the config.
        """
        _, q_len, _ = hidden_states.size()
        input_shape = hidden_states.shape[:-1]
        hidden_shape = (*input_shape, -1, self.head_dim)

        query_states = self.q_proj(hidden_states).view(hidden_shape).transpose(1, 2)
        key_states = self.k_proj(hidden_states).view(hidden_shape).transpose(1, 2)
        value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2)

        cos, sin = position_embeddings

        if self.mode in ['block_diff', 'sbd_block_diff'] and is_training:
            # Split query and key states in half along sequence length dimension
            q1, q2 = query_states.chunk(2, dim=2)
            k1, k2 = key_states.chunk(2, dim=2)

            # Apply RoPE independently to each half
            q1, k1 = apply_rotary_pos_emb(q1, k1, cos, sin)
            q2, k2 = apply_rotary_pos_emb(q2, k2, cos, sin)

            # Recombine the halves
            query_states = torch.cat([q1, q2], dim=2)
            key_states = torch.cat([k1, k2], dim=2)
        else:
            query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)

        query_states = query_states * _get_llama_4_attn_scale(
            cache_position,
            self.config.rope_parameters.get("llama_4_scaling_beta"),
            self.config.rope_parameters.get("original_max_position_embeddings"),
        ).to(query_states.dtype)

        if past_key_values is not None:
            # sin and cos are specific to RoPE models; cache_position needed for the static cache
            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
            key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs)

        tidar_inference_mode = getattr(self.config, "tidar_inference_mode", None)

        if tidar_inference_mode == "quadratic":
            block_length = getattr(self.config, "block_length", None) or getattr(self.config, "block_size", None)
            if block_length is None:
                raise ValueError("SBD quadratic decoding requires block_length in config.")
            if past_key_values is not None:
                seq_len = key_states.shape[2]
                draft_len = block_length * (block_length + 1)

                # Reorder KV to [draft | clean] so it lines up with the cached mask layout.
                clean_keys = key_states[:, :, :-draft_len]
                draft_keys = key_states[:, :, -draft_len:]
                clean_values = value_states[:, :, :-draft_len]
                draft_values = value_states[:, :, -draft_len:]
                key_states = torch.cat([draft_keys, clean_keys], dim=2)
                value_states = torch.cat([draft_values, clean_values], dim=2)

                block_mask: BlockMask = self._get_sbd_inference_quadratic_decoding_block_mask(
                    block_length=block_length
                )
                # Tell the (over-sized) cached mask the actual lengths for this call.
                block_mask.seq_lengths = (draft_len, seq_len)
            else:
                seq_len = query_states.shape[2]
                draft_len = block_length * (block_length + 1)
                clean_len = seq_len - draft_len

                def _causal_mask(b, h, q_idx, kv_idx):
                    return torch.logical_and(q_idx >= kv_idx, q_idx < clean_len)

                def _draft2clean_mask(b, h, q_idx, kv_idx):
                    full_clean = torch.logical_and(q_idx >= clean_len, kv_idx <= clean_len)
                    first_clean = torch.logical_and(
                        q_idx >= clean_len, (kv_idx - clean_len) % (block_length + 1) == 0
                    )
                    first_clean = torch.logical_and(first_clean, q_idx >= kv_idx)
                    return torch.logical_or(full_clean, first_clean)

                def _draft_mask(b, h, q_idx, kv_idx):
                    block_q = (q_idx - clean_len) // (block_length + 1)
                    block_kv = (kv_idx - clean_len) // (block_length + 1)
                    quadrant = torch.logical_and(q_idx >= clean_len, kv_idx >= clean_len)
                    same_block = torch.logical_and(block_q == block_kv, quadrant)
                    same_block_except_first = torch.logical_and(
                        same_block,
                        (q_idx - clean_len) % (block_length + 1) != 0,
                    )
                    return torch.logical_and(block_q == block_kv, same_block_except_first)

                mask = or_masks(_causal_mask, _draft2clean_mask)
                mask = or_masks(mask, _draft_mask)

                block_mask = create_block_mask(
                    mask, B=None, H=None, Q_LEN=seq_len, KV_LEN=seq_len,
                )

            attn_output = self._project_attention(
                query_states, key_states, value_states, block_mask, input_shape, flex_attention
            )
            return attn_output, None

        if tidar_inference_mode == "default":
            block_length = getattr(self.config, "block_length", None) or getattr(self.config, "block_size", None)
            if block_length is None:
                raise ValueError("SBD default decoding requires block_length in config.")
            seq_len = query_states.shape[2]
            prefix_len = seq_len - block_length

            def _clean_q_mask(b, h, q_idx, kv_idx):
                # Prefix queries are causal.
                return torch.logical_and(q_idx >= kv_idx, q_idx < prefix_len)

            def _noisy_q_mask(b, h, q_idx, kv_idx):
                # Queries in the trailing block see every key.
                return q_idx >= prefix_len

            block_mask = create_block_mask(
                or_masks(_clean_q_mask, _noisy_q_mask),
                B=None,
                H=None,
                Q_LEN=seq_len,
                KV_LEN=seq_len,
            )

            attn_output = self._project_attention(
                query_states, key_states, value_states, block_mask, input_shape, flex_attention
            )
            return attn_output, None

        if tidar_inference_mode is not None:
            # Previously an unrecognised value fell through and returned None implicitly.
            raise ValueError(f"Unknown tidar_inference_mode: {tidar_inference_mode}")

        # Regular path: mask chosen by the current attention mode, compiled kernel.
        block_mask = self._select_block_mask(q_len)
        attn_output = self._project_attention(
            query_states, key_states, value_states, block_mask, input_shape, fused_flex_attention
        )

        return attn_output, None
426
+
427
+
428
def gumbel_topk(log_w: torch.Tensor, k: int) -> torch.Tensor:
    """Sample k positions without replacement via the Gumbel top-k trick.

    Gumbel noise is added to ``log_w`` and the ``k`` largest perturbed scores
    are selected. Returns a bool tensor shaped like ``log_w`` with exactly
    ``k`` entries set to True.
    """
    eps = 1e-9
    uniform = torch.rand_like(log_w)
    # Gumbel(0, 1) sample; eps keeps both logs away from log(0).
    gumbel_noise = -torch.log(-torch.log(uniform + eps) + eps)
    perturbed = log_w + gumbel_noise
    chosen = torch.topk(perturbed, k).indices

    selected = torch.zeros_like(log_w, dtype=torch.bool)
    selected[chosen] = True
    return selected
435
+
436
+
437
+ class MinistralDiffEncoderModel(Ministral3PreTrainedModel, GenerationMixin):
438
+ """
439
+ A single model with:
440
+ - a bidirectional encoder + diffusion‐LM head over A
441
+ - a causal decoder + LM head over B, conditioned on F_A
442
+ """
443
+
444
def __init__(self, config: MinistralDLMConfig):
    """Build the encoder backbone and the vocabulary projection head.

    A deep copy of ``config`` is specialised for the encoder according to
    ``config.dlm_paradigm``:

    * ``'block_diff'`` / ``'sbd_block_diff'``: ``MinistralFlexAttention``, ``diffusion_lm=True``
    * ``'bidirectional'``: ``Ministral3Attention``, ``diffusion_lm=True``
    * ``'autoregressive'``: ``Ministral3Attention``, ``diffusion_lm=False``

    Raises:
        ValueError: If ``config.dlm_paradigm`` is none of the values above.
    """
    super().__init__(config)

    self.mask_token_id = config.mask_token_id

    paradigm = config.dlm_paradigm
    # The caller's config is left untouched; only the encoder's private copy is edited.
    encoder_config = copy.deepcopy(config)

    if paradigm in ('block_diff', 'sbd_block_diff'):
        encoder_config.diffusion_lm = True
        encoder_config.attn_class = MinistralFlexAttention
    elif paradigm in ('bidirectional', 'autoregressive'):
        encoder_config.diffusion_lm = paradigm != 'autoregressive'
        encoder_config.attn_class = Ministral3Attention
    else:
        raise ValueError(f"Unsupported DLM paradigm: {config.dlm_paradigm}")

    self.encoder = Ministral3Model(encoder_config)
    self.diffusion_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
    self.vocab_size = config.vocab_size

    # Starts unset; forward() asserts it is not None when config.ada_dlm_loss_ratio is used.
    self.current_iter_ratio = None

    self.post_init()
469
+
470
+
471
def get_input_embeddings(self):
    """Return the encoder's ``embed_tokens`` attribute."""
    embedding_table = self.encoder.embed_tokens
    return embedding_table
473
+
474
def set_input_embeddings(self, value):
    """Replace the encoder's ``embed_tokens`` attribute with ``value``."""
    setattr(self.encoder, "embed_tokens", value)
476
+
477
def get_output_embeddings(self):
    """Return ``diffusion_head``, the layer that maps hidden states to vocabulary logits."""
    output_head = self.diffusion_head
    return output_head
479
+
480
def set_output_embeddings(self, new_embeddings):
    """Replace ``diffusion_head`` with ``new_embeddings``."""
    setattr(self, "diffusion_head", new_embeddings)
482
+
483
+
484
def forward_process(self, input_ids, eps=1e-3, block_size=None, loss_mask=None):
    """Corrupt ``input_ids`` by replacing a random subset of tokens with the mask token.

    One masking level ``t`` is drawn per sequence and every token of that
    sequence is masked independently with probability ``(1 - eps) * t + eps``.

    Args:
        input_ids: 2-D tensor ``(batch, length)`` of token ids.
        eps: Lower bound of the per-token masking probability.
        block_size: Only read when ``config.adaptive_mask_rate`` is set, where it
            positions the sampling window of ``t`` (and must not be None).
        loss_mask: Optional tensor indexable against ``input_ids``; positions
            where it equals 0 are never masked.

    Returns:
        ``(noisy_batch, masked_indices, p_mask)``: the corrupted ids, the boolean
        mask of replaced positions and the ``(batch, length)`` masking
        probabilities.
    """
    b, l = input_ids.shape
    device = input_ids.device

    if self.config.dp_varying_mask_ratio:
        # Give every data-parallel rank its own stream for sampling t.
        import torch.distributed as dist

        dp_rank = 0
        if dist.is_available() and dist.is_initialized():
            try:
                dp_rank = dist.get_rank()
            except Exception:
                # Best effort: fall back to rank 0 if the rank cannot be queried.
                dp_rank = 0

        # Take the base seed as one draw from the global RNG stream.
        # torch.seed() would instead *reseed* the global RNG with a
        # non-deterministic value, which makes runs irreproducible even after
        # torch.manual_seed(); its 64-bit result could also overflow the seed
        # range once dp_rank is added.
        base_seed = int(torch.randint(0, 2**31 - 1, (1,)).item())
        generator = torch.Generator(device=device)
        generator.manual_seed(base_seed + dp_rank)
    else:
        generator = None

    if self.config.adaptive_mask_rate:
        assert block_size is not None

        # --- simple linear window mapping ---
        bs_min = getattr(self.config, "t_bs_min", 16)
        bs_max = getattr(self.config, "t_bs_max", 128)
        w = getattr(self.config, "t_window_width", 0.6)  # fixed window width

        # Position of block_size inside [bs_min, bs_max]; not clamped here.
        frac = (float(block_size) - float(bs_min)) / max(1.0, float(bs_max - bs_min))
        # Upper bound falls linearly from 1.0 (at bs_min) to 1.0 - w (at bs_max) ...
        u_max = 1.0 - w * frac
        # ... and is clamped to [0.6, 1.0] so block sizes outside the range stay bounded.
        u_max = max(0.6, min(1.0, u_max))
        u_min = u_max - w  # keeps the window width equal to w

        # sample t ~ Uniform(u_min, u_max)
        t = u_min + (u_max - u_min) * torch.rand(b, device=device, generator=generator)
    else:
        t = torch.rand(b, device=device, generator=generator)

    p_mask = (1 - eps) * t + eps  # shape: (b,)
    p_mask = p_mask[:, None].expand(-1, l)  # shape: (b, l)

    # The per-token draws use the global RNG; only t uses the per-rank generator.
    masked_indices = torch.rand((b, l), device=device) < p_mask

    if loss_mask is not None:
        masked_indices[loss_mask == 0] = 0

    noisy_batch = torch.where(masked_indices, self.mask_token_id, input_ids)

    return noisy_batch, masked_indices, p_mask
535
+
536
+
537
def forward_process_exp(
    self,
    input_ids: torch.Tensor,
    eps: float = 1e-3,
    block_size: int | None = None,
    half_life_ratio: float = 0.25,  # λ = ln 2 / (half_life_ratio·L)
    loss_mask: Optional[torch.Tensor] = None,
):
    """Mask an exact number of tokens per pool, favouring late positions at low ratios.

    For each pool (the whole sequence, or every contiguous ``block_size`` chunk
    when ``block_size`` is given; the last chunk may be shorter):

    1. draw a ratio ``m`` uniformly from ``[eps, 1)`` and turn it into a budget
       ``k = round(m * pool_len)``;
    2. select exactly ``k`` positions with ``gumbel_topk`` using log-weights
       ``λ · (1 − m) · i``, where ``λ = ln 2 / (half_life_ratio · pool_len)`` and
       ``i`` is the position inside the pool. The weights are flat when ``m → 1``
       and grow towards the end of the pool when ``m → 0``.

    In whole-sequence mode the budget is at least 1; in block mode a block whose
    budget rounds to 0 is left unmasked.

    Args:
        input_ids: ``(B, L)`` tensor of token ids.
        eps: Minimum corruption ratio.
        block_size: If not None, sample ``m`` and the positions per block.
        half_life_ratio: Controls how steep the position weights are when ``m → 0``.
        loss_mask: Optional tensor; positions where it equals 0 are never masked.

    Returns:
        ``(noisy_batch, masked_indices, p_mask)`` where ``p_mask`` holds the ratio
        ``m`` drawn for the pool each position belongs to.
    """
    n_rows, n_cols = input_ids.shape
    device = input_ids.device
    weight_dtype = torch.float32

    masked_indices = torch.zeros((n_rows, n_cols), dtype=torch.bool, device=device)
    p_mask = torch.zeros((n_rows, n_cols), dtype=weight_dtype, device=device)

    for row in range(n_rows):
        if block_size is not None:
            # ---------- one independent pool per block ----------
            n_blocks = math.ceil(n_cols / block_size)
            decay = math.log(2.0) / (half_life_ratio * block_size)  # λ at full steepness

            for blk in range(n_blocks):
                lo = blk * block_size
                hi = min(lo + block_size, n_cols)
                span = hi - lo

                ratio = eps + (1.0 - eps) * torch.rand(1, device=device).item()
                p_mask[row, lo:hi] = ratio

                budget = max(0, min(int(round(ratio * span)), span))
                if budget == 0:
                    continue

                steepness = 1.0 - ratio  # 0 ⇒ uniform, 1 ⇒ late-heavy
                offsets = torch.arange(span, device=device, dtype=weight_dtype)
                masked_indices[row, lo:hi] = gumbel_topk(decay * steepness * offsets, budget)
            continue

        # ---------- a single pool spanning the whole sequence ----------
        ratio = eps + (1.0 - eps) * torch.rand(1, device=device).item()
        budget = max(1, min(int(round(ratio * n_cols)), n_cols))  # clamp to [1, L]

        p_mask[row, :] = ratio

        steepness = 1.0 - ratio  # 0 ⇒ uniform, 1 ⇒ late-heavy
        decay = math.log(2.0) / (half_life_ratio * n_cols)  # λ at full steepness
        offsets = torch.arange(n_cols, device=device, dtype=weight_dtype)
        masked_indices[row] = gumbel_topk(decay * steepness * offsets, budget)

    if loss_mask is not None:
        masked_indices[loss_mask == 0] = 0

    noisy_batch = torch.where(masked_indices, self.mask_token_id, input_ids)
    return noisy_batch, masked_indices, p_mask
624
+
625
+
626
def forward(
    self,
    input_ids: torch.LongTensor,
    attention_mask: Optional[torch.Tensor] = None,
    position_ids: Optional[torch.LongTensor] = None,
    labels: Optional[torch.LongTensor] = None,
    split_len: Optional[int] = None,
    past_key_values: Optional[Cache] = None,
    block_size: Optional[int] = None,
    block_diff_ppl: bool = False,
    eps: float = 1e-3,
    is_teacher: bool = False,
    masked_indices: Optional[torch.Tensor] = None,
    p_mask: Optional[torch.Tensor] = None,
    teacher_logits: Optional[torch.Tensor] = None,
    masked_indices_teacher: Optional[torch.Tensor] = None,
    loss_mask: Optional[torch.Tensor] = None,
    ce_loss_weight: float = 1.0,
    output_last_hidden_states_only: bool = False,
    skip_loss: bool = False,
    **kwargs,
) -> CausalLMOutputWithPast:
    """Run the encoder and, when ``labels`` is given, compute the training loss.

    Behaviour is selected by ``config.dlm_paradigm``:

    * ``'autoregressive'``: inputs are fed unchanged and the loss is a shifted
      next-token cross-entropy (optionally restricted by ``loss_mask``).
    * ``'bidirectional'``: inputs are corrupted with mask tokens and the loss is
      a cross-entropy over masked positions divided by ``p_mask``.
    * ``'block_diff'`` / ``'sbd_block_diff'``: as above, but the clean sequence
      is concatenated after the noisy one before encoding; for
      ``'sbd_block_diff'`` the logits of the clean half additionally feed a
      next-token loss weighted by ``config.ar_loss_weight``.

    Args:
        input_ids: ``(batch, length)`` token ids.
        attention_mask: Forwarded to the encoder.
        position_ids: Forwarded to the encoder; synthesised for the block
            paradigms in training and for ``block_diff_ppl`` when None.
        labels: Targets; their presence switches on corruption and the loss.
        past_key_values: Forwarded to the encoder.
        block_size: Block size handed to the attention layers for the block
            paradigms; sampled or taken from the config when None in training.
        block_diff_ppl: Treated as training mode for the encoder call.
        eps: Minimum masking ratio passed to the corruption routines.
        is_teacher: If True, the ``loss`` field of the output carries the logits.
        masked_indices: Pre-computed mask. NOTE(review): when ``loss_mask`` is
            also given this tensor is modified in place, and ``p_mask`` must be
            supplied with it because the loss divides by ``p_mask``.
        p_mask: Per-position masking probabilities matching ``masked_indices``.
        loss_mask: Positions equal to 0 are excluded from masking / the loss.
        output_last_hidden_states_only: Return only the encoder hidden states.
        skip_loss: Skip loss computation even when ``labels`` is given.
        split_len, teacher_logits, masked_indices_teacher, ce_loss_weight:
            Accepted but not read by this method.
        **kwargs: Forwarded to the encoder.

    Returns:
        ``BaseModelOutput`` when ``output_last_hidden_states_only`` is set,
        otherwise ``MinistralDiffOutputWithPast``. With
        ``config.global_loss_avg`` the loss of the non-autoregressive paradigms
        is a ``(summed_loss, token_count)`` tuple.

    Raises:
        ValueError: If ``config.dlm_paradigm`` is not one of the four values above.
    """

    batch_size, seq_len = input_ids.shape

    if self.config.dlm_paradigm == 'bidirectional' or self.config.dlm_paradigm == 'autoregressive':
        # Random-length augmentation: occasionally train on a prefix of the batch.
        if labels is not None and torch.rand(1) < self.config.random_length_prob:
            random_length = torch.randint(2, input_ids.shape[1] + 1, (1,))
            input_ids = input_ids[:, :random_length]
            labels = labels[:, :random_length]

            if attention_mask is not None:
                attention_mask = attention_mask[:, :random_length]
            if position_ids is not None:
                position_ids = position_ids[:, :random_length]
            if loss_mask is not None:
                loss_mask = loss_mask[:, :random_length]

    elif self.config.dlm_paradigm in ['block_diff', 'sbd_block_diff']:
        if labels is not None and block_size is None:
            if torch.rand(1) < self.config.random_length_prob:
                # Multiples of 4 in [4, 28]: randint's upper bound is exclusive.
                block_size = torch.randint(1, 8, (1,)).item() * 4
            else:
                block_size = self.config.block_size

    else:
        raise ValueError(f"Unknown dLM paradigm: {self.config.dlm_paradigm}")

    if labels is not None and self.config.dlm_paradigm != 'autoregressive':
        if masked_indices is not None:
            # assert p_mask is not None

            if loss_mask is not None:
                masked_indices[loss_mask == 0] = 0

            noisy_inputs = torch.where(masked_indices, self.mask_token_id, input_ids)

        else:
            if self.config.tok_mask_half_life_ratio is not None:
                noisy_inputs, masked_indices, p_mask = self.forward_process_exp(input_ids, eps=eps, block_size=block_size, half_life_ratio=self.config.tok_mask_half_life_ratio, loss_mask=loss_mask)
            else:
                noisy_inputs, masked_indices, p_mask = self.forward_process(input_ids, eps=eps, block_size=block_size, loss_mask=loss_mask)

    else:
        # Inference or autoregressive training: no corruption, any supplied mask is dropped.
        noisy_inputs = input_ids
        masked_indices = None
        p_mask = None

    if self.config.dlm_paradigm in ['block_diff', 'sbd_block_diff']:
        for layer in self.encoder.layers:
            if hasattr(layer.self_attn, 'set_attention_mode'):
                layer.self_attn.set_attention_mode(self.config.dlm_paradigm, block_size=block_size)

    input_ids_len = noisy_inputs.shape[1]
    if labels is not None and self.config.dlm_paradigm in ['block_diff', 'sbd_block_diff']:
        if position_ids is None:
            position_ids = torch.arange(input_ids_len, device=noisy_inputs.device).unsqueeze(0)
        # Encode [noisy | clean]; the two halves are split apart again after the head.
        noisy_inputs = torch.cat([noisy_inputs, input_ids], dim=1)

    if block_diff_ppl:
        if position_ids is None:
            position_ids = torch.arange(input_ids_len // 2, device=noisy_inputs.device).unsqueeze(0)

    enc_out = self.encoder(
        past_key_values=past_key_values,
        input_ids=noisy_inputs,
        attention_mask=attention_mask,
        position_ids=position_ids,
        is_training=(labels is not None) or (block_diff_ppl),
        **kwargs,
    )

    if output_last_hidden_states_only:
        return BaseModelOutput(last_hidden_state=enc_out.last_hidden_state)

    logits = self.diffusion_head(enc_out.last_hidden_state)  # (batch, len_B, vocab)
    causal_logits = None

    if labels is not None and self.config.dlm_paradigm in ['block_diff', 'sbd_block_diff']:
        if self.config.dlm_paradigm == 'sbd_block_diff':
            # Logits of the clean half feed the next-token loss below.
            causal_logits = logits[:, input_ids_len:]
        else:
            causal_logits = None

        logits = logits[:, :input_ids_len]

    loss = None
    if labels is not None and not skip_loss:
        if self.config.dlm_paradigm == 'autoregressive':
            shift_logits = logits[..., :-1, :].contiguous()
            shift_labels = labels[..., 1:].contiguous()

            if loss_mask is None:
                loss_fct = CrossEntropyLoss()
                shift_logits = shift_logits.view(-1, shift_logits.size(-1))
                shift_labels = shift_labels.view(-1)
                loss = loss_fct(shift_logits, shift_labels)

            else:
                loss_mask = loss_mask[..., 1:].contiguous()

                loss_fct = CrossEntropyLoss(reduction='none')
                shift_logits = shift_logits.view(-1, shift_logits.size(-1))
                shift_labels = shift_labels.view(-1)
                shift_labels = shift_labels.to(shift_logits.device)

                token_losses = loss_fct(shift_logits, shift_labels)

                flat_loss_mask = loss_mask.reshape(-1)
                loss = token_losses[flat_loss_mask == 1].sum() / flat_loss_mask.sum()

        else:
            # Handle DREAM vs LLADA style losses
            if hasattr(self.config, 'dlm_type') and self.config.dlm_type == 'dream':
                # DREAM style: position i predicts token i + 1, so everything is shifted by one.
                logits = logits[..., :-1, :].contiguous()
                labels = labels[..., 1:].contiguous()
                masked_indices = masked_indices[:, 1:]
                p_mask = p_mask[:, 1:]

            if self.config.ada_perm_ratio_per_block is not None:
                # Only compute loss for the top ada_perm_ratio_per_block tokens by confidence within each block
                block_size = self.config.block_size
                batch_size, seq_len = masked_indices.shape
                # Round up so a trailing partial block is scored too. Flooring
                # dropped its tokens from the loss and left nothing selected
                # (a 0/0 loss) when seq_len < block_size.
                num_blocks = (seq_len + block_size - 1) // block_size

                # Get the max logit (confidence) for each position
                confidence = logits.max(dim=-1).values.detach()  # (batch_size, seq_len)

                # Create a mask for tokens to include in loss
                selected_mask = torch.zeros_like(masked_indices, dtype=torch.bool)

                for blk in range(num_blocks):
                    start = blk * block_size
                    end = min((blk + 1) * block_size, seq_len)

                    # Get masked indices within this block
                    block_masked = masked_indices[:, start:end]  # (batch_size, block_len)
                    block_confidence = confidence[:, start:end]  # (batch_size, block_len)

                    for b in range(batch_size):
                        # Get positions that are masked in this block for this batch
                        masked_positions = torch.where(block_masked[b])[0]
                        num_masked = len(masked_positions)

                        if num_masked > 0:
                            # Number of tokens to keep (top by confidence)
                            k = min(max(1, int(block_size * self.config.ada_perm_ratio_per_block)), num_masked)

                            # Get confidence values for masked positions
                            masked_confidence = block_confidence[b, masked_positions]

                            # Get indices of top-k confident tokens
                            _, topk_indices = torch.topk(masked_confidence, k)
                            selected_positions = masked_positions[topk_indices]

                            # Mark these positions in the selected mask
                            selected_mask[b, start + selected_positions] = True

                # Calculate loss only for selected positions
                token_loss = torch.nn.functional.cross_entropy(
                    logits[selected_mask],
                    labels[selected_mask],
                    reduction='none'
                ) / p_mask[selected_mask]

                num_mask_tokens = selected_mask.sum()

            else:
                # Calculate token-wise cross entropy loss for masked positions in B
                token_loss = torch.nn.functional.cross_entropy(
                    logits[masked_indices],
                    labels[masked_indices],
                    reduction='none'
                ) / p_mask[masked_indices]

                num_mask_tokens = masked_indices.sum()

            if self.config.global_loss_avg:
                # Keep the sum; the token count is returned alongside it below.
                loss = token_loss.sum()
            else:
                loss = token_loss.sum() / num_mask_tokens

            if self.config.ada_dlm_loss_ratio is not None:
                assert self.current_iter_ratio is not None
                assert self.config.dlm_loss_weight is not None

                # Weight grows linearly with current_iter_ratio and is capped at dlm_loss_weight.
                dlm_loss_weight = min(self.config.dlm_loss_weight, self.current_iter_ratio / self.config.ada_dlm_loss_ratio * self.config.dlm_loss_weight)
                loss = dlm_loss_weight * loss

            elif self.config.dlm_loss_weight is not None:
                loss = self.config.dlm_loss_weight * loss

            if self.config.dlm_paradigm == 'sbd_block_diff':
                causal_logits = causal_logits[..., :-1, :].contiguous()
                causal_logits = causal_logits.view(-1, causal_logits.size(-1))

                if hasattr(self.config, 'dlm_type') and self.config.dlm_type == 'dream':
                    # labels were already shifted by one in the DREAM branch above.
                    causal_labels = labels.view(-1)
                else:
                    causal_labels = labels[..., 1:].contiguous().view(-1)

                if self.config.global_loss_avg:
                    loss_fct = CrossEntropyLoss(reduction='sum')
                    ar_loss = loss_fct(causal_logits, causal_labels)

                    self.loss_diffusion = loss.detach().item() / num_mask_tokens
                    self.loss_ar = ar_loss.detach().item() / seq_len

                    loss = loss + self.config.ar_loss_weight * ar_loss
                else:
                    loss_fct = CrossEntropyLoss()
                    ar_loss = loss_fct(causal_logits, causal_labels)

                    self.loss_diffusion = loss.detach().item()
                    self.loss_ar = ar_loss.detach().item()

                    loss = loss + self.config.ar_loss_weight * ar_loss

            if self.config.global_loss_avg:
                if self.config.dlm_paradigm == 'sbd_block_diff':
                    loss = (loss, num_mask_tokens + int(self.config.ar_loss_weight * seq_len))
                else:
                    loss = (loss, num_mask_tokens)

    return MinistralDiffOutputWithPast(
        loss=loss if not is_teacher else logits,
        logits=logits,
        causal_logits=causal_logits,
        past_key_values=enc_out.past_key_values,
        hidden_states=None,
        attentions=None,
    )
878
+
879
+
880
def generate(self, prompt_ids, max_new_tokens, steps, block_length, shift_logits, threshold, causal_context=True, temperature=0, eos_token_id=None):
    """Decode with ``generate_with_prefix_cache_block_diff`` using this model.

    The arguments are forwarded to the helper under its own keyword names
    (``prompt_ids`` as ``prompt``, ``max_new_tokens`` as ``gen_length``); the
    mask id comes from ``self.mask_token_id``, remasking is fixed to
    ``"low_confidence"`` and ``neg_entropy`` to False.

    Returns:
        The two values produced by the helper, ``(out_ids, nfe)``.
        NOTE(review): ``nfe`` is presumably the number of model forward calls;
        confirm against the helper.
    """
    decode_options = dict(
        model=self,
        prompt=prompt_ids,
        gen_length=max_new_tokens,
        steps=steps,
        block_length=block_length,
        remasking="low_confidence",
        temperature=temperature,
        mask_id=self.mask_token_id,
        threshold=threshold,
        shift_logits=shift_logits,
        neg_entropy=False,
        causal_context=causal_context,
        eos_token_id=eos_token_id,
    )
    token_ids, forward_calls = generate_with_prefix_cache_block_diff(**decode_options)
    return token_ids, forward_calls
898
+
899
+
900
@torch.no_grad()
def sbd_inference_diffusion_quadratic(
    self,
    clean_input_ids: Optional[torch.Tensor],
    draft_input_ids: torch.Tensor,
    block_length: int,
    draft_only: bool = False,
    past_key_values: Optional[Cache] = None,
    use_cache: bool = False,
):
    """SBD quadratic inference (injected by build_hf_tidar_repo).

    Runs one encoder pass in one of two modes and returns
    ``(logits, past_key_values)``.

    * ``draft_only=True``: encodes ``clean_input_ids`` followed by
      ``draft_input_ids`` with ``tidar_inference_mode = "default"``. With
      ``use_cache`` the returned cache is cropped to the clean length.
    * ``draft_only=False``: each of the ``block_length`` draft tokens is followed
      by ``block_length`` mask tokens, giving ``block_length * (block_length + 1)``
      draft positions, encoded with ``tidar_inference_mode = "quadratic"``. With
      ``use_cache`` the clean prefix must already be in ``past_key_values`` and
      ``clean_input_ids`` must be None.

    Side effect: sets ``use_sbd_objective``, ``block_length`` and
    ``tidar_inference_mode`` on ``self.encoder.config``.
    """
    enc_config = self.encoder.config
    enc_config.use_sbd_objective = True
    enc_config.block_length = block_length

    if draft_only:
        assert clean_input_ids is not None

        if use_cache and past_key_values is None:
            past_key_values = DynamicCache()

        enc_config.tidar_inference_mode = "default"
        input_ids = torch.cat([clean_input_ids, draft_input_ids], dim=-1)
        outputs = self.encoder(
            input_ids=input_ids,
            position_ids=None,
            past_key_values=past_key_values,
            use_cache=use_cache,
            is_training=False,
        )

        hidden_states = outputs.last_hidden_state
        logits = self.diffusion_head(hidden_states)

        past_key_values = getattr(outputs, "past_key_values", None)
        if use_cache and past_key_values is not None:
            # Keep only the clean prefix in the cache; the draft entries are dropped.
            _crop_dynamic_cache(past_key_values, clean_input_ids.shape[1])

        return logits, past_key_values
    else:
        enc_config.tidar_inference_mode = "quadratic"

        # Each draft token is followed by block_length mask tokens:
        # rows of [draft_i, MASK, ..., MASK], flattened to draft_len positions.
        draft_len = block_length * (block_length + 1)
        draft_input_ids = torch.cat(
            [
                draft_input_ids.view(-1, block_length, 1),
                torch.full(
                    (draft_input_ids.shape[0], block_length, block_length),
                    fill_value=self.config.mask_token_id,
                    device=draft_input_ids.device,
                ),
            ],
            dim=-1,
        ).view(-1, draft_len)

        if use_cache:
            assert past_key_values is not None, (
                "Past key values should be provided when using cache, e.g. run draft_only=True first."
            )
            assert clean_input_ids is None, (
                "Clean input ids should already be in cache, thus none should be provided."
            )
            clean_len = past_key_values.get_seq_length()
            input_ids = draft_input_ids
        else:
            clean_len = clean_input_ids.shape[1]
            input_ids = torch.cat([clean_input_ids, draft_input_ids], dim=-1)

        # Row i covers positions clean_len + i ... clean_len + i + block_length.
        per_block_position_ids = torch.arange(
            clean_len, clean_len + block_length + 1, device=draft_input_ids.device
        )[None,].repeat(block_length, 1)
        per_block_position_ids += torch.arange(block_length, device=draft_input_ids.device).view(-1, 1)

        if use_cache:
            position_ids = per_block_position_ids.view(-1)[None,]
        else:
            # Without a cache the clean prefix is re-encoded, so its positions come first.
            clean_position_ids = torch.arange(clean_len, device=draft_input_ids.device)
            position_ids = torch.cat([clean_position_ids, per_block_position_ids.view(-1)], dim=-1)[None,]

        outputs = self.encoder(
            input_ids=input_ids,
            position_ids=position_ids,
            past_key_values=past_key_values,
            use_cache=use_cache,
            is_training=False,
        )

        hidden_states = outputs.last_hidden_state
        logits = self.diffusion_head(hidden_states)
        past_key_values = getattr(outputs, "past_key_values", None)

        if use_cache and past_key_values is not None:
            # NOTE(review): presumably keeps only the draft-token entries of the
            # freshly appended cache; confirm against _extract_draft_kv_cache.
            _extract_draft_kv_cache(past_key_values, clean_len, block_length)

        return logits, past_key_values
995
+
996
@torch.no_grad()
def tidar_generate(
    self,
    prompt_ids: torch.Tensor,
    max_new_tokens: int = 128,
    steps: int = 128,
    block_length: int = 16,
    threshold: Optional[float] = None,
    temperature: float = 0.0,
    mask_token_id: Optional[int] = None,
    eos_token_id: Optional[int] = None,
):
    """TiDAR quadratic speculative decoding (injected by build_hf_tidar_repo).

    A first pass proposes ``block_length`` draft tokens after the prompt. Each
    later pass scores the current draft, accepts its longest prefix that agrees
    with the model's own predictions (always at least one token) and installs
    the draft for the next block.

    Args:
        prompt_ids: ``(1, prompt_len)`` token ids; only batch size 1 is supported.
        max_new_tokens: Generation budget; must be divisible by ``block_length``.
        steps: Only validated (must be divisible by
            ``max_new_tokens // block_length``); not otherwise used here.
        block_length: Number of draft tokens per pass.
        threshold: If None, the logits in slot 0 of each row overwrite slot 1;
            otherwise slots 0 and 1 are both replaced by
            ``threshold * slot0 + (1 - threshold) * slot1``.
        temperature: Logits are divided by it when positive.
        mask_token_id: Overrides ``config.mask_token_id`` when given.
        eos_token_id: Defaults to ``config.eos_token_id``; generation stops at
            the first accepted EoS token.

    Returns:
        ``(token_ids, nfe)``: prompt plus generated tokens, and the number of
        calls made to ``sbd_inference_diffusion_quadratic``.

    Raises:
        ValueError: On batch size != 1, indivisible ``max_new_tokens`` / ``steps``
            or a ``threshold`` outside ``[0, 1]``.

    Side effect: sets ``use_sbd_objective = True`` and ``dlm_paradigm = "sbd"``
    on ``self.config``; the values are not restored afterwards.
    """
    self.config.use_sbd_objective = True
    self.config.dlm_paradigm = "sbd"

    if prompt_ids.shape[0] != 1:
        raise ValueError("TiDAR quadratic decoding currently requires batch_size == 1")

    token_mask_id = mask_token_id if mask_token_id is not None else self.config.mask_token_id
    if eos_token_id is None:
        eos_token_id = getattr(self.config, "eos_token_id", None)

    # Work buffer: prompt + generation budget + two spare blocks, because a draft
    # may be written up to one block past the final accepted token.
    x = torch.full(
        (1, prompt_ids.shape[1] + max_new_tokens + block_length * 2),
        token_mask_id,
        dtype=torch.long,
        device=prompt_ids.device,
    )
    x[:, : prompt_ids.shape[1]] = prompt_ids.clone()

    if max_new_tokens % block_length != 0:
        raise ValueError("max_new_tokens must be divisible by block_length")
    num_blocks = max_new_tokens // block_length
    if steps % num_blocks != 0:
        raise ValueError("steps must be divisible by (max_new_tokens // block_length)")

    prompt_len = prompt_ids.shape[1]
    nfe = 0
    nfe += 1
    # Prefill: encode the prompt plus one block of mask tokens and cache the prompt.
    logits, past_key_values = self.sbd_inference_diffusion_quadratic(
        clean_input_ids=x[:, :prompt_len],
        draft_input_ids=x[:, prompt_len : prompt_len + block_length],
        block_length=block_length,
        draft_only=True,
        use_cache=True,
    )

    # The first draft token reuses the logits of the last prompt position
    # (written in place into the slice of `logits`).
    logits_proposal = logits[:, prompt_len - 1 : prompt_len + block_length]
    logits_proposal[:, 1] = logits_proposal[:, 0]
    logits_proposal = logits_proposal[:, 1:]
    x0_proposal = torch.argmax(logits_proposal, dim=-1)
    x[:, prompt_len : prompt_len + block_length] = x0_proposal

    total_accept_token = 0
    while True:
        nfe += 1
        block_start = prompt_len + total_accept_token
        block_end = block_start + block_length
        draft_input_ids = x[:, block_start:block_end]  # a view into x

        logits, past_key_values = self.sbd_inference_diffusion_quadratic(
            clean_input_ids=None,
            draft_input_ids=draft_input_ids,
            block_length=block_length,
            draft_only=False,
            past_key_values=past_key_values,
            use_cache=True,
        )

        # One row per draft token: slot 0 is the draft token's own position,
        # slots 1.. are the mask positions that follow it.
        useful_token_logits = logits.view(1, block_length, block_length + 1, -1)
        if threshold is None:
            useful_token_logits[:, :, 1] = useful_token_logits[:, :, 0]
        else:
            if not (0.0 <= threshold <= 1.0):
                raise ValueError("threshold must be between 0 and 1")
            mix_logits = useful_token_logits[:, :, 0] * threshold + useful_token_logits[:, :, 1] * (1 - threshold)
            useful_token_logits[:, :, 0] = mix_logits
            useful_token_logits[:, :, 1] = mix_logits

        # NOTE(review): dividing by a positive temperature does not change the
        # argmax taken next, so temperature has no effect on the chosen tokens.
        if temperature > 0:
            useful_token_logits = useful_token_logits / temperature

        useful_token_pred = torch.argmax(useful_token_logits, dim=-1)
        new_draft_input_ids = useful_token_pred[:, 0, 1:]
        accept_cnt = 1

        # Accept draft tokens while the prediction in slot 0 of the previous row
        # equals the next draft token; the next draft comes from the last accepted row.
        while accept_cnt < block_length:
            if useful_token_pred[:, accept_cnt - 1, 0].item() != draft_input_ids[:, accept_cnt].item():
                break
            new_draft_input_ids = useful_token_pred[:, accept_cnt, 1:]
            accept_cnt += 1

        x[:, block_start : block_start + accept_cnt] = draft_input_ids[:, :accept_cnt]

        # EoS early stopping: all accepted tokens are finalized left-to-right,
        # so if any is EoS we can truncate and return immediately.
        if eos_token_id is not None:
            accepted = x[0, block_start : block_start + accept_cnt]
            eos_positions = (accepted == eos_token_id).nonzero(as_tuple=True)[0]
            if len(eos_positions) > 0:
                first_eos_rel = eos_positions[0].item()
                total_accept_token += first_eos_rel + 1
                output_end = prompt_len + total_accept_token
                return x[:, :output_end], nfe

        # Install the next draft right after the accepted tokens and cut the cache back to them.
        x[:, block_start + accept_cnt : block_start + accept_cnt + block_length] = new_draft_input_ids
        past_key_values.crop(block_start + accept_cnt)
        total_accept_token += accept_cnt

        if total_accept_token >= max_new_tokens:
            break

    # Drop the two spare blocks appended to the work buffer.
    return x[:, : -(block_length * 2)], nfe
1110
+
1111
+
1112
+ __all__ = ["MinistralDiffEncoderModel", "MinistralFlexAttention"]
modeling_nemotron_labs_diffusion.py DELETED
@@ -1,870 +0,0 @@
1
- import copy
2
- from dataclasses import dataclass
3
- from typing import Optional, Tuple
4
- import numpy as np
5
-
6
- import torch
7
- import torch.nn.functional as F
8
- from torch import nn
9
- from transformers.modeling_outputs import CausalLMOutputWithPast, BaseModelOutput
10
- from transformers.utils import ModelOutput
11
-
12
- from torch.nn.attention.flex_attention import flex_attention, create_block_mask
13
-
14
- from transformers.modeling_flash_attention_utils import FlashAttentionKwargs
15
-
16
- from transformers.processing_utils import Unpack
17
-
18
- from transformers.cache_utils import Cache, DynamicCache
19
-
20
- from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
21
-
22
- from transformers.generation import GenerationMixin
23
-
24
- import math
25
-
26
- from .modeling_ministral import Ministral3Model, Ministral3PreTrainedModel, Ministral3Attention, apply_rotary_pos_emb, repeat_kv, _get_llama_4_attn_scale
27
- from .configuration_nemotron_labs_diffusion import NemotronLabsDiffusionConfig
28
-
29
- __all__ = ["NemotronLabsDiffusionModel", "NemotronLabsDiffusionFlexAttention"]
30
-
31
@dataclass
class NemotronLabsDiffusionOutputWithPast(ModelOutput):
    """Output container returned by ``NemotronLabsDiffusionModel.forward``.

    All fields default to ``None``. The model's ``forward`` in this file fills
    ``loss``, ``logits``, ``causal_logits`` and ``past_key_values`` and always
    passes ``None`` for ``hidden_states`` and ``attentions``.
    """

    # NOTE(review): forward() in this file may store a (sum_loss, token_count)
    # tuple, or the logits when is_teacher=True, in this field.
    loss: Optional[torch.FloatTensor] = None
    # Logits of the diffusion head over the input positions.
    logits: Optional[torch.FloatTensor] = None
    # Logits over the appended clean copy; only set on the block_diff training path.
    causal_logits: Optional[torch.FloatTensor] = None
    past_key_values: Optional[Cache] = None
    hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
    attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
39
-
40
-
41
# Compiled as a single full graph with static shapes; the autotuning mode
# used here does not enable CUDA graphs.
@torch.compile(fullgraph=True, mode="max-autotune-no-cudagraphs", dynamic=False)
def fused_flex_attention(q, k, v, block_mask=None):
    """Run ``flex_attention`` on ``q``, ``k``, ``v`` under ``torch.compile``.

    Args:
        q, k, v: query / key / value tensors, forwarded unchanged.
        block_mask: optional block mask forwarded to ``flex_attention``.

    Returns:
        Whatever ``flex_attention`` returns for these inputs.
    """
    return flex_attention(q, k, v, block_mask=block_mask)
44
-
45
-
46
class NemotronLabsDiffusionFlexAttention(Ministral3Attention):
    """Ministral3 attention that runs through FlexAttention with a block-diffusion mask.

    The mask treats the sequence as two equal halves: indices ``< n`` and
    indices ``>= n`` with ``n = q_len // 2`` (see ``compute_block_mask``).
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

        self.block_size = self.config.block_size
        # Cache for the most recently built mask; filled lazily by forward()
        # and reused for as long as the query length stays the same. Reset it
        # to None after changing ``self.block_size``.
        self.block_diff_mask = None

        # Raises the process-wide dynamo recompile cache limit.
        import torch._dynamo.config as dcfg
        dcfg.cache_size_limit = 512

    def compute_block_mask(self, mode, q_len, block_size=None):
        """Build the block-diffusion ``BlockMask`` for a sequence of ``q_len`` tokens.

        Args:
            mode: accepted for interface compatibility; not used by this method.
            q_len: total sequence length; the first and second halves are split
                at ``q_len // 2``.
            block_size: diffusion block size; defaults to ``self.block_size``.

        Returns:
            The mask produced by ``create_block_mask`` with
            ``Q_LEN == KV_LEN == q_len``.
        """
        if block_size is None:
            # Previously a None block size reached the floor divisions below.
            block_size = self.block_size

        def block_diff_mask(block_size, b, h, q_idx, kv_idx, n):
            # True for positions in the second half of the sequence.
            x0_flag_q = (q_idx >= n)
            x0_flag_kv = (kv_idx >= n)

            # Compute block indices relative to the start of each half.
            block_q = torch.where(x0_flag_q == 1,
                                  (q_idx - n) // block_size,
                                  q_idx // block_size)
            block_kv = torch.where(x0_flag_kv == 1,
                                   (kv_idx - n) // block_size,
                                   kv_idx // block_size)

            # 1. Block-diagonal: first-half queries see first-half keys of the same block.
            block_diagonal = (block_q == block_kv) & (x0_flag_kv == 0) & (x0_flag_q == 0)

            # 2. Offset block-causal: first-half queries see second-half keys
            #    of strictly earlier blocks.
            offset_block_causal = (
                (block_q > block_kv)
                & (x0_flag_kv == 1)
                & (x0_flag_q == 0)
            )

            # 3. Token-level causal attention inside the second half.
            fully_causal = (q_idx >= kv_idx) & (x0_flag_kv == 1) & (x0_flag_q == 1)

            # 4. Union of the three patterns.
            return block_diagonal | offset_block_causal | fully_causal

        attn_mask = lambda b, h, q, kv: block_diff_mask(block_size, b, h, q, kv, q_len // 2)

        block_mask = create_block_mask(
            attn_mask, B=None, H=None, Q_LEN=q_len, KV_LEN=q_len
        )

        return block_mask

    def forward(
        self,
        hidden_states: torch.Tensor,
        position_embeddings: Tuple[torch.Tensor, torch.Tensor],
        attention_mask: Optional[torch.Tensor],
        past_key_values: Optional[Cache] = None,
        cache_position: Optional[torch.LongTensor] = None,
        is_training: bool = True,
        **kwargs: Unpack[FlashAttentionKwargs],
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
        """Attention forward pass using the block-diffusion mask.

        Args:
            hidden_states: input of shape ``(batch, q_len, hidden)``.
            position_embeddings: ``(cos, sin)`` tensors for rotary embedding.
            attention_mask: accepted for interface compatibility; not used here.
            past_key_values: optional cache, updated with this layer's keys/values.
            cache_position: forwarded to the attention-scale helper and the cache.
            is_training: when True, queries and keys are split into two halves
                along the sequence axis and rotary embedding is applied to each
                half with the same ``cos``/``sin``.

        Returns:
            ``(attn_output, None)``; attention weights are never returned.
        """
        _, q_len, _ = hidden_states.size()
        input_shape = hidden_states.shape[:-1]
        hidden_shape = (*input_shape, -1, self.head_dim)

        query_states = self.q_proj(hidden_states).view(hidden_shape).transpose(1, 2)
        key_states = self.k_proj(hidden_states).view(hidden_shape).transpose(1, 2)
        value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2)

        cos, sin = position_embeddings

        if is_training:
            # Split query and key states in half along the sequence dimension.
            q1, q2 = query_states.chunk(2, dim=2)
            k1, k2 = key_states.chunk(2, dim=2)

            # Both halves receive the same rotary positions.
            q1, k1 = apply_rotary_pos_emb(q1, k1, cos, sin)
            q2, k2 = apply_rotary_pos_emb(q2, k2, cos, sin)

            query_states = torch.cat([q1, q2], dim=2)
            key_states = torch.cat([k1, k2], dim=2)
        else:
            query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)

        query_states = query_states * _get_llama_4_attn_scale(
            cache_position,
            self.config.rope_parameters.get("llama_4_scaling_beta"),
            self.config.rope_parameters.get("original_max_position_embeddings"),
        ).to(query_states.dtype)

        if past_key_values is not None:
            # sin and cos are specific to RoPE models; cache_position needed for the static cache
            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
            key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs)

        key_states = repeat_kv(key_states, self.num_key_value_groups)
        value_states = repeat_kv(value_states, self.num_key_value_groups)

        block_mask = self.block_diff_mask
        if block_mask is None or q_len != block_mask.shape[-2]:
            block_mask = self.compute_block_mask(mode='block_diff', block_size=self.block_size, q_len=q_len)
            # Store the mask so the next call with the same q_len reuses it
            # instead of rebuilding it from scratch.
            self.block_diff_mask = block_mask

        attn_output = fused_flex_attention(query_states, key_states, value_states, block_mask=block_mask)
        attn_output = attn_output.transpose(1, 2).reshape(*input_shape, -1).contiguous()

        attn_output = self.o_proj(attn_output)

        return attn_output, None
155
-
156
-
157
- class NemotronLabsDiffusionModel(Ministral3PreTrainedModel, GenerationMixin):
158
- """
159
- A single model with:
160
- - a bidirectional encoder + diffusion‐LM head over A
161
- - a causal decoder + LM head over B, conditioned on F_A
162
- """
163
-
164
def __init__(self, config: NemotronLabsDiffusionConfig):
    """Build the encoder and the output head for the configured paradigm.

    ``config.dlm_paradigm`` selects the attention class placed on the encoder
    config: the flex-attention variant for ``'block_diff'``, the stock
    ``Ministral3Attention`` for ``'bidirectional'`` and ``'autoregressive'``.
    ``diffusion_lm`` is set to False only for ``'autoregressive'``.

    Raises:
        ValueError: if ``config.dlm_paradigm`` is none of the three values.
    """
    super().__init__(config)
    self.mask_token_id = config.mask_token_id

    paradigm = config.dlm_paradigm
    # The encoder gets its own copy so the caller's config is left untouched.
    encoder_config = copy.deepcopy(config)

    if paradigm == 'block_diff':
        attn_class = NemotronLabsDiffusionFlexAttention
    elif paradigm == 'bidirectional' or paradigm == 'autoregressive':
        attn_class = Ministral3Attention
    else:
        raise ValueError(f"Unsupported DLM paradigm: {config.dlm_paradigm}")

    encoder_config.diffusion_lm = paradigm != 'autoregressive'
    encoder_config.attn_class = attn_class

    self.encoder = Ministral3Model(encoder_config)
    self.diffusion_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
    self.vocab_size = config.vocab_size

    self.post_init()
186
-
187
-
188
def get_input_embeddings(self):
    """Return the ``embed_tokens`` attribute of the encoder."""
    encoder = self.encoder
    return encoder.embed_tokens
190
-
191
def set_input_embeddings(self, value):
    """Replace the encoder's ``embed_tokens`` attribute with ``value``."""
    setattr(self.encoder, "embed_tokens", value)
193
-
194
def get_output_embeddings(self):
    """Return the output projection stored as ``diffusion_head``."""
    head = self.diffusion_head
    return head
196
-
197
def set_output_embeddings(self, new_embeddings):
    """Replace the ``diffusion_head`` attribute with ``new_embeddings``."""
    setattr(self, "diffusion_head", new_embeddings)
199
-
200
-
201
def forward_process(self, input_ids, eps=1e-3, block_size=None, loss_mask=None):
    """Randomly replace tokens of ``input_ids`` with the mask token.

    One masking probability is drawn per row as ``(1 - eps) * t + eps`` with
    ``t`` uniform in [0, 1), then every position is masked independently with
    that probability.

    Args:
        input_ids: integer tensor of shape ``(batch, length)``.
        eps: lower bound of the per-row masking probability.
        block_size: accepted for interface compatibility; not used here.
        loss_mask: optional tensor; positions where it equals 0 are never masked.

    Returns:
        ``(noisy_batch, masked_indices, p_mask)``: the corrupted ids, the bool
        mask of replaced positions, and the ``(batch, length)`` probabilities.
    """
    n_rows, n_cols = input_ids.shape
    device = input_ids.device

    rng = None
    if self.config.dp_varying_mask_ratio:
        # Give each data-parallel rank its own seed for the ratio draw.
        import torch.distributed as dist
        dp_rank = 0
        if dist.is_initialized():
            try:
                dp_rank = dist.get_rank()
            except Exception:
                dp_rank = 0
        # NOTE(review): torch.seed() itself reseeds the global RNG; confirm
        # that is acceptable given the intent to keep global state untouched.
        rng = torch.Generator(device=device)
        rng.manual_seed(torch.seed() + dp_rank)

    ratio = torch.rand(n_rows, device=device, generator=rng)

    # Per-row probability, broadcast over the sequence axis: (b,) -> (b, l).
    p_mask = ((1 - eps) * ratio + eps).unsqueeze(1).expand(-1, n_cols)

    # The per-position draw uses the global RNG, not the local generator.
    masked_indices = torch.rand((n_rows, n_cols), device=device) < p_mask

    if loss_mask is not None:
        masked_indices[loss_mask == 0] = False

    noisy_batch = torch.where(masked_indices, self.mask_token_id, input_ids)
    return noisy_batch, masked_indices, p_mask
233
-
234
-
235
def forward(
    self,
    input_ids: torch.LongTensor,
    attention_mask: Optional[torch.Tensor] = None,
    position_ids: Optional[torch.LongTensor] = None,
    labels: Optional[torch.LongTensor] = None,
    split_len: Optional[int] = None,
    past_key_values: Optional[Cache] = None,
    block_size: Optional[int] = None,
    eps: float = 1e-3,
    is_teacher: bool = False,
    masked_indices: Optional[torch.Tensor] = None,
    p_mask: Optional[torch.Tensor] = None,
    teacher_logits: Optional[torch.Tensor] = None,
    masked_indices_teacher: Optional[torch.Tensor] = None,
    loss_mask: Optional[torch.Tensor] = None,
    ce_loss_weight: float = 1.0,
    output_last_hidden_states_only: bool = False,
    skip_loss: bool = False,
    **kwargs,
) -> CausalLMOutputWithPast:
    """Run the encoder and head, optionally masking inputs and computing the loss.

    When ``labels`` is given and the paradigm is not ``'autoregressive'``, the
    inputs are corrupted with the mask token (using ``masked_indices`` if
    supplied, otherwise ``forward_process``). For ``'block_diff'`` training the
    clean ``input_ids`` are appended after the noisy ones before encoding.

    Args:
        input_ids: token ids of shape ``(batch, seq_len)``.
        attention_mask, position_ids, past_key_values: forwarded to the encoder.
        labels: targets; their presence switches on the training path.
        block_size: forwarded to ``forward_process``.
        eps: minimum masking probability for ``forward_process``.
        is_teacher: when True, the ``loss`` field of the output holds the logits.
        masked_indices: optional precomputed bool mask; modified in place when
            ``loss_mask`` is also given.
        p_mask: masking probabilities used to reweight the diffusion loss.
        loss_mask: positions equal to 0 are excluded from masking / loss.
        output_last_hidden_states_only: return a ``BaseModelOutput`` with only
            the last hidden state, skipping the head.
        skip_loss: skip loss computation even when ``labels`` is given.
        **kwargs: forwarded to the encoder.

        ``split_len``, ``teacher_logits``, ``masked_indices_teacher`` and
        ``ce_loss_weight`` are accepted but not referenced in this method.

    Returns:
        A ``NemotronLabsDiffusionOutputWithPast`` (or ``BaseModelOutput``, see
        above). On the diffusion paths ``loss`` is a ``(sum_loss, token_count)``
        tuple; on the autoregressive path it is a scalar.

    Raises:
        ValueError: if ``config.dlm_paradigm`` is not a known paradigm.
    """

    batch_size, seq_len = input_ids.shape

    if self.config.dlm_paradigm == 'block_diff':
        if labels is not None and block_size is None:
            block_size = self.config.block_size
    elif self.config.dlm_paradigm not in ('bidirectional', 'autoregressive'):
        raise ValueError(f"Unknown dLM paradigm: {self.config.dlm_paradigm}")

    if labels is not None and self.config.dlm_paradigm != 'autoregressive':
        if masked_indices is not None:
            # NOTE(review): p_mask is not validated here but is indexed by the
            # diffusion loss below; callers supplying masked_indices presumably
            # supply p_mask too.

            if loss_mask is not None:
                # In-place edit of the caller's mask tensor.
                masked_indices[loss_mask == 0] = 0

            noisy_inputs = torch.where(masked_indices, self.mask_token_id, input_ids)

        else:
            noisy_inputs, masked_indices, p_mask = self.forward_process(input_ids, eps=eps, block_size=block_size, loss_mask=loss_mask)

    else:
        # Inference, or autoregressive training: feed the ids unchanged.
        noisy_inputs = input_ids
        masked_indices = None
        p_mask = None

    input_ids_len = noisy_inputs.shape[1]
    if labels is not None and self.config.dlm_paradigm == 'block_diff':
        if position_ids is None:
            # Positions cover one copy only; the sequence is doubled below.
            position_ids = torch.arange(input_ids_len, device=noisy_inputs.device).unsqueeze(0)
        # Noisy half first, clean half second.
        noisy_inputs = torch.cat([noisy_inputs, input_ids], dim=1)

    enc_out = self.encoder(
        past_key_values=past_key_values,
        input_ids=noisy_inputs,
        attention_mask=attention_mask,
        position_ids=position_ids,
        is_training=(labels is not None),
        **kwargs,
    )

    if output_last_hidden_states_only:
        return BaseModelOutput(last_hidden_state=enc_out.last_hidden_state)

    logits = self.diffusion_head(enc_out.last_hidden_state)  # (batch, len_B, vocab)
    causal_logits = None

    if labels is not None and self.config.dlm_paradigm == 'block_diff':
        # Split the doubled sequence back: clean-half logits vs noisy-half logits.
        causal_logits = logits[:, input_ids_len:]
        logits = logits[:, :input_ids_len]

    loss = None
    if labels is not None and not skip_loss:
        if self.config.dlm_paradigm == 'autoregressive':
            # Standard next-token objective: position i predicts label i + 1.
            shift_logits = logits[..., :-1, :].contiguous()
            shift_labels = labels[..., 1:].contiguous()

            if loss_mask is None:
                loss_fct = CrossEntropyLoss()
                shift_logits = shift_logits.view(-1, shift_logits.size(-1))
                shift_labels = shift_labels.view(-1)
                loss = loss_fct(shift_logits, shift_labels)

            else:
                loss_mask = loss_mask[..., 1:].contiguous()

                loss_fct = CrossEntropyLoss(reduction='none')
                shift_logits = shift_logits.view(-1, shift_logits.size(-1))
                shift_labels = shift_labels.view(-1)
                shift_labels = shift_labels.to(shift_logits.device)

                token_losses = loss_fct(shift_logits, shift_labels)

                # Mean over the positions kept by the (shifted) loss mask.
                flat_loss_mask = loss_mask.reshape(-1)
                loss = token_losses[flat_loss_mask == 1].sum() / flat_loss_mask.sum()

        else:
            # LLaDA-style diffusion loss on masked positions.
            # Token-wise cross entropy on masked positions, divided by the
            # masking probability of each position.
            token_loss = torch.nn.functional.cross_entropy(
                logits[masked_indices],
                labels[masked_indices],
                reduction='none'
            ) / p_mask[masked_indices]

            num_mask_tokens = masked_indices.sum()

            # global_loss_avg=True: loss is reduced externally by global token count.
            loss = token_loss.sum()

            if self.config.dlm_loss_weight is not None:
                loss = self.config.dlm_loss_weight * loss

            if self.config.dlm_paradigm == 'block_diff':
                # AR-side loss for block-diffusion paradigm.
                causal_logits = causal_logits[..., :-1, :].contiguous()
                causal_logits = causal_logits.view(-1, causal_logits.size(-1))
                causal_labels = labels[..., 1:].contiguous().view(-1)

                loss_fct = CrossEntropyLoss(reduction='sum')
                ar_loss = loss_fct(causal_logits, causal_labels)

                # Per-token values kept on the module for later inspection.
                self.loss_diffusion = loss.detach().item() / num_mask_tokens
                self.loss_ar = ar_loss.detach().item() / seq_len

                loss = loss + self.config.ar_loss_weight * ar_loss

            # global_loss_avg=True: return (sum_loss, token_count) for external mean.
            if self.config.dlm_paradigm == 'block_diff':
                loss = (loss, num_mask_tokens + int(self.config.ar_loss_weight * seq_len))
            else:
                loss = (loss, num_mask_tokens)

    return NemotronLabsDiffusionOutputWithPast(
        loss=loss if not is_teacher else logits,
        logits=logits,
        causal_logits=causal_logits,
        past_key_values=enc_out.past_key_values,
        hidden_states=None,
        attentions=None,
    )
377
-
378
-
379
@torch.no_grad()
def generate(
    self,
    prompt_ids: torch.Tensor,
    max_new_tokens: int,
    block_length: int,
    threshold: Optional[float] = None,
    causal_context: bool = True,
    temperature: float = 0.0,
    eos_token_id: Optional[int] = None,
    max_thinking_tokens: Optional[int] = None,
    end_think_token_id: Optional[int] = None,
):
    """Block-wise diffusion decoding with prefix-cached KV (LLaDA-style).

    Each block: append `block_length` mask tokens, then iteratively unmask
    by confidence top-k (with optional threshold). When `causal_context`,
    the KV cache and the next-block seed are produced via a causal forward
    between blocks (flipping `self_attn.diffusion_lm`), matching the AR
    objective at block boundaries.

    Args:
        prompt_ids: prompt token ids of shape ``(batch, prompt_len)``.
        max_new_tokens: number of tokens to generate; must be a multiple of
            ``block_length``.
        block_length: number of tokens decoded per block.
        threshold: optional confidence threshold forwarded to
            ``_get_transfer_index``.
        causal_context: run the prefill / post-block passes in causal mode and
            seed each block's first token from them.
        temperature: sampling temperature; ``0`` selects argmax.
        eos_token_id: end-of-sequence id; defaults to ``config.eos_token_id``.
        max_thinking_tokens, end_think_token_id: when both are given, an
            end-think token is written into the first block that crosses the
            budget for rows that have not produced one yet.

    Returns (output_ids, nfe) — output_ids includes the prompt.
    """
    if eos_token_id is None:
        eos_token_id = getattr(self.config, "eos_token_id", None)
    mask_id = self.mask_token_id

    x_accum = prompt_ids.clone()
    B = prompt_ids.shape[0]

    # NOTE(review): this check disappears under `python -O`.
    assert max_new_tokens % block_length == 0
    num_blocks = max_new_tokens // block_length
    # one denoising step per generated token (matches legacy chat_utils call)
    steps_per_block = block_length

    # Count of model forward passes issued inside the block loop.
    nfe = 0

    def _set_diffusion_lm(val: bool):
        # Flip the flag on every attention layer that exposes it.
        for layer in self.encoder.layers:
            if hasattr(layer.self_attn, "diffusion_lm"):
                layer.self_attn.diffusion_lm = val

    # Initial causal prefill produces the KV cache and the next-block seed.
    if causal_context:
        _set_diffusion_lm(False)
    output = self(prompt_ids, use_cache=True, use_causal_mask=causal_context)
    past_key_values = output.past_key_values
    if causal_context:
        _set_diffusion_lm(True)

    next_token = None
    if causal_context:
        last_logit = output.logits[:, -1, :]
        if temperature > 0:
            next_token = torch.multinomial(torch.softmax(last_logit / temperature, dim=-1), num_samples=1)
        else:
            next_token = torch.argmax(last_logit, dim=-1, keepdim=True)

    for num_block in range(num_blocks):
        mask_block = torch.full(
            (B, block_length), mask_id, dtype=prompt_ids.dtype, device=prompt_ids.device,
        )
        if causal_context:
            # First slot of the block is the token sampled by the causal pass.
            mask_block[:, 0] = next_token[:, 0]

        x_accum = torch.cat([x_accum, mask_block], dim=1)
        block_start = prompt_ids.size(1) + num_block * block_length
        block_slice = slice(block_start, block_start + block_length)

        # Thinking-budget enforcement: if we've passed max_thinking_tokens
        # without an end-think marker, inject one into this block.
        if end_think_token_id is not None and max_thinking_tokens is not None:
            tokens_before = num_block * block_length
            tokens_after = tokens_before + block_length
            if tokens_after > max_thinking_tokens:
                gen_so_far = x_accum[:, prompt_ids.size(1):block_start]
                has_end_think = (
                    (gen_so_far == end_think_token_id).any(dim=1)
                    if gen_so_far.size(1) > 0
                    else torch.zeros(B, dtype=torch.bool, device=prompt_ids.device)
                )
                if not has_end_think.all():
                    # Position inside this block where the budget runs out
                    # (0 when the budget was already exceeded earlier).
                    offset = max(0, max_thinking_tokens - tokens_before)
                    inject_pos = block_start + offset
                    for b in range(B):
                        if not has_end_think[b]:
                            x_accum[b, inject_pos] = end_think_token_id

        # Schedule computed from the positions still masked after seeding/injection.
        mask_block_idx0 = x_accum[:, block_slice] == mask_id
        num_transfer_tokens = _get_num_transfer_tokens(mask_block_idx0, steps_per_block)

        # Denoise the current block by repeated confidence-based unmasking.
        for i in range(steps_per_block):
            mask_block_idx = x_accum[:, block_slice] == mask_id
            if mask_block_idx.sum() == 0:
                break

            nfe += 1
            # The cache is read but not extended during denoising.
            logits_block = self(
                x_accum[:, block_slice],
                past_key_values=past_key_values,
                use_cache=False,
            ).logits

            x0, transfer_idx = _get_transfer_index(
                logits_block, temperature, mask_block_idx, x_accum[:, block_slice],
                num_transfer_tokens=num_transfer_tokens[:, i], threshold=threshold,
            )
            cur = x_accum[:, block_slice].clone()
            cur[transfer_idx] = x0[transfer_idx]
            x_accum[:, block_slice] = cur

            if eos_token_id is not None:
                block_tokens = x_accum[:, block_slice]
                eos_mask = block_tokens == eos_token_id
                if eos_mask.any(dim=1).any():
                    # Stop denoising once some row has an EoS with no mask
                    # token left in front of it inside this block.
                    after_eos = eos_mask.cumsum(dim=1).bool()
                    mask_before = (block_tokens == mask_id) & ~after_eos
                    if (eos_mask.any(dim=1) & ~mask_before.any(dim=1)).any():
                        break

        # Post-block: causal forward over the block to update the KV cache
        # and (when causal_context) sample the seed for the next block.
        if causal_context:
            _set_diffusion_lm(False)
        output = self(
            x_accum[:, block_slice],
            past_key_values=past_key_values,
            use_cache=True,
            use_causal_mask=causal_context,
        )
        past_key_values = output.past_key_values
        nfe += 1

        if causal_context:
            _set_diffusion_lm(True)
            last_logit = output.logits[:, -1, :]
            if temperature > 0:
                next_token = torch.multinomial(torch.softmax(last_logit / temperature, dim=-1), num_samples=1)
            else:
                next_token = torch.argmax(last_logit, dim=-1, keepdim=True)

        if eos_token_id is not None:
            gen_so_far = x_accum[:, prompt_ids.size(1):]
            is_eos = gen_so_far == eos_token_id
            if is_eos.any(dim=1).all():
                # Every row has finished: cut at the latest first-EoS position.
                first_eos = is_eos.to(torch.int64).argmax(dim=1)
                max_eos = first_eos.max().item()
                return x_accum[:, : prompt_ids.size(1) + max_eos + 1], nfe

    return x_accum, nfe
530
-
531
-
532
-
533
@torch.no_grad()
def ar_generate(
    self,
    prompt_ids: torch.Tensor,
    max_new_tokens: int = 128,
    temperature: float = 0.0,
    eos_token_id: Optional[int] = None,
    max_thinking_tokens: Optional[int] = None,
    end_think_token_id: Optional[int] = None,
) -> tuple:
    """Token-by-token autoregressive decoding driven directly through the encoder.

    The model's own ``forward`` is bypassed: ``self.encoder`` is called with
    explicit ``position_ids``, ``cache_position`` and ``use_cache=True``, and
    ``self.diffusion_head`` is applied to the last hidden state. Attention
    layers exposing ``diffusion_lm`` are switched to ``False`` first and are
    left that way.

    Args:
        prompt_ids: prompt token ids of shape ``(batch, prompt_len)``.
        max_new_tokens: maximum number of tokens to append.
        temperature: sampling temperature; ``0`` selects argmax.
        eos_token_id: end-of-sequence id; defaults to ``config.eos_token_id``.
            Decoding stops once every row emits it in the same step.
        max_thinking_tokens, end_think_token_id: when both are given, from step
            ``max_thinking_tokens`` onward any row that has not yet emitted the
            end-think token has its next token overwritten with it.

    Returns:
        (output_ids, nfe) where output_ids includes the prompt.
    """
    for block in self.encoder.layers:
        attn = block.self_attn
        if hasattr(attn, 'diffusion_lm'):
            attn.diffusion_lm = False

    if eos_token_id is None:
        eos_token_id = getattr(self.config, 'eos_token_id', None)

    device = prompt_ids.device
    batch_size, prompt_len = prompt_ids.shape
    enforce_budget = end_think_token_id is not None and max_thinking_tokens is not None

    def _encode(tokens, positions, cache):
        # One encoder pass; returns the updated cache and last-position logits.
        out = self.encoder(
            input_ids=tokens,
            position_ids=positions.unsqueeze(0).expand(batch_size, -1),
            past_key_values=cache,
            use_cache=True,
            cache_position=positions,
        )
        last_logit = self.diffusion_head(out.last_hidden_state[:, -1:, :]).squeeze(1)
        return out.past_key_values, last_logit

    past_key_values, next_logit = _encode(
        prompt_ids, torch.arange(prompt_len, device=device), DynamicCache()
    )

    emitted = []
    nfe = 0
    final_step = max_new_tokens - 1

    for step in range(max_new_tokens):
        nfe += 1

        if temperature > 0:
            next_token = torch.multinomial(
                torch.softmax(next_logit / temperature, dim=-1), num_samples=1
            )
        else:
            next_token = torch.argmax(next_logit, dim=-1, keepdim=True)

        # ---- thinking budget enforcement ----
        if enforce_budget and step >= max_thinking_tokens:
            if emitted:
                closed = (torch.cat(emitted, dim=1) == end_think_token_id).any(dim=1)
            else:
                closed = torch.zeros(batch_size, dtype=torch.bool, device=device)
            next_token[~closed] = end_think_token_id

        emitted.append(next_token)

        if eos_token_id is not None and (next_token == eos_token_id).all():
            break

        # No forward pass is needed after the final token.
        if step < final_step:
            past_key_values, next_logit = _encode(
                next_token,
                torch.tensor([prompt_len + step], device=device),
                past_key_values,
            )

    output_ids = torch.cat([prompt_ids, torch.cat(emitted, dim=1)], dim=1)
    return output_ids, nfe
624
-
625
-
626
@torch.no_grad()
def linear_spec_generate(
    self,
    prompt_ids: torch.Tensor,
    max_new_tokens: int = 128,
    block_length: int = 32,
    temperature: float = 0.0,
    mask_token_id: Optional[int] = None,
    eos_token_id: Optional[int] = None,
    max_thinking_tokens: Optional[int] = None,
    end_think_token_id: Optional[int] = None,
    threshold: float = 0.0,
):
    """Linear speculative decoding: diffusion draft + AR verify.

    Each iteration: (1) draft the next block under bidirectional attention,
    (2) verify the drafted block under causal attention, accept the longest
    prefix where draft matches AR + one bonus token, advance the KV cache.

    LoRA-aware: when a PEFT adapter is attached to the model (e.g.
    ``linear_spec_lora``), it is toggled ON for the bidirectional draft
    phase and OFF for the causal prefill / verify phases — so the adapter
    only specializes the diffusion-mode forward and AR semantics are
    preserved. With no adapter loaded the calls are no-ops.

    Args:
        prompt_ids: prompt token ids of shape ``(1, prompt_len)``.
        max_new_tokens: decoding stops once at least this many tokens were
            emitted; the last accepted block is not truncated.
        block_length: number of positions drafted per iteration.
        temperature: sampling temperature; ``0`` selects argmax.
        mask_token_id: mask id; defaults to ``config.mask_token_id``.
        eos_token_id: end-of-sequence id; defaults to ``config.eos_token_id``.
        max_thinking_tokens, end_think_token_id: when both are given and the
            budget is exceeded without an end-think token, the last emitted
            token is replaced by end-think, which also seeds the next block.
        threshold: when ``> 0`` the draft is refined iteratively, committing
            positions whose confidence reaches the threshold; otherwise all
            masked positions are filled in a single pass.

    Returns ``(output_ids, nfe)`` — ``output_ids`` includes the prompt.

    Raises:
        ValueError: if the batch size is not 1.
    """
    if prompt_ids.shape[0] != 1:
        raise ValueError("Linear speculative decoding requires batch_size == 1")

    token_mask_id = mask_token_id if mask_token_id is not None else self.config.mask_token_id
    if eos_token_id is None:
        eos_token_id = getattr(self.config, "eos_token_id", None)

    device = prompt_ids.device

    def _set_diffusion_lm(val: bool):
        for layer in self.encoder.layers:
            if hasattr(layer.self_attn, "diffusion_lm"):
                layer.self_attn.diffusion_lm = val

    def _toggle_adapters(enable: bool):
        # No-op when no PEFT/LoRA modules are attached.
        for module in self.modules():
            if hasattr(module, "_disable_adapters"):
                module._disable_adapters = not enable

    # Prefill (causal, LoRA OFF).
    _set_diffusion_lm(False)
    _toggle_adapters(False)
    enc_out = self.encoder(
        input_ids=prompt_ids,
        past_key_values=DynamicCache(),
        use_cache=True,
        use_causal_mask=True,
    )
    past_key_values = enc_out.past_key_values
    last_logit = self.diffusion_head(enc_out.last_hidden_state[:, -1:, :]).squeeze(1)
    nfe = 1

    if temperature > 0:
        next_token = torch.multinomial(torch.softmax(last_logit / temperature, dim=-1), num_samples=1)
    else:
        next_token = torch.argmax(last_logit, dim=-1, keepdim=True)

    if eos_token_id is not None and next_token.item() == eos_token_id:
        return torch.cat([prompt_ids, next_token], dim=1), nfe

    # Invariant: the last token in `generated` is the seed of the next block
    # and is the only emitted token not yet present in the KV cache.
    generated = [next_token]
    total_gen = 1

    while total_gen < max_new_tokens:
        cache_len = past_key_values.get_seq_length()

        block = torch.full((1, block_length), token_mask_id, dtype=torch.long, device=device)
        block[0, 0] = next_token.item()

        # Draft phase (bidirectional, LoRA ON) — iterate at threshold>0 so
        # that even low-confidence blocks make progress.
        _set_diffusion_lm(True)
        _toggle_adapters(True)
        while True:
            is_mask = block == token_mask_id
            if not is_mask.any():
                break

            enc_out = self.encoder(input_ids=block, past_key_values=past_key_values, use_cache=False)
            nfe += 1

            draft_logits = self.diffusion_head(enc_out.last_hidden_state)
            # LLaDA: logit[i] directly predicts position i — no shift needed.

            if temperature > 0:
                draft_probs = torch.softmax(draft_logits / temperature, dim=-1)
                draft_tokens = torch.multinomial(
                    draft_probs.view(-1, draft_probs.shape[-1]), num_samples=1
                ).view(1, block_length)
            else:
                draft_tokens = draft_logits.argmax(dim=-1)
                draft_probs = torch.softmax(draft_logits, dim=-1)

            if threshold > 0:
                draft_conf = torch.gather(draft_probs, -1, draft_tokens.unsqueeze(-1)).squeeze(-1)
                # Already-committed positions can never be selected again.
                draft_conf = torch.where(is_mask, draft_conf, -torch.inf)
                unmask = draft_conf >= threshold
                # Force progress even when every masked position is below threshold.
                if not unmask.any():
                    best_idx = draft_conf.view(-1).argmax()
                    unmask = torch.zeros_like(is_mask, dtype=torch.bool)
                    unmask.view(-1)[best_idx] = True
                block[unmask] = draft_tokens[unmask]
            else:
                block[is_mask] = draft_tokens[is_mask]
                break

        # Verify phase (causal, LoRA OFF).
        _set_diffusion_lm(False)
        _toggle_adapters(False)
        enc_out = self.encoder(
            input_ids=block,
            past_key_values=past_key_values,
            use_cache=True,
            use_causal_mask=True,
        )
        past_key_values = enc_out.past_key_values
        nfe += 1

        verify_logits = self.diffusion_head(enc_out.last_hidden_state)
        if temperature > 0:
            ar_tokens = torch.multinomial(
                torch.softmax(verify_logits / temperature, dim=-1).view(-1, verify_logits.shape[-1]),
                num_samples=1,
            ).view(1, block_length)
        else:
            ar_tokens = verify_logits.argmax(dim=-1)

        # Accept consecutive matches; AR also gives one bonus token at the tail.
        accepted = 0
        for i in range(block_length - 1):
            if ar_tokens[0, i].item() == block[0, i + 1].item():
                accepted += 1
            else:
                break
        accepted += 1

        accepted_toks = ar_tokens[:, :accepted]
        generated.append(accepted_toks)
        total_gen += accepted

        # Drop cache entries written for rejected draft positions.
        _crop_dynamic_cache(past_key_values, cache_len + accepted)
        next_token = ar_tokens[:, accepted - 1 : accepted]

        if eos_token_id is not None:
            eos_pos = (accepted_toks[0] == eos_token_id).nonzero(as_tuple=True)[0]
            if len(eos_pos) > 0:
                first_eos = eos_pos[0].item()
                generated[-1] = accepted_toks[:, : first_eos + 1]
                total_gen = total_gen - accepted + first_eos + 1
                break

        # Thinking-budget enforcement: force end-think as next seed if budget exhausted.
        if end_think_token_id is not None and max_thinking_tokens is not None:
            if total_gen > max_thinking_tokens:
                all_gen = torch.cat(generated, dim=1)
                if not (all_gen == end_think_token_id).any():
                    next_token = torch.tensor(
                        [[end_think_token_id]], dtype=accepted_toks.dtype, device=device
                    )
                    # The bonus token is not in the cache yet, so swap it for
                    # the forced seed. Without this the returned ids would miss
                    # the end-think token the model actually conditions on, and
                    # the token would be re-forced after every later block.
                    generated[-1] = torch.cat([accepted_toks[:, :-1], next_token], dim=1)

        if total_gen >= max_new_tokens:
            break

    all_generated = torch.cat(generated, dim=1)
    output_ids = torch.cat([prompt_ids, all_generated], dim=1)
    return output_ids, nfe
799
-
800
-
801
- # ─── Module-level helpers used by `generate` and `linear_spec_generate` ──
802
-
803
- def _crop_dynamic_cache(past_key_values: DynamicCache, max_length: int):
804
- """Crop a DynamicCache to max_length, compatible with both old and new transformers."""
805
- if hasattr(past_key_values, 'crop'):
806
- past_key_values.crop(max_length)
807
- else:
808
- for layer_idx in range(len(past_key_values)):
809
- past_key_values.key_cache[layer_idx] = past_key_values.key_cache[layer_idx][:, :, :max_length]
810
- past_key_values.value_cache[layer_idx] = past_key_values.value_cache[layer_idx][:, :, :max_length]
811
- past_key_values._seen_tokens = max_length
812
-
813
-
814
- def _add_gumbel_noise(logits, temperature):
815
- """Gumbel-max sampling in float64 (low-precision Gumbel hurts MDM quality)."""
816
- if temperature == 0:
817
- return logits
818
- logits = logits.to(torch.float64)
819
- noise = torch.rand_like(logits, dtype=torch.float64)
820
- gumbel_noise = (- torch.log(noise)) ** temperature
821
- return logits.exp() / gumbel_noise
822
-
823
-
824
- def _get_num_transfer_tokens(mask_index, steps: int):
825
- """Even split of masked positions across `steps`, with remainder front-loaded."""
826
- mask_num = mask_index.sum(dim=1, keepdim=True)
827
- base = mask_num // steps
828
- remainder = mask_num % steps
829
- num_transfer_tokens = torch.zeros(mask_num.size(0), steps, device=mask_index.device, dtype=torch.int64) + base
830
- for i in range(mask_num.size(0)):
831
- num_transfer_tokens[i, : int(remainder[i])] += 1
832
- return num_transfer_tokens
833
-
834
-
835
def _get_transfer_index(logits, temperature, mask_index, x, num_transfer_tokens, threshold=None):
    """Pick which masked positions to commit this denoising step.

    Args:
        logits: model logits; the last axis is the vocabulary.
        temperature: passed to ``_add_gumbel_noise``; ``0`` means plain argmax.
        mask_index: bool tensor marking positions that are still masked.
        x: current tokens, kept at positions that are not masked.
        num_transfer_tokens: per-row number of positions to commit; replaced
            by the per-row count of masked positions when ``threshold`` is set.
        threshold: optional minimum confidence; see below.

    Returns (x0, transfer_index): x0 is argmax tokens (clamped to original x at
    non-masked positions); transfer_index is a bool mask over positions to
    finalize, chosen by top-k confidence (and filtered by `threshold` if given).
    """
    logits_with_noise = _add_gumbel_noise(logits, temperature=temperature)
    x0 = torch.argmax(logits_with_noise, dim=-1)

    # Confidence is the noise-free softmax probability of the chosen token.
    p = F.softmax(logits, dim=-1)
    x0_p = torch.squeeze(torch.gather(p, dim=-1, index=torch.unsqueeze(x0, -1)), -1)

    x0 = torch.where(mask_index, x0, x)
    # Non-masked positions get -inf so top-k ranks them last.
    confidence = torch.where(mask_index, x0_p, -np.inf)

    transfer_index = torch.zeros_like(x0, dtype=torch.bool, device=x0.device)
    if threshold is not None:
        # Threshold mode considers every masked position as a candidate.
        num_transfer_tokens = mask_index.sum(dim=1, keepdim=True)
    for j in range(confidence.shape[0]):
        _, select_index = torch.topk(confidence[j], k=num_transfer_tokens[j])
        transfer_index[j, select_index] = True
        if threshold is not None:
            # Start at 1: the most confident candidate is kept regardless of
            # the threshold, so each step commits at least one position.
            for k in range(1, num_transfer_tokens[j]):
                if confidence[j, select_index[k]] < threshold:
                    transfer_index[j, select_index[k]] = False
    return x0, transfer_index
862
-
863
-
864
def gumbel_topk(log_w: torch.Tensor, k: int) -> torch.Tensor:
    """Sample ``k`` distinct indices via Gumbel-perturbed top-k.

    Adds Gumbel noise to ``log_w`` and marks the ``k`` largest perturbed
    entries. Returns a bool mask shaped like ``log_w`` with exactly ``k`` True.
    """
    uniform = torch.rand_like(log_w)
    # The small constants keep both logarithms away from log(0).
    gumbel = -torch.log(-torch.log(uniform + 1e-9) + 1e-9)
    perturbed = log_w + gumbel
    chosen = torch.topk(perturbed, k).indices

    selected = torch.zeros_like(log_w, dtype=torch.bool)
    selected[chosen] = True
    return selected