Update chat_utils.py

#1 opened by lwhalen7
.gitattributes CHANGED
@@ -34,5 +34,3 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
 tokenizer.json filter=lfs diff=lfs merge=lfs -text
-*.png filter=lfs diff=lfs merge=lfs -text
-*.pdf filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -1,40 +1,29 @@
 ---
 library_name: transformers
-license: other
-license_name: cc-by-nc-4.0
-pipeline_tag: text-generation
+tags: []
 ---
 
-# Efficient-DLM-4B
+# Nemotron-Diffusion-Research-4B-v0
 
-<p align="center">
-📄 <a href="https://arxiv.org/pdf/2512.14067">Tech Report</a> &nbsp&nbsp|&nbsp&nbsp 🤗 <a href="https://huggingface.co/nvidia/Efficient-DLM-4B">Efficient-DLM-4B</a> &nbsp&nbsp|&nbsp&nbsp 🤗 <a href="https://huggingface.co/nvidia/Efficient-DLM-8B">Efficient-DLM-8B</a>
-</p>
+Developed by [DLER team](https://nv-dler.github.io/) @ NVR and will be updated actively. Contact Yonggan Fu and Pavlo Molchanov for any question.
 
 
-## Model Overview
+# Environment
 
-Efficient-DLM-4B is a base diffusion language model designed for parallel generation. It converts pretrained AR LMs into diffusion LMs through efficient continuous pretraining, enabling faster decoding while preserving the task accuracy of strong AR models. Efficient-DLM features block-wise attention with clean-context conditioning for KV-cache-friendly decoding, as well as position-dependent token masking to reduce the training–test mismatch in diffusion generation. See our [paper](https://arxiv.org/abs/2512.14067) for more technical details.
+Docker path: `/lustre/fsw/portfolios/nvr/users/yongganf/docker/megatron_py25_dllm.sqsh` on OCI-ORD/OCI-NRT. Apply for interactive nodes with the following command:
 
-<div align="center">
-<img src="https://huggingface.co/nvidia/Efficient-DLM-4B/resolve/main/images/result.png" alt="Accuracy vs throughput Pareto curve" width="500">
-</div>
-
-
-## Environment
-
-```bash
-transformers>=4.52.2
+```
+srun -A {account} --partition interactive --time 4:00:00 --gpus 8 --container-image /lustre/fsw/portfolios/nvr/users/yongganf/docker/megatron_py25_dllm.sqsh --container-mounts=$HOME:/home,/lustre:/lustre --pty bash
 ```
 
-
-## Chat with Efficient-DLM-4B
+## Chat with Our Model
 
-```python
+
+```
 from transformers import AutoModel, AutoTokenizer
 import torch
 
-repo_name = "nvidia/Efficient-DLM-4B"
+repo_name = "nvidia/Nemotron-Diffusion-Research-4B-v0"
 
 tokenizer = AutoTokenizer.from_pretrained(repo_name, trust_remote_code=True)
 model = AutoModel.from_pretrained(repo_name, trust_remote_code=True)
@@ -42,30 +31,10 @@ model = model.cuda().to(torch.bfloat16)
 
 user_input = input("User: ").strip()
 
-prompt_ids = tokenizer(user_input, return_tensors="pt").input_ids.to(device="cuda")
-out_ids, nfe = model.generate(
-    prompt_ids,
-    max_new_tokens=128,
-    steps=128,
-    block_length=32,
-    shift_logits=False,
-    temperature=0.7,
-    threshold=0.9,
-)
+prompt_ids = tokenizer(user_input,return_tensors='pt').input_ids.to(device='cuda')
+out_ids, nfe = model.generate(prompt_ids, max_new_tokens=128, steps=128, block_length=32, shift_logits=False, threshold=0.9)
 
-response = tokenizer.batch_decode(out_ids[:, prompt_ids.shape[1]:], skip_special_tokens=True)[0]
-print(f"Model: {response}")
+tokenized_out = tokenizer.batch_decode(out_ids[:, prompt_ids.shape[1]:], skip_special_tokens=True)[0]
+print(f"Model: {tokenized_out}")
 print(f"[Num Function Eval (NFE)={nfe}]")
-```
-
-
-## Citation
-
-```
-@article{fu2025efficient,
-  title={Efficient-dlm: From autoregressive to diffusion language models, and beyond in speed},
-  author={Fu, Yonggan and Whalen, Lexington and Ye, Zhifan and Dong, Xin and Diao, Shizhe and Liu, Jingyu and Wu, Chengyue and Zhang, Hao and Xie, Enze and Han, Song and others},
-  journal={arXiv preprint arXiv:2512.14067},
-  year={2025}
-}
 ```
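
For reference, the updated snippet wraps naturally into a small REPL-style loop. A minimal sketch, assuming the custom `generate()` signature shown in this diff (note `temperature` is no longer accepted, matching the `modeling` change below):

```python
# Minimal chat loop around the README snippet above; a sketch assuming the
# custom generate() signature from this diff (temperature was removed).
from transformers import AutoModel, AutoTokenizer
import torch

repo_name = "nvidia/Nemotron-Diffusion-Research-4B-v0"
tokenizer = AutoTokenizer.from_pretrained(repo_name, trust_remote_code=True)
model = AutoModel.from_pretrained(repo_name, trust_remote_code=True)
model = model.cuda().to(torch.bfloat16)

while True:
    user_input = input("User: ").strip()
    if not user_input:
        break
    prompt_ids = tokenizer(user_input, return_tensors="pt").input_ids.to(device="cuda")
    out_ids, nfe = model.generate(
        prompt_ids,
        max_new_tokens=128,   # tokens to denoise after the prompt
        steps=128,            # total diffusion steps across all blocks
        block_length=32,      # decode in blocks of 32 tokens
        shift_logits=False,
        threshold=0.9,        # confidence threshold for committing tokens
    )
    response = tokenizer.batch_decode(out_ids[:, prompt_ids.shape[1]:], skip_special_tokens=True)[0]
    print(f"Model: {response}")
    print(f"[Num Function Eval (NFE)={nfe}]")
```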
chat_utils.py CHANGED
@@ -3,32 +3,20 @@ import torch
 import torch.nn.functional as F
 
 
-def add_gumbel_noise(logits, temperature):
-    '''
-    The Gumbel max is a method for sampling categorical distributions.
-    According to arXiv:2409.02908, for MDM, low-precision Gumbel Max improves perplexity score but reduces generation quality.
-    Thus, we use float64.
-    '''
-    if temperature == 0:
-        return logits
-    logits = logits.to(torch.float64)
-    noise = torch.rand_like(logits, dtype=torch.float64)
-    gumbel_noise = (- torch.log(noise)) ** temperature
-    return logits.exp() / gumbel_noise
+def get_transfer_index(logits, temperature, remasking, mask_index, x, num_transfer_tokens, threshold=None,neg_entropy=False):
+    x0 = torch.argmax(logits, dim=-1) # b, l
+
+    if temperature is None or temperature <= 0:
+        temperature = 1.0
 
-
-def get_transfer_index(logits, temperature, remasking, mask_index, x, num_transfer_tokens, threshold=None, neg_entropy=False):
-    logits_with_noise = add_gumbel_noise(logits, temperature=temperature)
-    x0 = torch.argmax(logits_with_noise, dim=-1)
-
     if remasking == 'low_confidence':
         # p = F.softmax(logits.to(torch.float64), dim=-1)
-        p = F.softmax(logits, dim=-1)
+        p = F.softmax(logits/temperature, dim=-1)
         x0_p = torch.squeeze(
             torch.gather(p, dim=-1, index=torch.unsqueeze(x0, -1)), -1) # b, l
     elif remasking == 'top_p_margin':
         # Compute probabilities
-        p = F.softmax(logits, dim=-1) # (B, L, V)
+        p = F.softmax(logits/temperature, dim=-1) # (B, L, V)
         # Top-2 per position
         top2 = torch.topk(p, k=2, dim=-1).values # (B, L, 2)
         margin = top2[..., 0] - top2[..., 1] # (B, L)
@@ -64,7 +52,7 @@ def get_transfer_index(logits, temperature, remasking, mask_index, x, num_transfer_tokens, threshold=None, neg_entropy=False):
     # Calculate negative entropy if requested
     if neg_entropy:
         # p = F.softmax(logits.to(torch.float64), dim=-1)
-        p = F.softmax(logits, dim=-1)
+        p = F.softmax(logits/temperature, dim=-1)
         epsilon = 1e-10
         log_probs = torch.log(p + epsilon)
         confidence_scores = torch.sum(p * log_probs, dim=-1) # negative entropy per position
@@ -216,7 +204,6 @@ def generate_with_prefix_cache_block_diff(
             use_cache=True
         )
         past_key_values = output.past_key_values
-        nfe += 1
 
         if dream_style and num_block < num_blocks - 1:
             # refresh context-next logit for the next block
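
With the Gumbel-noise path removed, `get_transfer_index` takes plain argmax predictions and uses `temperature` only to rescale the softmax behind the confidence scores. A hypothetical standalone sketch of that commit rule (function name and the exact top-k/threshold combination are illustrative, not the repo's code):

```python
# Sketch of the confidence-based commit rule after this change: predictions are
# plain argmax; temperature only flattens or sharpens the confidence scores
# used to decide which masked positions to commit this step.
import torch
import torch.nn.functional as F

def commit_by_confidence(logits, mask_index, num_transfer_tokens, temperature=1.0, threshold=None):
    x0 = torch.argmax(logits, dim=-1)                         # greedy prediction, (b, l)
    p = F.softmax(logits / temperature, dim=-1)
    x0_p = torch.gather(p, -1, x0.unsqueeze(-1)).squeeze(-1)  # prob of each argmax token
    # only still-masked positions compete for being committed
    x0_p = torch.where(mask_index, x0_p, torch.full_like(x0_p, float("-inf")))
    topk = torch.topk(x0_p, k=num_transfer_tokens, dim=-1).indices
    transfer = torch.zeros_like(mask_index)
    transfer.scatter_(1, topk, torch.ones_like(topk, dtype=torch.bool))
    if threshold is not None:
        # optionally also commit everything already above the confidence threshold
        transfer |= (x0_p >= threshold) & mask_index
    return x0, transfer

logits = torch.randn(1, 8, 100)
mask_index = torch.ones(1, 8, dtype=torch.bool)
x0, transfer = commit_by_confidence(logits, mask_index, num_transfer_tokens=2, threshold=0.9)
print(x0[transfer])  # tokens committed this step
```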
config.json CHANGED
@@ -1,14 +1,14 @@
 {
   "adaptive_mask_rate": false,
   "architectures": [
-    "EfficientDLM"
+    "DiffEncoderModel"
   ],
   "attention_bias": false,
   "attention_dropout": 0.0,
   "attn_implementation": "sdpa",
   "auto_map": {
-    "AutoConfig": "configuration_edlm.EfficientDLMConfig",
-    "AutoModel": "modeling_edlm.EfficientDLM"
+    "AutoConfig": "configuration_nvrdiff.NVRDiffConfig",
+    "AutoModel": "modeling_nvrdiff.DiffEncoderModel"
   },
   "block_size": 32,
   "diff_loss_weight": 1,
@@ -38,6 +38,7 @@
   "rms_norm_eps": 1e-06,
   "rope_scaling": null,
   "rope_theta": 1000000,
+  "seq_length": 1024,
   "sliding_window": null,
   "tie_word_embeddings": false,
   "tok_mask_half_life_ratio": null,
configuration_edlm.py → configuration_nvrdiff.py RENAMED
@@ -22,7 +22,7 @@ from transformers.utils import logging
 logger = logging.get_logger(__name__)
 
 
-class EfficientDLMConfig(PretrainedConfig):
+class NVRDiffConfig(PretrainedConfig):
     r"""
     This is the configuration class to store the configuration of a [`Qwen3Model`]. It is used to instantiate a
     Qwen3 model according to the specified arguments, defining the model architecture. Instantiating a configuration
@@ -172,6 +172,7 @@ class EfficientDLMConfig(PretrainedConfig):
         max_window_layers=28,
         attention_dropout=0.0,
         attn_implementation="sdpa",
+        seq_length=1024,
         mask_token_id=-1,
         dlm_type='llada',
         random_length_prob=None,
@@ -221,6 +222,7 @@ class EfficientDLMConfig(PretrainedConfig):
         rope_config_validation(self)
 
         self.attn_implementation = attn_implementation
+        self.seq_length = seq_length
 
         self.mask_token_id = mask_token_id
         self.dlm_type = dlm_type
@@ -245,4 +247,4 @@ class EfficientDLMConfig(PretrainedConfig):
         )
 
 
-__all__ = ["EfficientDLMConfig"]
+__all__ = ["Qwen3Config"]
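
The new `seq_length` argument is stored on the config and read by the attention module in `modeling_nvrdiff.py` below. A minimal sketch of constructing it directly (the local import path and the value passed are illustrative; the checkpoint ships 1024):

```python
# Direct construction of the renamed config; a sketch assuming
# configuration_nvrdiff.py is importable from the working directory.
from configuration_nvrdiff import NVRDiffConfig

cfg = NVRDiffConfig(seq_length=2048)
assert cfg.seq_length == 2048  # stored by __init__ via self.seq_length = seq_length
```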
images/result.png → model-00001-of-00002.safetensors RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:9b81fe6641cd8816c4041697b0ac2cb1c4fcdfc2166504e2bde174c67ddc7eae
-size 221103
+oid sha256:42a85e2aa98cd482ece3ec213560fa67c1e15cbfa2a58c366e2c516887e50927
+size 4967215816
model.safetensors → model-00002-of-00002.safetensors RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:77c83e52654fd49874f6b09cf78b739da454c8320dd54c6970c3e5f88dc5e7c4
-size 8822895320
+oid sha256:fcc2f6d41ac9fec18b6593d91efb2d1cd5abf7c76433d98887d65d8306b96523
+size 3855679488
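
Both pointers follow the Git LFS v1 spec: `oid` is the SHA-256 of the real file contents and `size` its byte length. A sketch for verifying a downloaded shard against its pointer:

```python
# Recompute the LFS oid of a downloaded shard and compare it to the pointer above.
import hashlib

def lfs_oid(path: str) -> str:
    h = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):  # hash in 1 MiB chunks
            h.update(chunk)
    return h.hexdigest()

assert lfs_oid("model-00002-of-00002.safetensors") == (
    "fcc2f6d41ac9fec18b6593d91efb2d1cd5abf7c76433d98887d65d8306b96523"
)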
model.safetensors.index.json ADDED
@@ -0,0 +1,406 @@
+{
+  "metadata": {
+    "total_size": 8822848512
+  },
+  "weight_map": {
+    "diffusion_head.weight": "model-00002-of-00002.safetensors",
+    "encoder.embed_tokens.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.0.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.0.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.0.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.0.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.0.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.0.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.0.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.0.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.0.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.0.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.1.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.1.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.1.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.1.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.1.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.1.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.1.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.1.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.1.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.1.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.10.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.10.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.10.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.10.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.10.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.10.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.10.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.10.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.10.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.10.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.10.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.11.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.11.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.11.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.11.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.11.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.11.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.11.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.11.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.11.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.11.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.11.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.12.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.12.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.12.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.12.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.12.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.12.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.12.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.12.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.12.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.12.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.12.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.13.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.13.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.13.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.13.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.13.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.13.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.13.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.13.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.13.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.13.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.13.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.14.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.14.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.14.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.14.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.14.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.14.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.14.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.14.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.14.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.14.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.14.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.15.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.15.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.15.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.15.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.15.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.15.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.15.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.15.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.15.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.15.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.15.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.16.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.16.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.16.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.16.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.16.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.16.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.16.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.16.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.16.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.16.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.16.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.17.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.17.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.17.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.17.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.17.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.17.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.17.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.17.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.17.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.17.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.17.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.18.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.18.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.18.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.18.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.18.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.18.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.18.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.18.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.18.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.18.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.18.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.19.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.19.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.19.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.19.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.19.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.19.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.19.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.19.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.19.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.19.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.19.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.2.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.2.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.2.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.2.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.2.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.2.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.2.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.2.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.2.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.2.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.20.input_layernorm.weight": "model-00002-of-00002.safetensors",
+    "encoder.layers.20.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
+    "encoder.layers.20.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.20.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.20.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
+    "encoder.layers.20.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.20.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.20.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.20.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.20.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.20.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.21.input_layernorm.weight": "model-00002-of-00002.safetensors",
+    "encoder.layers.21.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
+    "encoder.layers.21.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
+    "encoder.layers.21.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
+    "encoder.layers.21.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
+    "encoder.layers.21.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
+    "encoder.layers.21.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
+    "encoder.layers.21.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
+    "encoder.layers.21.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
+    "encoder.layers.21.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
+    "encoder.layers.21.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
+    "encoder.layers.22.input_layernorm.weight": "model-00002-of-00002.safetensors",
+    "encoder.layers.22.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
+    "encoder.layers.22.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
+    "encoder.layers.22.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
+    "encoder.layers.22.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
+    "encoder.layers.22.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
+    "encoder.layers.22.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
+    "encoder.layers.22.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
+    "encoder.layers.22.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
+    "encoder.layers.22.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
+    "encoder.layers.22.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
+    "encoder.layers.23.input_layernorm.weight": "model-00002-of-00002.safetensors",
+    "encoder.layers.23.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
+    "encoder.layers.23.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
+    "encoder.layers.23.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
+    "encoder.layers.23.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
+    "encoder.layers.23.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
+    "encoder.layers.23.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
+    "encoder.layers.23.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
+    "encoder.layers.23.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
+    "encoder.layers.23.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
+    "encoder.layers.23.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
+    "encoder.layers.24.input_layernorm.weight": "model-00002-of-00002.safetensors",
+    "encoder.layers.24.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
+    "encoder.layers.24.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
+    "encoder.layers.24.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
+    "encoder.layers.24.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
+    "encoder.layers.24.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
+    "encoder.layers.24.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
+    "encoder.layers.24.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
+    "encoder.layers.24.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
+    "encoder.layers.24.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
+    "encoder.layers.24.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
+    "encoder.layers.25.input_layernorm.weight": "model-00002-of-00002.safetensors",
+    "encoder.layers.25.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
+    "encoder.layers.25.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
+    "encoder.layers.25.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
+    "encoder.layers.25.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
+    "encoder.layers.25.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
+    "encoder.layers.25.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
+    "encoder.layers.25.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
+    "encoder.layers.25.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
+    "encoder.layers.25.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
+    "encoder.layers.25.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
+    "encoder.layers.26.input_layernorm.weight": "model-00002-of-00002.safetensors",
+    "encoder.layers.26.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
+    "encoder.layers.26.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
+    "encoder.layers.26.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
+    "encoder.layers.26.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
+    "encoder.layers.26.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
+    "encoder.layers.26.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
+    "encoder.layers.26.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
+    "encoder.layers.26.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
+    "encoder.layers.26.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
+    "encoder.layers.26.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
+    "encoder.layers.27.input_layernorm.weight": "model-00002-of-00002.safetensors",
+    "encoder.layers.27.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
+    "encoder.layers.27.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
+    "encoder.layers.27.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
+    "encoder.layers.27.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
+    "encoder.layers.27.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
+    "encoder.layers.27.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
+    "encoder.layers.27.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
+    "encoder.layers.27.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
+    "encoder.layers.27.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
+    "encoder.layers.27.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
+    "encoder.layers.28.input_layernorm.weight": "model-00002-of-00002.safetensors",
+    "encoder.layers.28.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
+    "encoder.layers.28.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
+    "encoder.layers.28.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
+    "encoder.layers.28.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
+    "encoder.layers.28.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
+    "encoder.layers.28.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
+    "encoder.layers.28.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
+    "encoder.layers.28.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
+    "encoder.layers.28.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
+    "encoder.layers.28.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
+    "encoder.layers.29.input_layernorm.weight": "model-00002-of-00002.safetensors",
+    "encoder.layers.29.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
+    "encoder.layers.29.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
+    "encoder.layers.29.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
+    "encoder.layers.29.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
+    "encoder.layers.29.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
+    "encoder.layers.29.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
+    "encoder.layers.29.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
+    "encoder.layers.29.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
+    "encoder.layers.29.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
+    "encoder.layers.29.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
+    "encoder.layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.3.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.3.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.3.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.3.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.3.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.3.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.3.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.3.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.3.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.3.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.30.input_layernorm.weight": "model-00002-of-00002.safetensors",
+    "encoder.layers.30.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
+    "encoder.layers.30.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
+    "encoder.layers.30.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
+    "encoder.layers.30.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
+    "encoder.layers.30.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
+    "encoder.layers.30.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
+    "encoder.layers.30.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
+    "encoder.layers.30.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
+    "encoder.layers.30.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
+    "encoder.layers.30.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
+    "encoder.layers.31.input_layernorm.weight": "model-00002-of-00002.safetensors",
+    "encoder.layers.31.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
+    "encoder.layers.31.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
+    "encoder.layers.31.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
+    "encoder.layers.31.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
+    "encoder.layers.31.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
+    "encoder.layers.31.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
+    "encoder.layers.31.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
+    "encoder.layers.31.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
+    "encoder.layers.31.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
+    "encoder.layers.31.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
+    "encoder.layers.32.input_layernorm.weight": "model-00002-of-00002.safetensors",
+    "encoder.layers.32.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
+    "encoder.layers.32.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
+    "encoder.layers.32.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
+    "encoder.layers.32.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
+    "encoder.layers.32.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
+    "encoder.layers.32.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
+    "encoder.layers.32.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
+    "encoder.layers.32.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
+    "encoder.layers.32.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
+    "encoder.layers.32.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
+    "encoder.layers.33.input_layernorm.weight": "model-00002-of-00002.safetensors",
+    "encoder.layers.33.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
+    "encoder.layers.33.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
+    "encoder.layers.33.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
+    "encoder.layers.33.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
+    "encoder.layers.33.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
+    "encoder.layers.33.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
+    "encoder.layers.33.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
+    "encoder.layers.33.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
+    "encoder.layers.33.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
+    "encoder.layers.33.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
+    "encoder.layers.34.input_layernorm.weight": "model-00002-of-00002.safetensors",
+    "encoder.layers.34.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
+    "encoder.layers.34.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
+    "encoder.layers.34.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
+    "encoder.layers.34.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
+    "encoder.layers.34.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
+    "encoder.layers.34.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
+    "encoder.layers.34.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
+    "encoder.layers.34.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
+    "encoder.layers.34.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
+    "encoder.layers.34.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
+    "encoder.layers.35.input_layernorm.weight": "model-00002-of-00002.safetensors",
+    "encoder.layers.35.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
+    "encoder.layers.35.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
+    "encoder.layers.35.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
+    "encoder.layers.35.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
+    "encoder.layers.35.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
+    "encoder.layers.35.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
+    "encoder.layers.35.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
+    "encoder.layers.35.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
+    "encoder.layers.35.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
+    "encoder.layers.35.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
+    "encoder.layers.4.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.4.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.4.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.4.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.4.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.4.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.4.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.4.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.4.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.4.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.4.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.5.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.5.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.5.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.5.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.5.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.5.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.5.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.5.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.5.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.5.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.5.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.6.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.6.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.6.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.6.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.6.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.6.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.6.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.6.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.6.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.6.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.6.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.7.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.7.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.7.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.7.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.7.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.7.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.7.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.7.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.7.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.7.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.7.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.8.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.8.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.8.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.8.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.8.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.8.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.8.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.8.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.8.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.8.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.8.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.9.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.9.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.9.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.9.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.9.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.9.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.9.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.9.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.9.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.9.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "encoder.layers.9.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "encoder.norm.weight": "model-00002-of-00002.safetensors"
+  }
+}
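
The index added above lets loaders fetch only the shard that holds a given tensor. A sketch using the `safetensors` package:

```python
# Look up which shard holds a tensor, then load just that tensor from it.
import json
from safetensors import safe_open

with open("model.safetensors.index.json") as f:
    index = json.load(f)

name = "encoder.layers.20.mlp.down_proj.weight"
shard = index["weight_map"][name]          # "model-00002-of-00002.safetensors"
with safe_open(shard, framework="pt") as f:
    tensor = f.get_tensor(name)
print(tensor.shape)
```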
modeling_edlm.py → modeling_nvrdiff.py RENAMED
@@ -22,7 +22,7 @@ from transformers.generation import GenerationMixin
22
  import math
23
 
24
  from .modeling_qwen3 import Qwen3Model, Qwen3PreTrainedModel, Qwen3Attention, apply_rotary_pos_emb, repeat_kv
25
- from .configuration_edlm import EfficientDLMConfig
26
  from .chat_utils import generate_with_prefix_cache_block_diff
27
 
28
  # @torch.compile(dynamic=True, mode="reduce-overhead")
@@ -37,32 +37,46 @@ class Qwen3FlexAttention(Qwen3Attention):
37
  def __init__(self, *args, **kwargs):
38
  super().__init__(*args, **kwargs)
39
 
40
- self.block_size = self.block_size_orig = self.config.block_size
 
 
41
 
42
- self.bidirectional_mask = None
43
  if self.config.dlm_paradigm == 'bidirectional':
44
  self.bidirectional_mask = self.compute_block_mask(mode='bidirectional')
 
 
 
 
45
  elif self.config.dlm_paradigm == 'block_diff':
46
- self.block_diff_mask = None
47
  else:
48
  raise ValueError(f"Unknown attention mode: {self.config.dlm_paradigm}")
49
 
 
 
50
  self.mode = 'bidirectional'
51
 
52
  import torch._dynamo.config as dcfg
53
  dcfg.cache_size_limit = 512
54
 
55
 
56
- def set_attention_mode(self, mode, block_size=None):
57
  self.mode = mode
 
58
  self.block_size = block_size
59
 
60
 
61
- def compute_block_mask(self, mode, q_len, block_size=None):
62
 
63
  def bidirectional_mask(b, h, q, kv):
64
  return (q >= kv) | (q < kv)
65
 
 
 
 
 
 
 
66
  def block_diff_mask(block_size, b, h, q_idx, kv_idx, n):
67
  """
68
  Constructs the specialized block diffusion attention mask for training
@@ -70,11 +84,13 @@ class Qwen3FlexAttention(Qwen3Attention):
70
  - **Block Diagonal Mask (M_BD)**: Self-attention within noised blocks
71
  - **Offset Block Causal Mask (M_OBC)**: Cross-attention for conditional context
72
  - **Block Causal Mask (M_BC)**: Attention to update x0
 
73
  Args:
74
  b, h: Batch and head indices (ignored for mask logic).
75
  q_idx, kv_idx: Query and Key indices.
76
  seq_len: Total sequence length.
77
  block_size: Defines the block structure.
 
78
  Returns:
79
  A boolean attention mask.
80
  """
@@ -109,14 +125,28 @@ class Qwen3FlexAttention(Qwen3Attention):
109
 
110
  if mode == 'bidirectional':
111
  attn_mask = bidirectional_mask
 
 
 
 
 
 
112
  elif mode == 'block_diff':
113
  assert block_size is not None
114
- attn_mask = lambda b, h, q, kv: block_diff_mask(block_size, b, h, q, kv, q_len//2)
115
  else:
116
  raise ValueError(f"Unknown attention mode: {mode}")
117
 
 
 
 
 
 
 
 
 
118
  block_mask = create_block_mask(
119
- attn_mask, B=None, H=None, Q_LEN=q_len, KV_LEN=q_len
120
  )
121
 
122
  return block_mask
@@ -166,12 +196,28 @@ class Qwen3FlexAttention(Qwen3Attention):
166
  value_states = repeat_kv(value_states, self.num_key_value_groups)
167
 
168
  if self.mode == 'bidirectional':
169
- if self.bidirectional_mask is None or q_len != self.bidirectional_mask.shape[-2]:
170
- block_mask = self.compute_block_mask(mode='bidirectional', q_len=q_len)
171
  else:
172
  block_mask = self.bidirectional_mask
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
173
  elif self.mode == 'block_diff':
174
- if self.block_diff_mask is None or self.block_size != self.block_size_orig or q_len != self.block_diff_mask.shape[-2]:
175
  block_mask = self.compute_block_mask(mode='block_diff', block_size=self.block_size, q_len=q_len)
176
  else:
177
  block_mask = self.block_diff_mask
@@ -195,14 +241,14 @@ def gumbel_topk(log_w: torch.Tensor, k: int) -> torch.Tensor:
195
  return mask
196
 
197
 
198
- class EfficientDLM(Qwen3PreTrainedModel, GenerationMixin):
199
  """
200
  A single model with:
201
  - a bidirectional encoder + diffusion‐LM head over A
202
  - a causal decoder + LM head over B, conditioned on F_A
203
  """
204
 
205
- def __init__(self, config: EfficientDLMConfig):
206
  super().__init__(config)
207
 
208
  self.mask_token_id = config.mask_token_id
@@ -210,7 +256,7 @@ class EfficientDLM(Qwen3PreTrainedModel, GenerationMixin):
210
  diffusion_config = copy.deepcopy(config)
211
  diffusion_config.diffusion_lm = True
212
 
213
- if config.dlm_paradigm in ['block_diff']:
214
  diffusion_config.attn_class = Qwen3FlexAttention
215
  elif config.dlm_paradigm in ['bidirectional', 'autoregressive']:
216
  diffusion_config.attn_class = Qwen3Attention
@@ -256,13 +302,16 @@ class EfficientDLM(Qwen3PreTrainedModel, GenerationMixin):
256
  ):
257
  """
258
  Two-stage corruption with optional per-block sampling.
 
259
  • Stage 1: m ~ U(eps, 1) → k = round(m · len) (exact budget).
260
  • Stage 2: sample exactly k positions with weights
261
  w_i(m) = exp[ λ · (1−m) · i ] (late-heavy when m→0,
262
  uniform when m→1).
 
263
  If `block_size` is given, the procedure is run *independently*
264
  inside each contiguous block of that length (last block may be shorter).
265
  When block_size is provided, m is sampled per-block and p_mask is per-block.
 
266
  Args
267
  ----
268
  input_ids : (B, L) LongTensor
@@ -350,73 +399,81 @@ class EfficientDLM(Qwen3PreTrainedModel, GenerationMixin):
350
  masked_indices: Optional[torch.Tensor] = None,
351
  p_mask: Optional[torch.Tensor] = None,
352
  loss_mask: Optional[torch.Tensor] = None,
353
- skip_loss: bool = False,
354
- inputs_embeds: Optional[torch.FloatTensor] = None,
355
  **kwargs,
356
  ) -> CausalLMOutputWithPast:
357
 
358
- if inputs_embeds is not None:
359
- noisy_inputs = None
360
- else:
361
- batch_size, seq_len = input_ids.shape
362
-
363
- if self.config.dlm_paradigm == 'bidirectional':
364
- if labels is not None and torch.rand(1) < self.config.random_length_prob:
365
- random_length = torch.randint(2, input_ids.shape[1] + 1, (1,))
366
- input_ids = input_ids[:, :random_length]
367
- labels = labels[:, :random_length]
368
-
369
- if attention_mask is not None:
370
- attention_mask = attention_mask[:, :random_length]
371
- if position_ids is not None:
372
- position_ids = position_ids[:, :random_length]
373
 
374
-        elif self.config.dlm_paradigm == 'block_diff':
-            if labels is not None and block_size is None:
-                if torch.rand(1) < self.config.random_length_prob:
-                    block_size = torch.randint(1, 8, (1,)).item() * 4 ## [4, 32] divisible by 4
-                else:
-                    block_size = self.config.block_size
 
-        if labels is not None and self.config.dlm_paradigm != 'autoregressive':
-            if masked_indices is not None:
-                #assert p_mask is not None
 
-                if loss_mask is not None:
-                    masked_indices[loss_mask == 0] = 0
 
-                noisy_inputs = torch.where(masked_indices, self.mask_token_id, input_ids)
 
-            else:
-                if self.config.tok_mask_half_life_ratio is not None:
-                    noisy_inputs, masked_indices, p_mask = self.forward_process_exp(input_ids, eps=eps, block_size=block_size, half_life_ratio=self.config.tok_mask_half_life_ratio, loss_mask=loss_mask)
-                else:
-                    noisy_inputs, masked_indices, p_mask = self.forward_process(input_ids, eps=eps, block_size=block_size, loss_mask=loss_mask)
 
         else:
-            noisy_inputs = input_ids
-            masked_indices = None
-            p_mask = None
 
-        if self.config.dlm_paradigm in ['block_diff']:
-            for layer in self.encoder.layers:
-                if hasattr(layer.self_attn, 'set_attention_mode'):
-                    layer.self_attn.set_attention_mode(self.config.dlm_paradigm, block_size=block_size)
 
-        input_ids_len = noisy_inputs.shape[1]
-        if labels is not None and self.config.dlm_paradigm == 'block_diff':
-            if position_ids is None:
-                position_ids = torch.arange(input_ids_len, device=noisy_inputs.device).unsqueeze(0)
-            noisy_inputs = torch.cat([noisy_inputs, input_ids], dim=1)
 
-        if block_diff_ppl:
-            if position_ids is None:
-                position_ids = torch.arange(input_ids_len // 2, device=noisy_inputs.device).unsqueeze(0)
 
         enc_out = self.encoder(
             past_key_values=past_key_values,
             input_ids=noisy_inputs,
-            inputs_embeds=inputs_embeds,
             attention_mask=attention_mask,
             position_ids=position_ids,
             is_training=(labels is not None) or (block_diff_ppl),
@@ -429,56 +486,56 @@ class EfficientDLM(Qwen3PreTrainedModel, GenerationMixin):
         logits = logits[:, :input_ids_len]
 
         loss = None
-        if labels is not None and not skip_loss:
-            if self.config.dlm_paradigm == 'autoregressive':
-                shift_logits = logits[..., :-1, :].contiguous()
-                shift_labels = labels[..., 1:].contiguous()
 
-                if loss_mask is None:
-                    loss_fct = CrossEntropyLoss()
-                    shift_logits = shift_logits.view(-1, shift_logits.size(-1))
-                    shift_labels = shift_labels.view(-1)
-                    loss = loss_fct(shift_logits, shift_labels)
-
-                else:
-                    loss_mask = loss_mask[..., 1:].contiguous()
-
-                    loss_fct = CrossEntropyLoss(reduction='none')
-                    shift_logits = shift_logits.view(-1, shift_logits.size(-1))
-                    shift_labels = shift_labels.view(-1)
-                    shift_labels = shift_labels.to(shift_logits.device)
 
-                    token_losses = loss_fct(shift_logits, shift_labels)
 
-                    loss = token_losses[loss_mask].sum() / loss_mask.sum()
-
-            else:
-                # Handle DREAM vs LLADA style losses
-                if hasattr(self.config, 'dlm_type') and self.config.dlm_type == 'dream':
-                    logits = logits[..., :-1, :].contiguous()
-                    labels = labels[..., 1:].contiguous()
-                    masked_indices = masked_indices[:, 1:]
-                    p_mask = p_mask[:, 1:]
-
-                    # Calculate token-wise cross entropy loss for masked positions in B
-                    token_loss = torch.nn.functional.cross_entropy(
-                        logits[masked_indices],
-                        labels[masked_indices],
-                        reduction='none'
-                    ) / p_mask[masked_indices]
 
-                    loss = token_loss.sum() / masked_indices.sum()
 
         return CausalLMOutputWithPast(
             loss=loss if not is_teacher else logits,
             logits=logits,
             past_key_values=enc_out.past_key_values,
-            hidden_states=enc_out.last_hidden_state,
             attentions=None,
         )
 
 
-    def generate(self, prompt_ids, max_new_tokens, steps, block_length, shift_logits, threshold, temperature=0):
         out_ids, nfe = generate_with_prefix_cache_block_diff(
             model=self,
             prompt=prompt_ids,
@@ -489,7 +546,6 @@ class EfficientDLM(Qwen3PreTrainedModel, GenerationMixin):
             mask_id=self.mask_token_id,
             threshold=threshold,
             shift_logits=shift_logits,
-            temperature=temperature,
             neg_entropy=False,
         )
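The left pane above deletes the explicit training-loss branch (it reappears commented out in the new version further down). For readers tracking the change, here is a minimal self-contained sketch of the DREAM-style objective that branch implemented: cross-entropy over masked positions only, reweighted by the inverse masking probability `p_mask`. The function name and toy shapes are mine, not from the repo.

```python
import torch
import torch.nn.functional as F

def dream_style_diffusion_loss(logits, labels, masked_indices, p_mask):
    """Masked cross-entropy, importance-weighted by the per-token mask
    probability, mirroring the DREAM-style branch removed above.

    logits: (B, L, V); labels: (B, L); masked_indices: (B, L) bool;
    p_mask: (B, L) probability with which each token was masked.
    """
    # Shift so position i predicts token i+1, as in the removed code.
    logits = logits[..., :-1, :].contiguous()
    labels = labels[..., 1:].contiguous()
    masked_indices = masked_indices[:, 1:]
    p_mask = p_mask[:, 1:]

    # Cross-entropy on masked positions only, divided by p_mask so the
    # estimator stays comparable across noise levels.
    token_loss = F.cross_entropy(
        logits[masked_indices], labels[masked_indices], reduction='none'
    ) / p_mask[masked_indices]
    return token_loss.sum() / masked_indices.sum()

# Toy check on random tensors (hypothetical shapes).
B, L, V = 2, 32, 100
logits = torch.randn(B, L, V)
labels = torch.randint(0, V, (B, L))
masked = torch.rand(B, L) < 0.5
p_mask = torch.full((B, L), 0.5)
print(dream_style_diffusion_loss(logits, labels, masked, p_mask))
```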
 
 
 import math
 
 from .modeling_qwen3 import Qwen3Model, Qwen3PreTrainedModel, Qwen3Attention, apply_rotary_pos_emb, repeat_kv
+from .configuration_nvrdiff import NVRDiffConfig
 from .chat_utils import generate_with_prefix_cache_block_diff
 
 # @torch.compile(dynamic=True, mode="reduce-overhead")
 
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
 
+        self.max_seq_length = self.config.seq_length
+        self.prefix_len_orig = int(self.config.seq_length * self.config.prefix_ratio)
+        self.block_size_orig = self.config.block_size
 
         if self.config.dlm_paradigm == 'bidirectional':
             self.bidirectional_mask = self.compute_block_mask(mode='bidirectional')
+        elif self.config.dlm_paradigm == 'prefix_bidirectional':
+            self.prefix_bidirectional_mask = self.compute_block_mask(mode='prefix_bidirectional', prefix_len=self.prefix_len_orig)
+        elif self.config.dlm_paradigm == 'efficient_block_diff':
+            self.efficient_block_diff_mask = self.compute_block_mask(mode='efficient_block_diff', block_size=self.block_size_orig)
         elif self.config.dlm_paradigm == 'block_diff':
+            self.block_diff_mask = self.compute_block_mask(mode='block_diff', block_size=self.block_size_orig)
         else:
             raise ValueError(f"Unknown attention mode: {self.config.dlm_paradigm}")
 
+        self.prefix_len = self.prefix_len_orig
+        self.block_size = self.block_size_orig
         self.mode = 'bidirectional'
 
         import torch._dynamo.config as dcfg
         dcfg.cache_size_limit = 512
 
+    def set_attention_mode(self, mode, prefix_len=None, block_size=None):
         self.mode = mode
+        self.prefix_len = prefix_len
         self.block_size = block_size
 
+    def compute_block_mask(self, mode, prefix_len=None, q_len=None, block_size=None):
 
         def bidirectional_mask(b, h, q, kv):
             return (q >= kv) | (q < kv)
 
+        def prefix_bidirectional_mask(prefix_len, b, h, q, kv):
+            return (kv <= prefix_len) | (q >= prefix_len)
+
+        def efficient_block_diff_mask(block_size, b, h, q, kv):
+            return (q // block_size) >= (kv // block_size)
+
         def block_diff_mask(block_size, b, h, q_idx, kv_idx, n):
             """
             Constructs the specialized block diffusion attention mask for training, composed of:
             - **Block Diagonal Mask (M_BD)**: Self-attention within noised blocks
             - **Offset Block Causal Mask (M_OBC)**: Cross-attention for conditional context
             - **Block Causal Mask (M_BC)**: Attention to update x0
+
             Args:
                 b, h: Batch and head indices (ignored for mask logic).
                 q_idx, kv_idx: Query and Key indices.
                 n: Total sequence length.
                 block_size: Defines the block structure.
+
             Returns:
                 A boolean attention mask.
             """
 
         if mode == 'bidirectional':
             attn_mask = bidirectional_mask
+        elif mode == 'prefix_bidirectional':
+            assert prefix_len is not None
+            attn_mask = lambda b, h, q, kv: prefix_bidirectional_mask(prefix_len, b, h, q, kv)
+        elif mode == 'efficient_block_diff':
+            assert block_size is not None
+            attn_mask = lambda b, h, q, kv: efficient_block_diff_mask(block_size, b, h, q, kv)
         elif mode == 'block_diff':
             assert block_size is not None
+            attn_mask = lambda b, h, q, kv: block_diff_mask(block_size, b, h, q, kv, self.max_seq_length)
         else:
             raise ValueError(f"Unknown attention mode: {mode}")
 
+        if q_len is not None:
+            Q_LEN = q_len
+        else:
+            if mode == 'block_diff':
+                Q_LEN = self.max_seq_length * 2
+            else:
+                Q_LEN = self.max_seq_length
+
         block_mask = create_block_mask(
+            attn_mask, B=None, H=None, Q_LEN=Q_LEN, KV_LEN=Q_LEN
         )
 
         return block_mask
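A small illustration of the mask predicates defined above (this snippet is not part of the repo). It evaluates each predicate densely on an 8-token grid so the attention patterns are visible; note that `bidirectional_mask` is deliberately a tautology, since flex attention expects a callable predicate even for full attention.

```python
import torch

# Dense re-implementations of the predicates above on a toy grid.
# prefix_len/block_size values are arbitrary; the real code builds these
# lazily as sparse BlockMasks via create_block_mask.
L, prefix_len, block_size = 8, 4, 2
q = torch.arange(L).unsqueeze(1)   # query index, column vector
kv = torch.arange(L).unsqueeze(0)  # key index, row vector

full_bidir = (q >= kv) | (q < kv)                      # always True: full attention
# prefix queries see only the prefix; suffix queries see everything
prefix_bidir = (kv <= prefix_len) | (q >= prefix_len)
# block-causal: each block attends to itself (bidirectionally) and earlier blocks
block_causal = (q // block_size) >= (kv // block_size)

for name, m in [("bidirectional", full_bidir),
                ("prefix_bidirectional", prefix_bidir),
                ("efficient_block_diff", block_causal)]:
    print(name)
    print(m.int())
```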
 
         value_states = repeat_kv(value_states, self.num_key_value_groups)
 
         if self.mode == 'bidirectional':
+            if q_len != self.bidirectional_mask.shape[-2]:
+                block_mask = self.compute_block_mask(mode='bidirectional', prefix_len=self.prefix_len, q_len=q_len)
             else:
                 block_mask = self.bidirectional_mask
+
+        elif self.mode == 'prefix_bidirectional':
+            if self.prefix_len != self.prefix_len_orig or q_len != self.prefix_bidirectional_mask.shape[-2]:
+                block_mask = self.compute_block_mask(mode='prefix_bidirectional', prefix_len=self.prefix_len, q_len=q_len)
+
+                # print('create new block mask length for:', self.prefix_len)
+                # print(f"Block mask shape: {block_mask.shape}")
+                # print("Block mask pattern:")
+                # print(block_mask)
+            else:
+                block_mask = self.prefix_bidirectional_mask
+        elif self.mode == 'efficient_block_diff':
+            if self.block_size != self.block_size_orig or q_len != self.efficient_block_diff_mask.shape[-2]:
+                block_mask = self.compute_block_mask(mode='efficient_block_diff', block_size=self.block_size, q_len=q_len)
+            else:
+                block_mask = self.efficient_block_diff_mask
         elif self.mode == 'block_diff':
+            if self.block_size != self.block_size_orig or q_len != self.block_diff_mask.shape[-2]:
                 block_mask = self.compute_block_mask(mode='block_diff', block_size=self.block_size, q_len=q_len)
             else:
                 block_mask = self.block_diff_mask
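The branchy logic above implements a cache-or-rebuild policy: each mode keeps the BlockMask precomputed in `__init__` and only calls `compute_block_mask` again when `block_size`, `prefix_len`, or the query length changes, since building the mask is the expensive step. A minimal sketch of the same pattern with PyTorch's flex attention (requires a recent PyTorch that ships `torch.nn.attention.flex_attention`; shapes and the `BLOCK` value are illustrative):

```python
import torch
from torch.nn.attention.flex_attention import create_block_mask, flex_attention

device = "cuda" if torch.cuda.is_available() else "cpu"
BLOCK, L, H, D = 32, 256, 4, 64

# Block-causal predicate, same shape as efficient_block_diff_mask above.
def block_causal(b, h, q_idx, kv_idx):
    return (q_idx // BLOCK) >= (kv_idx // BLOCK)

# Built once and reused across forward passes; rebuild only if BLOCK or L changes.
block_mask = create_block_mask(block_causal, B=None, H=None,
                               Q_LEN=L, KV_LEN=L, device=device)

q, k, v = (torch.randn(1, H, L, D, device=device) for _ in range(3))
out = flex_attention(q, k, v, block_mask=block_mask)
print(out.shape)  # torch.Size([1, 4, 256, 64])
```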
 
         return mask
 
 
+class DiffEncoderModel(Qwen3PreTrainedModel, GenerationMixin):
     """
     A single model with:
       - a bidirectional encoder + diffusion‐LM head over A
       - a causal decoder + LM head over B, conditioned on F_A
     """
 
+    def __init__(self, config: NVRDiffConfig):
         super().__init__(config)
 
         self.mask_token_id = config.mask_token_id
 
         diffusion_config = copy.deepcopy(config)
         diffusion_config.diffusion_lm = True
 
+        if config.dlm_paradigm in ['prefix_bidirectional', 'efficient_block_diff', 'block_diff']:
             diffusion_config.attn_class = Qwen3FlexAttention
         elif config.dlm_paradigm in ['bidirectional', 'autoregressive']:
             diffusion_config.attn_class = Qwen3Attention
 
     ):
         """
         Two-stage corruption with optional per-block sampling.
+
         • Stage 1: m ~ U(eps, 1) → k = round(m · len) (exact budget).
         • Stage 2: sample exactly k positions with weights
               w_i(m) = exp[ λ · (1−m) · i ]   (late-heavy when m→0,
               uniform when m→1).
+
         If `block_size` is given, the procedure is run *independently*
         inside each contiguous block of that length (last block may be shorter).
         When block_size is provided, m is sampled per-block and p_mask is per-block.
+
         Args
         ----
         input_ids : (B, L) LongTensor
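A runnable sketch of the two-stage corruption the docstring describes, applied to a whole sequence as a single block (this is not the repo's implementation). How λ is derived from `tok_mask_half_life_ratio` is not visible in this diff, so `lam` below is an arbitrary stand-in:

```python
import torch

def two_stage_corrupt(input_ids, mask_token_id, eps=1e-3, lam=0.05):
    """Sketch of the two-stage corruption in the docstring above.
    `lam` stands in for the rate lambda; the repo's mapping from
    `tok_mask_half_life_ratio` to lambda is not shown in this diff.
    Returns (noisy_inputs, masked_indices, p_mask).
    """
    B, L = input_ids.shape
    noisy = input_ids.clone()
    masked = torch.zeros(B, L, dtype=torch.bool)
    p_mask = torch.empty(B, L)
    for b in range(B):
        # Stage 1: masking budget k from m ~ U(eps, 1) (exact budget).
        m = eps + (1 - eps) * torch.rand(()).item()
        k = max(1, round(m * L))
        # Stage 2: draw exactly k positions; weights grow with position i,
        # so sampling is late-heavy when m -> 0 and near-uniform when m -> 1.
        i = torch.arange(L, dtype=torch.float)
        w = torch.exp(lam * (1 - m) * i)
        idx = torch.multinomial(w, k, replacement=False)
        masked[b, idx] = True
        noisy[b, idx] = mask_token_id
        p_mask[b] = m  # per-block mask probability, as the docstring notes
    return noisy, masked, p_mask

ids = torch.randint(0, 100, (2, 32))
noisy, masked, p = two_stage_corrupt(ids, mask_token_id=99)
print(masked.float().mean(dim=1))  # roughly m per row
```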
 
         masked_indices: Optional[torch.Tensor] = None,
         p_mask: Optional[torch.Tensor] = None,
         loss_mask: Optional[torch.Tensor] = None,
         **kwargs,
     ) -> CausalLMOutputWithPast:
 
+        batch_size, seq_len = input_ids.shape
 
+        if self.config.dlm_paradigm == 'bidirectional':
+            if labels is not None and torch.rand(1) < self.config.random_length_prob:
+                random_length = torch.randint(2, input_ids.shape[1] + 1, (1,))
+                input_ids = input_ids[:, :random_length]
+                labels = labels[:, :random_length]
+
+                if attention_mask is not None:
+                    attention_mask = attention_mask[:, :random_length]
+                if position_ids is not None:
+                    position_ids = position_ids[:, :random_length]
+
+        elif self.config.dlm_paradigm == 'prefix_bidirectional':
+            if labels is not None and split_len is None:
+                if torch.rand(1) < self.config.random_length_prob:
+                    split_len = torch.randint(1, seq_len//64, (1,)).item() * 64 ## [64, seq_len] divisible by 64
+                else:
+                    split_len = int(seq_len * self.config.prefix_ratio)
 
+        elif self.config.dlm_paradigm == 'efficient_block_diff':
+            if labels is not None and block_size is None:
+                if torch.rand(1) < self.config.random_length_prob:
+                    block_size = torch.randint(1, 8, (1,)).item() * 4 ## [4, 32] divisible by 4
+                else:
+                    block_size = self.config.block_size
 
+        elif self.config.dlm_paradigm == 'block_diff':
+            if labels is not None and block_size is None:
+                if torch.rand(1) < self.config.random_length_prob:
+                    block_size = torch.randint(1, 8, (1,)).item() * 4 ## [4, 32] divisible by 4
+                else:
+                    block_size = self.config.block_size
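One nit on the inline comments above: `torch.randint`'s upper bound is exclusive, so `torch.randint(1, 8, (1,)).item() * 4` draws from {4, 8, …, 28} rather than [4, 32], and the prefix split similarly tops out at `seq_len - 64`. A quick check:

```python
import torch

# randint's high bound is exclusive, so this yields {4, 8, ..., 28}, not 32.
sizes = sorted({torch.randint(1, 8, (1,)).item() * 4 for _ in range(10_000)})
print(sizes)  # [4, 8, 12, 16, 20, 24, 28]
```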
 
+        if labels is not None and self.config.dlm_paradigm != 'autoregressive':
+            if masked_indices is not None:
+                #assert p_mask is not None
 
+                if loss_mask is not None:
+                    masked_indices[loss_mask == 0] = 0
+
+                noisy_inputs = torch.where(masked_indices, self.mask_token_id, input_ids)
 
             else:
+                if self.config.tok_mask_half_life_ratio is not None:
+                    noisy_inputs, masked_indices, p_mask = self.forward_process_exp(input_ids, eps=eps, block_size=block_size, half_life_ratio=self.config.tok_mask_half_life_ratio, loss_mask=loss_mask)
+                else:
+                    noisy_inputs, masked_indices, p_mask = self.forward_process(input_ids, eps=eps, block_size=block_size, loss_mask=loss_mask)
 
+        else:
+            noisy_inputs = input_ids
+            masked_indices = None
+            p_mask = None
+
+        if self.config.dlm_paradigm in ['prefix_bidirectional', 'efficient_block_diff', 'block_diff']:
+            for layer in self.encoder.layers:
+                if hasattr(layer.self_attn, 'set_attention_mode'):
+                    layer.self_attn.set_attention_mode(self.config.dlm_paradigm, prefix_len=split_len, block_size=block_size)
 
+        input_ids_len = noisy_inputs.shape[1]
+        if labels is not None and self.config.dlm_paradigm == 'block_diff':
+            if position_ids is None:
+                position_ids = torch.arange(input_ids_len, device=noisy_inputs.device).unsqueeze(0)
+            noisy_inputs = torch.cat([noisy_inputs, input_ids], dim=1)
 
+        if block_diff_ppl:
+            if position_ids is None:
+                position_ids = torch.arange(input_ids_len // 2, device=noisy_inputs.device).unsqueeze(0)
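For 'block_diff' training, the code above feeds the encoder the noisy sequence concatenated with its clean copy, while `position_ids` spans only the first half; the encoder presumably reuses those positions for the clean half (that logic is outside this diff), and logits are truncated back to the first half afterwards. A shape-only sketch (values arbitrary, not the repo's code):

```python
import torch

# Shape-only sketch of the [noisy || clean] layout used for 'block_diff'.
B, L, mask_token_id = 2, 8, 0
input_ids = torch.randint(1, 50, (B, L))
masked = torch.rand(B, L) < 0.3
noisy = input_ids.masked_fill(masked, mask_token_id)

position_ids = torch.arange(L).unsqueeze(0)          # covers only the noisy half
model_input = torch.cat([noisy, input_ids], dim=1)   # (B, 2L)
print(model_input.shape, position_ids.shape)
# Logits are later truncated back to the first half: logits = logits[:, :L]
```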
 
         enc_out = self.encoder(
             past_key_values=past_key_values,
             input_ids=noisy_inputs,
             attention_mask=attention_mask,
             position_ids=position_ids,
             is_training=(labels is not None) or (block_diff_ppl),
 
         logits = logits[:, :input_ids_len]
 
         loss = None
+        # if labels is not None:
+        #     if self.config.dlm_paradigm == 'autoregressive':
+        #         shift_logits = logits[..., :-1, :].contiguous()
+        #         shift_labels = labels[..., 1:].contiguous()
 
+        #         if loss_mask is None:
+        #             loss_fct = CrossEntropyLoss()
+        #             shift_logits = shift_logits.view(-1, shift_logits.size(-1))
+        #             shift_labels = shift_labels.view(-1)
+        #             loss = loss_fct(shift_logits, shift_labels)
+
+        #         else:
+        #             loss_mask = loss_mask[..., 1:].contiguous()
+
+        #             loss_fct = CrossEntropyLoss(reduction='none')
+        #             shift_logits = shift_logits.view(-1, shift_logits.size(-1))
+        #             shift_labels = shift_labels.view(-1)
+        #             shift_labels = shift_labels.to(shift_logits.device)
 
+        #             token_losses = loss_fct(shift_logits, shift_labels)
 
+        #             loss = token_losses[loss_mask].sum() / loss_mask.sum()
+
+        #     else:
+        #         # Handle DREAM vs LLADA style losses
+        #         if hasattr(self.config, 'dlm_type') and self.config.dlm_type == 'dream':
+        #             logits = logits[..., :-1, :].contiguous()
+        #             labels = labels[..., 1:].contiguous()
+        #             masked_indices = masked_indices[:, 1:]
+        #             p_mask = p_mask[:, 1:]
+
+        #             # Calculate token-wise cross entropy loss for masked positions in B
+        #             token_loss = torch.nn.functional.cross_entropy(
+        #                 logits[masked_indices],
+        #                 labels[masked_indices],
+        #                 reduction='none'
+        #             ) / p_mask[masked_indices]
 
+        #             loss = token_loss.sum() / masked_indices.sum()
 
         return CausalLMOutputWithPast(
             loss=loss if not is_teacher else logits,
             logits=logits,
             past_key_values=enc_out.past_key_values,
+            hidden_states=None,
             attentions=None,
         )
 
 
+    def generate(self, prompt_ids, max_new_tokens, steps, block_length, shift_logits, threshold):
         out_ids, nfe = generate_with_prefix_cache_block_diff(
             model=self,
             prompt=prompt_ids,
 
             mask_id=self.mask_token_id,
             threshold=threshold,
             shift_logits=shift_logits,
             neg_entropy=False,
         )
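Note the updated `generate` signature drops `temperature`; decoding is now governed by the confidence `threshold` alone. A hypothetical call under the new signature, assuming an already-loaded `model` and a tokenized `prompt_ids` tensor (argument values are illustrative, not prescribed by the repo):

```python
# Hypothetical usage of the updated signature (no `temperature` argument).
# `model` and `prompt_ids` are assumed to be prepared beforehand.
out_ids, nfe = model.generate(
    prompt_ids,            # (1, prompt_len) LongTensor on the model's device
    max_new_tokens=256,
    steps=256,             # diffusion refinement steps
    block_length=32,       # decoded block size
    shift_logits=False,
    threshold=0.9,         # confidence threshold for committing tokens
)
# Per the wrapper above, this returns the generated ids plus the number of
# function evaluations (NFE).
```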
 
modeling_qwen3.py CHANGED
@@ -35,14 +35,8 @@ from transformers.modeling_outputs import (
 from transformers.modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
 from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
 from transformers.processing_utils import Unpack
-from transformers.utils import auto_docstring, can_return_tuple, is_torch_flex_attn_available, logging
-try:
-    from transformers.utils import TransformersKwargs
-except ImportError:
-    from typing import TypedDict
-    class TransformersKwargs(TypedDict, total=False):
-        pass
-from .configuration_edlm import EfficientDLMConfig
 
 
 if is_torch_flex_attn_available():
@@ -166,7 +160,7 @@ def eager_attention_forward(
 class Qwen3Attention(nn.Module):
     """Multi-headed attention from 'Attention Is All You Need' paper"""
 
-    def __init__(self, config: EfficientDLMConfig, layer_idx: int):
         super().__init__()
         self.config = config
 
@@ -312,7 +306,7 @@ class Qwen3Attention(nn.Module):
 
 
 class Qwen3DecoderLayer(GradientCheckpointingLayer):
-    def __init__(self, config: EfficientDLMConfig, layer_idx: int):
         super().__init__()
         self.hidden_size = config.hidden_size
         if hasattr(config, 'attn_class'):
@@ -383,7 +377,7 @@ class Qwen3DecoderLayer(GradientCheckpointingLayer):
 
 @auto_docstring
 class Qwen3PreTrainedModel(PreTrainedModel):
-    config_class = EfficientDLMConfig
     base_model_prefix = "model"
     supports_gradient_checkpointing = True
     _no_split_modules = ["Qwen3DecoderLayer"]
@@ -411,7 +405,7 @@ class Qwen3PreTrainedModel(PreTrainedModel):
 
 
 class Qwen3RotaryEmbedding(nn.Module):
-    def __init__(self, config: EfficientDLMConfig, device=None):
         super().__init__()
         # BC: "rope_type" was originally "type"
         if hasattr(config, "rope_scaling") and config.rope_scaling is not None:
@@ -446,7 +440,7 @@ class Qwen3RotaryEmbedding(nn.Module):
 
 @auto_docstring
 class Qwen3Model(Qwen3PreTrainedModel):
-    def __init__(self, config: EfficientDLMConfig):
         super().__init__(config)
         self.config = config
 
@@ -696,7 +690,7 @@ class Qwen3Model(Qwen3PreTrainedModel):
         dtype: torch.dtype,
         cache_position: torch.Tensor,
         batch_size: int,
-        config: EfficientDLMConfig,
         past_key_values: Cache,
     ):
         """
@@ -716,7 +710,7 @@ class Qwen3Model(Qwen3PreTrainedModel):
             Indices depicting the position of the input sequence tokens in the sequence.
         batch_size (`torch.Tensor`):
             Batch size.
-        config (`EfficientDLMConfig`):
             The model's configuration class
         past_key_values (`Cache`):
             The cache class that is being used currently to generate
 
 from transformers.modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
 from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
 from transformers.processing_utils import Unpack
+from transformers.utils import TransformersKwargs, auto_docstring, can_return_tuple, is_torch_flex_attn_available, logging
+from .configuration_nvrdiff import NVRDiffConfig
 
 
 if is_torch_flex_attn_available():
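This import block replaces the try/except fallback deleted on the left, so the file now hard-requires a transformers release that exports `TransformersKwargs`. For reference, the removed guard (reproduced from the deleted lines) looked like this and could be restored if older transformers versions must be supported:

```python
from transformers.utils import auto_docstring, can_return_tuple, is_torch_flex_attn_available, logging

try:
    from transformers.utils import TransformersKwargs
except ImportError:
    # Stand-in for older transformers releases that predate TransformersKwargs.
    from typing import TypedDict

    class TransformersKwargs(TypedDict, total=False):
        pass
```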
 
 class Qwen3Attention(nn.Module):
     """Multi-headed attention from 'Attention Is All You Need' paper"""
 
+    def __init__(self, config: NVRDiffConfig, layer_idx: int):
         super().__init__()
         self.config = config
 
 
 class Qwen3DecoderLayer(GradientCheckpointingLayer):
+    def __init__(self, config: NVRDiffConfig, layer_idx: int):
         super().__init__()
         self.hidden_size = config.hidden_size
         if hasattr(config, 'attn_class'):
 
 
 @auto_docstring
 class Qwen3PreTrainedModel(PreTrainedModel):
+    config_class = NVRDiffConfig
     base_model_prefix = "model"
     supports_gradient_checkpointing = True
     _no_split_modules = ["Qwen3DecoderLayer"]
 
 
 class Qwen3RotaryEmbedding(nn.Module):
+    def __init__(self, config: NVRDiffConfig, device=None):
         super().__init__()
         # BC: "rope_type" was originally "type"
         if hasattr(config, "rope_scaling") and config.rope_scaling is not None:
 
 
 @auto_docstring
 class Qwen3Model(Qwen3PreTrainedModel):
+    def __init__(self, config: NVRDiffConfig):
         super().__init__(config)
         self.config = config
 
 
         dtype: torch.dtype,
         cache_position: torch.Tensor,
         batch_size: int,
+        config: NVRDiffConfig,
         past_key_values: Cache,
     ):
         """
 
             Indices depicting the position of the input sequence tokens in the sequence.
         batch_size (`torch.Tensor`):
             Batch size.
+        config (`NVRDiffConfig`):
             The model's configuration class
         past_key_values (`Cache`):
             The cache class that is being used currently to generate