Instructions to use nvidia/Efficient-DLM-4B with libraries, inference providers, notebooks, and local apps. Follow these links to get started.

Libraries

How to use nvidia/Efficient-DLM-4B with Transformers:

# High-level route: a text-generation pipeline wraps the model behind a single call
from transformers import pipeline

generator = pipeline(
    task="text-generation",
    model="nvidia/Efficient-DLM-4B",
    # Allow Transformers to run the modeling code bundled with the checkpoint
    trust_remote_code=True,
)
# Single-turn conversation in the role/content chat format
chat = [{"role": "user", "content": "Who are you?"}]
generator(chat)

# Lower-level route: load the tokenizer and model explicitly and call generate() yourself
from transformers import AutoTokenizer, AutoModel

repo_id = "nvidia/Efficient-DLM-4B"
# trust_remote_code=True lets Transformers run the code bundled with the checkpoint
tokenizer = AutoTokenizer.from_pretrained(repo_id, trust_remote_code=True)
model = AutoModel.from_pretrained(repo_id, trust_remote_code=True)

# Single-turn conversation, rendered through the tokenizer's chat template
chat = [{"role": "user", "content": "Who are you?"}]
encoded = tokenizer.apply_chat_template(
    chat,
    tokenize=True,
    add_generation_prompt=True,
    return_tensors="pt",
    return_dict=True,
)
# Keep the input tensors on the same device as the model
encoded = encoded.to(model.device)

# Remember how long the prompt is so only the continuation gets decoded
prompt_length = encoded["input_ids"].shape[-1]
generated = model.generate(**encoded, max_new_tokens=40)
print(tokenizer.decode(generated[0][prompt_length:]))

Notebooks
Google Colab
Kaggle
Local Apps

vLLM

How to use nvidia/Efficient-DLM-4B with vLLM:

Install from pip and serve model

# Quick start: install vLLM, serve the model, then query it over HTTP.
# Install vLLM from pip:
pip install vllm
# Start the vLLM server:
vllm serve "nvidia/Efficient-DLM-4B"
# Call the server using curl (OpenAI-compatible API):
# The request targets localhost:8000, so send it from a second shell while the
# server started above is still running. The "model" field in the JSON body
# repeats the model name that was passed to `vllm serve`.
curl -X POST "http://localhost:8000/v1/chat/completions" \
	-H "Content-Type: application/json" \
	--data '{
		"model": "nvidia/Efficient-DLM-4B",
		"messages": [
			{
				"role": "user",
				"content": "What is the capital of France?"
			}
		]
	}'

Use Docker

docker model run hf.co/nvidia/Efficient-DLM-4B

SGLang

How to use nvidia/Efficient-DLM-4B with SGLang:

Install from pip and serve model

# Quick start: install SGLang, launch its server, then query it over HTTP.
# Install SGLang from pip:
pip install sglang
# Start the SGLang server:
# --host 0.0.0.0 binds every network interface; --port must match the port
# used in the curl URL below (30000).
python3 -m sglang.launch_server \
    --model-path "nvidia/Efficient-DLM-4B" \
    --host 0.0.0.0 \
    --port 30000
# Call the server using curl (OpenAI-compatible API):
# Send this from a second shell while the server above is still running.
curl -X POST "http://localhost:30000/v1/chat/completions" \
	-H "Content-Type: application/json" \
	--data '{
		"model": "nvidia/Efficient-DLM-4B",
		"messages": [
			{
				"role": "user",
				"content": "What is the capital of France?"
			}
		]
	}'

Use Docker images

# Launch the SGLang server from the lmsysorg/sglang:latest image.
#   -p 30000:30000  publishes the server port that the curl call below uses
#   -v ...          mounts the local Hugging Face cache into the container
#   --env HF_TOKEN  <secret> is a placeholder; substitute your own access token
docker run --gpus all \
    --shm-size 32g \
    -p 30000:30000 \
    -v ~/.cache/huggingface:/root/.cache/huggingface \
    --env "HF_TOKEN=<secret>" \
    --ipc=host \
    lmsysorg/sglang:latest \
    python3 -m sglang.launch_server \
        --model-path "nvidia/Efficient-DLM-4B" \
        --host 0.0.0.0 \
        --port 30000
# Call the server using curl (OpenAI-compatible API):
# Send this from a second shell once the container is up.
curl -X POST "http://localhost:30000/v1/chat/completions" \
	-H "Content-Type: application/json" \
	--data '{
		"model": "nvidia/Efficient-DLM-4B",
		"messages": [
			{
				"role": "user",
				"content": "What is the capital of France?"
			}
		]
	}'

Docker Model Runner
How to use nvidia/Efficient-DLM-4B with Docker Model Runner:
```
docker model run hf.co/nvidia/Efficient-DLM-4B
```

YongganFu committed on Sep 4, 2025

Commit

b4c3108

verified ·

1 Parent(s): ad6359b

Upload model

Browse files

Files changed (2) hide show

chat_utils.py +60 -34
modeling_nvrdiff.py +2 -2

chat_utils.py CHANGED Viewed

@@ -1,54 +1,75 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-import os
-import sys
-import argparse
-import random
 import numpy as np
 import torch
 import torch.nn.functional as F
-from transformers import AutoTokenizer
-def get_transfer_index(
-    logits, temperature, remasking, mask_index, x, num_transfer_tokens, threshold=None, neg_entropy=False
-):
-    x0 = torch.argmax(logits, dim=-1)  # (B, L)
-    if remasking == "low_confidence":
         p = F.softmax(logits, dim=-1)
-        x0_p = torch.squeeze(torch.gather(p, dim=-1, index=torch.unsqueeze(x0, -1)), -1)
-    elif remasking == "random":
         x0_p = torch.rand((x0.shape[0], x0.shape[1]), device=x0.device)
     else:
         raise NotImplementedError(remasking)
     if neg_entropy:
         p = F.softmax(logits, dim=-1)
         epsilon = 1e-10
         log_probs = torch.log(p + epsilon)
-        confidence_scores = torch.sum(p * log_probs, dim=-1)
     else:
         confidence_scores = x0_p
     x0 = torch.where(mask_index, x0, x)
-    confidence = torch.where(mask_index, confidence_scores, torch.tensor(float("-inf"), device=x0.device))
     transfer_index = torch.zeros_like(x0, dtype=torch.bool, device=x0.device)
     if threshold is not None:
         num_transfer_tokens = mask_index.sum(dim=1, keepdim=True)
     for j in range(confidence.shape[0]):
-        k = int(num_transfer_tokens[j])
-        k = max(k, 1)
-        _, select_index = torch.topk(confidence[j], k=k)
         transfer_index[j, select_index] = True
         if threshold is not None:
-            for kk in range(k):
-                if confidence[j, select_index[kk]] < threshold:
-                    transfer_index[j, select_index[kk]] = False
     return x0, transfer_index
@@ -62,20 +83,20 @@ def get_num_transfer_tokens(mask_index, steps: int):
     return num_transfer_tokens
 @torch.no_grad()
 def generate_with_prefix_cache_block_diff(
     model,
     prompt,
     steps=128,
     gen_length=128,
-    block_length=32,
     temperature=0.,
     remasking='low_confidence',
-    mask_id=151662,
     threshold=None,
-    shift_logits=True,
-    neg_entropy=True
 ):
     dream_style=shift_logits
     # Initialize the accumulator
@@ -114,7 +135,12 @@ def generate_with_prefix_cache_block_diff(
         # Build the initial mask for this block
         mask_block_idx0 = (x_accum[:, block_slice] == mask_id)  # (B, Lb)
-        schedule_mask = mask_block_idx0
         num_transfer_tokens = get_num_transfer_tokens(schedule_mask, steps_per_block)  # (B, steps)

 import numpy as np
 import torch
 import torch.nn.functional as F
+def get_transfer_index(logits, temperature, remasking, mask_index, x, num_transfer_tokens, threshold=None,neg_entropy=False):
+    x0 = torch.argmax(logits, dim=-1) # b, l
+    if remasking == 'low_confidence':
+        # p = F.softmax(logits.to(torch.float64), dim=-1)
         p = F.softmax(logits, dim=-1)
+        x0_p = torch.squeeze(
+            torch.gather(p, dim=-1, index=torch.unsqueeze(x0, -1)), -1) # b, l
+    elif remasking == 'top_p_margin':
+        # Compute probabilities
+        p = F.softmax(logits, dim=-1)                       # (B, L, V)
+        # Top-2 per position
+        top2 = torch.topk(p, k=2, dim=-1).values            # (B, L, 2)
+        margin = top2[..., 0] - top2[..., 1]                # (B, L)
+        # Normalize margin to [0,1] over MASKED positions per row
+        plus_inf  = torch.full_like(margin, float('inf'))
+        minus_inf = torch.full_like(margin, float('-inf'))
+        masked_for_min = torch.where(mask_index, margin, plus_inf)
+        masked_for_max = torch.where(mask_index, margin, minus_inf)
+        row_min = masked_for_min.amin(dim=1, keepdim=True)  # (B, 1)
+        row_max = masked_for_max.amax(dim=1, keepdim=True)  # (B, 1)
+        denom = (row_max - row_min)
+        # If denom==0 (all equal), set normalized=1 on masked; 0 elsewhere by default
+        normalized = torch.zeros_like(margin)
+        nonzero = denom > 0
+        normalized = torch.where(
+            mask_index & nonzero,
+            (margin - row_min) / (denom + 1e-12),
+            normalized
+        )
+        normalized = torch.where(
+            mask_index & (~nonzero),
+            torch.ones_like(normalized),
+            normalized
+        )
+        x0_p = normalized  # ∈ [0,1] on masked positions
+    elif remasking == 'random':
         x0_p = torch.rand((x0.shape[0], x0.shape[1]), device=x0.device)
     else:
         raise NotImplementedError(remasking)
+    # Calculate negative entropy if requested
     if neg_entropy:
+        # p = F.softmax(logits.to(torch.float64), dim=-1)
         p = F.softmax(logits, dim=-1)
         epsilon = 1e-10
         log_probs = torch.log(p + epsilon)
+        confidence_scores = torch.sum(p * log_probs, dim=-1)  # negative entropy per position
     else:
         confidence_scores = x0_p
     x0 = torch.where(mask_index, x0, x)
+    confidence = torch.where(mask_index, confidence_scores, -np.inf)
     transfer_index = torch.zeros_like(x0, dtype=torch.bool, device=x0.device)
     if threshold is not None:
         num_transfer_tokens = mask_index.sum(dim=1, keepdim=True)
+    # print(f'confidence: {confidence}')
     for j in range(confidence.shape[0]):
+        _, select_index = torch.topk(confidence[j], k=num_transfer_tokens[j])
         transfer_index[j, select_index] = True
         if threshold is not None:
+            for k in range(1, num_transfer_tokens[j]):
+                if confidence[j, select_index[k]] < threshold:
+                    transfer_index[j, select_index[k]] = False
     return x0, transfer_index
     return num_transfer_tokens
 @torch.no_grad()
 def generate_with_prefix_cache_block_diff(
     model,
     prompt,
     steps=128,
     gen_length=128,
+    block_length=128,
     temperature=0.,
     remasking='low_confidence',
+    mask_id=126336,
     threshold=None,
+    factor=None,
+    shift_logits=False,
+    neg_entropy=False,
 ):
     dream_style=shift_logits
     # Initialize the accumulator
         # Build the initial mask for this block
         mask_block_idx0 = (x_accum[:, block_slice] == mask_id)  # (B, Lb)
+        # Precompute the transfer schedule for this block
+        if dream_style:
+            # still denoise *all* positions (0..Lb-1), since none are seeded
+            schedule_mask = mask_block_idx0
+        else:
+            schedule_mask = mask_block_idx0
         num_transfer_tokens = get_num_transfer_tokens(schedule_mask, steps_per_block)  # (B, steps)

modeling_nvrdiff.py CHANGED Viewed

@@ -546,7 +546,7 @@ class DiffEncoderModel(Qwen3PreTrainedModel, GenerationMixin):
                         mask_id=self.mask_token_id,
                         threshold=threshold,
                         shift_logits=True,
-                        neg_entropy=True,
                     )
         return out_ids, nfe

                         mask_id=self.mask_token_id,
                         threshold=threshold,
                         shift_logits=True,
+                        neg_entropy=False,
                     )
         return out_ids, nfe