Instructions to use nvidia/Efficient-DLM-4B with libraries, inference providers, notebooks, and local apps. Follow these links to get started.

Libraries

How to use nvidia/Efficient-DLM-4B with Transformers:

# Use a pipeline as a high-level helper
from transformers import pipeline

# Build the text-generation pipeline. trust_remote_code=True lets Transformers
# run the custom modeling code that ships inside the model repository.
pipe = pipeline(
    task="text-generation",
    model="nvidia/Efficient-DLM-4B",
    trust_remote_code=True,
)
# A single-turn conversation expressed in chat-message format.
messages = [{"role": "user", "content": "Who are you?"}]
pipe(messages)

# Load model directly
from transformers import AutoTokenizer, AutoModel

# Both loaders need trust_remote_code=True because the repository ships its
# own modeling code.
tokenizer = AutoTokenizer.from_pretrained("nvidia/Efficient-DLM-4B", trust_remote_code=True)
model = AutoModel.from_pretrained("nvidia/Efficient-DLM-4B", trust_remote_code=True)
messages = [{"role": "user", "content": "Who are you?"}]
# Render the conversation with the chat template and tokenize it to tensors.
inputs = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    return_tensors="pt",
)
# Move the encoded prompt to wherever the model weights live.
inputs = inputs.to(model.device)

outputs = model.generate(**inputs, max_new_tokens=40)
# The returned sequence starts with the prompt; decode only what follows it.
prompt_length = inputs["input_ids"].shape[-1]
print(tokenizer.decode(outputs[0][prompt_length:]))

Notebooks
Google Colab
Kaggle
Local Apps

vLLM

How to use nvidia/Efficient-DLM-4B with vLLM:

Install from pip and serve model

# Install vLLM from pip:
pip install vllm
# Start the vLLM server:
vllm serve "nvidia/Efficient-DLM-4B"
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:8000/v1/chat/completions" \
	-H "Content-Type: application/json" \
	--data '{
		"model": "nvidia/Efficient-DLM-4B",
		"messages": [
			{
				"role": "user",
				"content": "What is the capital of France?"
			}
		]
	}'

Use Docker

docker model run hf.co/nvidia/Efficient-DLM-4B

SGLang

How to use nvidia/Efficient-DLM-4B with SGLang:

Install from pip and serve model

# Install SGLang from pip:
pip install sglang
# Start the SGLang server:
python3 -m sglang.launch_server \
    --model-path "nvidia/Efficient-DLM-4B" \
    --host 0.0.0.0 \
    --port 30000
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
	-H "Content-Type: application/json" \
	--data '{
		"model": "nvidia/Efficient-DLM-4B",
		"messages": [
			{
				"role": "user",
				"content": "What is the capital of France?"
			}
		]
	}'

Use Docker images

docker run --gpus all \
    --shm-size 32g \
    -p 30000:30000 \
    -v ~/.cache/huggingface:/root/.cache/huggingface \
    --env "HF_TOKEN=<secret>" \
    --ipc=host \
    lmsysorg/sglang:latest \
    python3 -m sglang.launch_server \
        --model-path "nvidia/Efficient-DLM-4B" \
        --host 0.0.0.0 \
        --port 30000
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
	-H "Content-Type: application/json" \
	--data '{
		"model": "nvidia/Efficient-DLM-4B",
		"messages": [
			{
				"role": "user",
				"content": "What is the capital of France?"
			}
		]
	}'

Docker Model Runner
How to use nvidia/Efficient-DLM-4B with Docker Model Runner:
```
docker model run hf.co/nvidia/Efficient-DLM-4B
```

YongganFu commited on Sep 3, 2025

Commit

41d881f

verified ·

1 Parent(s): 8c7caf4

Upload model

Browse files

Files changed (2) hide show

chat_utils.py +196 -0
modeling_nvrdiff.py +44 -4

chat_utils.py ADDED Viewed

	@@ -0,0 +1,196 @@

+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+import os
+import sys
+import argparse
+import random
+import numpy as np
+import torch
+import torch.nn.functional as F
+from transformers import AutoTokenizer
+sys.path.insert(1, "/lustre/fsw/portfolios/nvr/users/yongganf/adlr-megatron-lm")
+from get_hf_model import get_torchtitan_model_sft  # noqa: E402
+# --------------------------- Reproducibility ----------------------------------
+def set_seed(seed: int = 42):
+    torch.manual_seed(seed)
+    random.seed(seed)
+    np.random.seed(seed)
+    torch.backends.cudnn.deterministic = True
+    torch.backends.cudnn.benchmark = False
+# -------------------- Diffusion helpers (unchanged logic) --------------------
+def get_transfer_index(
+    logits, temperature, remasking, mask_index, x, num_transfer_tokens, threshold=None, neg_entropy=False
+):
+    x0 = torch.argmax(logits, dim=-1)  # (B, L)
+    if remasking == "low_confidence":
+        p = F.softmax(logits, dim=-1)
+        x0_p = torch.squeeze(torch.gather(p, dim=-1, index=torch.unsqueeze(x0, -1)), -1)
+    elif remasking == "random":
+        x0_p = torch.rand((x0.shape[0], x0.shape[1]), device=x0.device)
+    else:
+        raise NotImplementedError(remasking)
+    if neg_entropy:
+        p = F.softmax(logits, dim=-1)
+        epsilon = 1e-10
+        log_probs = torch.log(p + epsilon)
+        confidence_scores = torch.sum(p * log_probs, dim=-1)
+    else:
+        confidence_scores = x0_p
+    x0 = torch.where(mask_index, x0, x)
+    confidence = torch.where(mask_index, confidence_scores, torch.tensor(float("-inf"), device=x0.device))
+    transfer_index = torch.zeros_like(x0, dtype=torch.bool, device=x0.device)
+    if threshold is not None:
+        num_transfer_tokens = mask_index.sum(dim=1, keepdim=True)
+    for j in range(confidence.shape[0]):
+        k = int(num_transfer_tokens[j])
+        k = max(k, 1)
+        _, select_index = torch.topk(confidence[j], k=k)
+        transfer_index[j, select_index] = True
+        if threshold is not None:
+            for kk in range(k):
+                if confidence[j, select_index[kk]] < threshold:
+                    transfer_index[j, select_index[kk]] = False
+    return x0, transfer_index
+def get_num_transfer_tokens(mask_index, steps: int):
+    mask_num = mask_index.sum(dim=1, keepdim=True)
+    base = mask_num // steps
+    remainder = mask_num % steps
+    num_transfer_tokens = torch.zeros(mask_num.size(0), steps, device=mask_index.device, dtype=torch.int64) + base
+    for i in range(mask_num.size(0)):
+        num_transfer_tokens[i, : int(remainder[i])] += 1
+    return num_transfer_tokens
+@torch.no_grad()
+def generate_with_prefix_cache_block_diff(
+    model,
+    prompt,
+    steps=128,
+    gen_length=128,
+    block_length=32,
+    temperature=0.,
+    remasking='low_confidence',
+    mask_id=151662,
+    threshold=None,
+    shift_logits=True,
+    neg_entropy=True
+):
+    dream_style=shift_logits
+    # Initialize the accumulator
+    x_accum = prompt.clone()
+    assert gen_length % block_length == 0
+    num_blocks = gen_length // block_length
+    assert steps % num_blocks == 0
+    steps_per_block = steps // num_blocks
+    nfe = 0
+    # Compute KV cache for the prompt initially
+    output = model(prompt, use_cache=True)
+    past_key_values = output.past_key_values
+    # For dream_style: store the "next token logit" of the context
+    next_logits_context = None
+    if dream_style:
+        next_logits_context = output.logits[:, -1:, :]  # (B, 1, V)
+    for num_block in range(num_blocks):
+        # Create a new block with mask tokens (no seeding)
+        mask_block = torch.ones(
+            (prompt.shape[0], block_length),
+            dtype=prompt.dtype,
+            device=prompt.device
+        ) * mask_id
+        # Append the block of masks
+        x_accum = torch.cat([x_accum, mask_block], dim=1)
+        current_block_start = prompt.size(1) + num_block * block_length
+        block_slice = slice(current_block_start, current_block_start + block_length)
+        # Build the initial mask for this block
+        mask_block_idx0 = (x_accum[:, block_slice] == mask_id)  # (B, Lb)
+        schedule_mask = mask_block_idx0
+        num_transfer_tokens = get_num_transfer_tokens(schedule_mask, steps_per_block)  # (B, steps)
+        # Denoise the current block
+        for i in range(steps_per_block):
+            mask_block_idx = (x_accum[:, block_slice] == mask_id)  # (B, Lb)
+            if mask_block_idx.sum() == 0:
+                break
+            nfe += 1
+            # Forward only the current noisy block using cached context
+            logits_block = model(
+                x_accum[:, block_slice],
+                past_key_values=past_key_values,
+                use_cache=False
+            ).logits
+            if dream_style:
+                # Align logits so that each masked position has a predictor:
+                # prepend context-next logit, then use logits_block[:-1]
+                if block_length == 1:
+                    logits_use = next_logits_context              # (B, 1, V)
+                else:
+                    logits_use = torch.cat(
+                        [next_logits_context, logits_block[:, :-1, :]],
+                        dim=1
+                    )  # (B, Lb, V)
+                mask_use = mask_block_idx                        # (B, Lb)
+                x_use   = x_accum[:, block_slice]                # (B, Lb)
+                x0, transfer_idx = get_transfer_index(
+                    logits_use, temperature, remasking, mask_use, x_use,
+                    num_transfer_tokens=num_transfer_tokens[:, i],
+                    threshold=threshold, neg_entropy=neg_entropy
+                )
+                cur = x_accum[:, block_slice].clone()
+                cur[transfer_idx] = x0[transfer_idx]
+                x_accum[:, block_slice] = cur
+            else:
+                # non-AR (same-position) case
+                x0, transfer_idx = get_transfer_index(
+                    logits_block, temperature, remasking, mask_block_idx,
+                    x_accum[:, block_slice],
+                    num_transfer_tokens=num_transfer_tokens[:, i],
+                    threshold=threshold, neg_entropy=neg_entropy
+                )
+                cur = x_accum[:, block_slice].clone()
+                cur[transfer_idx] = x0[transfer_idx]
+                x_accum[:, block_slice] = cur
+        # after block is fully denoised, update KV cache
+        output = model(
+            x_accum[:, block_slice],
+            past_key_values=past_key_values,
+            use_cache=True
+        )
+        past_key_values = output.past_key_values
+        if dream_style and num_block < num_blocks - 1:
+            # refresh context-next logit for the next block
+            next_logits_context = output.logits[:, -1:, :]  # (B, 1, V)
+    return x_accum, nfe

modeling_nvrdiff.py CHANGED Viewed

@@ -7,9 +7,6 @@ import torch.nn.functional as F
 from torch import nn
 from transformers.modeling_outputs import CausalLMOutputWithPast
-from .modeling_qwen3 import Qwen3Model, Qwen3PreTrainedModel, Qwen3Attention, apply_rotary_pos_emb, repeat_kv
-from .configuration_nvrdiff import NVRDiffConfig
 from torch.nn.attention.flex_attention import flex_attention, create_block_mask
 from transformers.modeling_flash_attention_utils import FlashAttentionKwargs
@@ -24,6 +21,10 @@ from transformers.generation import GenerationMixin
 import math
 # @torch.compile(dynamic=True, mode="reduce-overhead")
 # @torch.compile(mode="default")
 # @torch.compile(fullgraph=True, mode="reduce-overhead", dynamic=False)
@@ -532,4 +533,43 @@ class DiffEncoderModel(Qwen3PreTrainedModel, GenerationMixin):
             hidden_states=None,
             attentions=None,
         )

 from torch import nn
 from transformers.modeling_outputs import CausalLMOutputWithPast
 from torch.nn.attention.flex_attention import flex_attention, create_block_mask
 from transformers.modeling_flash_attention_utils import FlashAttentionKwargs
 import math
+from .modeling_qwen3 import Qwen3Model, Qwen3PreTrainedModel, Qwen3Attention, apply_rotary_pos_emb, repeat_kv
+from .configuration_nvrdiff import NVRDiffConfig
+from .chat_utils import generate_with_prefix_cache_block_diff
 # @torch.compile(dynamic=True, mode="reduce-overhead")
 # @torch.compile(mode="default")
 # @torch.compile(fullgraph=True, mode="reduce-overhead", dynamic=False)
             hidden_states=None,
             attentions=None,
         )
+    def chat(self, tokenizer, max_new_tokens, steps, block_length, threshold):
+        print("Stateless chat (type 'exit' to quit)")
+        print("------------------------------------")
+        try:
+            while True:
+                user_input = input("User: ").strip()
+                if user_input.lower() in {"exit", "quit", "q"}:
+                    print("Conversation ended.")
+                    break
+                prompt_ids = tokenizer(
+                    user_input,return_tensors='pt'
+                ).input_ids.to(device='cuda')
+                out_ids, nfe = generate_with_prefix_cache_block_diff(
+                    model=self,
+                    prompt=prompt_ids,
+                    gen_length=max_new_tokens,
+                    steps=steps,
+                    block_length=block_length,
+                    remasking="low_confidence",
+                    mask_id=self.mask_token_id,
+                    threshold=threshold,
+                    shift_logits=True,
+                    neg_entropy=True,
+                )
+                generated_tokens = out_ids[:, prompt_ids.shape[1]:]
+                tokenized_out = tokenizer.batch_decode(
+                    generated_tokens,
+                    skip_special_tokens=True
+                )[0]
+                print(f"Model: {tokenized_out}")
+                print(f"[nfe={nfe}]")
+        except KeyboardInterrupt:
+            print("\n[info] interrupted by user (Ctrl-C).")