Text Generation
Transformers
Safetensors
qwen3
feature-extraction
conversational
custom_code
text-generation-inference
Instructions to use nvidia/Efficient-DLM-4B with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use nvidia/Efficient-DLM-4B with Transformers:
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("text-generation", model="nvidia/Efficient-DLM-4B", trust_remote_code=True)
messages = [
    {"role": "user", "content": "Who are you?"},
]
pipe(messages)

# Load model directly
from transformers import AutoTokenizer, AutoModel

tokenizer = AutoTokenizer.from_pretrained("nvidia/Efficient-DLM-4B", trust_remote_code=True)
model = AutoModel.from_pretrained("nvidia/Efficient-DLM-4B", trust_remote_code=True)
messages = [
    {"role": "user", "content": "Who are you?"},
]
inputs = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    return_tensors="pt",
).to(model.device)

outputs = model.generate(**inputs, max_new_tokens=40)
print(tokenizer.decode(outputs[0][inputs["input_ids"].shape[-1]:]))
- Notebooks
- Google Colab
- Kaggle
- Local Apps Settings
- vLLM
How to use nvidia/Efficient-DLM-4B with vLLM:
Install from pip and serve model
# Install vLLM from pip:
pip install vllm

# Start the vLLM server:
vllm serve "nvidia/Efficient-DLM-4B"

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:8000/v1/chat/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "nvidia/Efficient-DLM-4B",
    "messages": [
      {
        "role": "user",
        "content": "What is the capital of France?"
      }
    ]
  }'
Use Docker
docker model run hf.co/nvidia/Efficient-DLM-4B
- SGLang
How to use nvidia/Efficient-DLM-4B with SGLang:
Install from pip and serve model
# Install SGLang from pip:
pip install sglang

# Start the SGLang server:
python3 -m sglang.launch_server \
  --model-path "nvidia/Efficient-DLM-4B" \
  --host 0.0.0.0 \
  --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "nvidia/Efficient-DLM-4B",
    "messages": [
      {
        "role": "user",
        "content": "What is the capital of France?"
      }
    ]
  }'
Use Docker images
docker run --gpus all \
  --shm-size 32g \
  -p 30000:30000 \
  -v ~/.cache/huggingface:/root/.cache/huggingface \
  --env "HF_TOKEN=<secret>" \
  --ipc=host \
  lmsysorg/sglang:latest \
  python3 -m sglang.launch_server \
    --model-path "nvidia/Efficient-DLM-4B" \
    --host 0.0.0.0 \
    --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "nvidia/Efficient-DLM-4B",
    "messages": [
      {
        "role": "user",
        "content": "What is the capital of France?"
      }
    ]
  }'
- Docker Model Runner
How to use nvidia/Efficient-DLM-4B with Docker Model Runner:
docker model run hf.co/nvidia/Efficient-DLM-4B
Upload model
Browse files
- chat_utils.py +0 -13
- modeling_nvrdiff.py +15 -38
chat_utils.py
CHANGED
|
@@ -11,20 +11,7 @@ import torch
|
|
| 11 |
import torch.nn.functional as F
|
| 12 |
from transformers import AutoTokenizer
|
| 13 |
|
| 14 |
-
sys.path.insert(1, "/lustre/fsw/portfolios/nvr/users/yongganf/adlr-megatron-lm")
|
| 15 |
-
from get_hf_model import get_torchtitan_model_sft # noqa: E402
|
| 16 |
|
| 17 |
-
|
| 18 |
-
# --------------------------- Reproducibility ----------------------------------
|
| 19 |
-
def set_seed(seed: int = 42):
|
| 20 |
-
torch.manual_seed(seed)
|
| 21 |
-
random.seed(seed)
|
| 22 |
-
np.random.seed(seed)
|
| 23 |
-
torch.backends.cudnn.deterministic = True
|
| 24 |
-
torch.backends.cudnn.benchmark = False
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
# -------------------- Diffusion helpers (unchanged logic) --------------------
|
| 28 |
def get_transfer_index(
|
| 29 |
logits, temperature, remasking, mask_index, x, num_transfer_tokens, threshold=None, neg_entropy=False
|
| 30 |
):
|
|
|
|
| 11 |
import torch.nn.functional as F
|
| 12 |
from transformers import AutoTokenizer
|
| 13 |
|
|
|
|
|
|
|
| 14 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 15 |
def get_transfer_index(
|
| 16 |
logits, temperature, remasking, mask_index, x, num_transfer_tokens, threshold=None, neg_entropy=False
|
| 17 |
):
|
modeling_nvrdiff.py
CHANGED
|
@@ -535,41 +535,18 @@ class DiffEncoderModel(Qwen3PreTrainedModel, GenerationMixin):
|
|
| 535 |
)
|
| 536 |
|
| 537 |
|
| 538 |
-
def
|
| 539 |
-
|
| 540 |
-
|
| 541 |
-
|
| 542 |
-
|
| 543 |
-
|
| 544 |
-
|
| 545 |
-
|
| 546 |
-
|
| 547 |
-
|
| 548 |
-
|
| 549 |
-
|
| 550 |
-
|
| 551 |
-
|
| 552 |
-
|
| 553 |
-
out_ids, nfe = generate_with_prefix_cache_block_diff(
|
| 554 |
-
model=self,
|
| 555 |
-
prompt=prompt_ids,
|
| 556 |
-
gen_length=max_new_tokens,
|
| 557 |
-
steps=steps,
|
| 558 |
-
block_length=block_length,
|
| 559 |
-
remasking="low_confidence",
|
| 560 |
-
mask_id=self.mask_token_id,
|
| 561 |
-
threshold=threshold,
|
| 562 |
-
shift_logits=True,
|
| 563 |
-
neg_entropy=True,
|
| 564 |
-
)
|
| 565 |
-
|
| 566 |
-
generated_tokens = out_ids[:, prompt_ids.shape[1]:]
|
| 567 |
-
tokenized_out = tokenizer.batch_decode(
|
| 568 |
-
generated_tokens,
|
| 569 |
-
skip_special_tokens=True
|
| 570 |
-
)[0]
|
| 571 |
-
print(f"Model: {tokenized_out}")
|
| 572 |
-
print(f"[nfe={nfe}]")
|
| 573 |
-
|
| 574 |
-
except KeyboardInterrupt:
|
| 575 |
-
print("\n[info] interrupted by user (Ctrl-C).")
|
|
|
|
| 535 |
)
|
| 536 |
|
| 537 |
|
| 538 |
+
def generate(self, prompt_ids, max_new_tokens, steps, block_length, threshold):
|
| 539 |
+
out_ids, nfe = generate_with_prefix_cache_block_diff(
|
| 540 |
+
model=self,
|
| 541 |
+
prompt=prompt_ids,
|
| 542 |
+
gen_length=max_new_tokens,
|
| 543 |
+
steps=steps,
|
| 544 |
+
block_length=block_length,
|
| 545 |
+
remasking="low_confidence",
|
| 546 |
+
mask_id=self.mask_token_id,
|
| 547 |
+
threshold=threshold,
|
| 548 |
+
shift_logits=True,
|
| 549 |
+
neg_entropy=True,
|
| 550 |
+
)
|
| 551 |
+
|
| 552 |
+
return out_ids, nfe
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|