Instructions to use JetLM/SDAR-1.7B-Chat with libraries, inference providers, notebooks, and local apps. Follow these links to get started.

Libraries

How to use JetLM/SDAR-1.7B-Chat with Transformers:

# Option 1: use a pipeline as a high-level helper.
from transformers import pipeline

# trust_remote_code=True lets Transformers run the custom Python model code shipped in
# the model repository (this model's implementation lives in modeling_sdar.py), so only
# enable it for repositories you trust.
pipe = pipeline("text-generation", model="JetLM/SDAR-1.7B-Chat", trust_remote_code=True)
# Chat-style input: a list of message dicts, each with a "role" and a "content" key.
messages = [
    {"role": "user", "content": "Who are you?"},
]
# NOTE: the return value is not captured here; assign it (e.g. `result = pipe(messages)`)
# when running outside an interactive session that echoes the last expression.
pipe(messages)

# Option 2: load the model directly (independent of the pipeline example above).
from transformers import AutoModelForCausalLM
# dtype="auto" presumably defers the weight dtype to what the checkpoint declares rather
# than forcing one — confirm against the from_pretrained docs for your transformers version.
# trust_remote_code=True is needed here for the same reason as in the pipeline example.
model = AutoModelForCausalLM.from_pretrained("JetLM/SDAR-1.7B-Chat", trust_remote_code=True, dtype="auto")

Notebooks
Google Colab
Kaggle
Local Apps

vLLM

How to use JetLM/SDAR-1.7B-Chat with vLLM:

Install from pip and serve model

# Install vLLM from pip:
pip install vllm
# Start the vLLM server:
vllm serve "JetLM/SDAR-1.7B-Chat"
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:8000/v1/chat/completions" \
	-H "Content-Type: application/json" \
	--data '{
		"model": "JetLM/SDAR-1.7B-Chat",
		"messages": [
			{
				"role": "user",
				"content": "What is the capital of France?"
			}
		]
	}'

Use Docker

docker model run hf.co/JetLM/SDAR-1.7B-Chat

SGLang

How to use JetLM/SDAR-1.7B-Chat with SGLang:

Install from pip and serve model

# Install SGLang from pip:
pip install sglang
# Start the SGLang server:
python3 -m sglang.launch_server \
    --model-path "JetLM/SDAR-1.7B-Chat" \
    --host 0.0.0.0 \
    --port 30000
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
	-H "Content-Type: application/json" \
	--data '{
		"model": "JetLM/SDAR-1.7B-Chat",
		"messages": [
			{
				"role": "user",
				"content": "What is the capital of France?"
			}
		]
	}'

Use Docker images

docker run --gpus all \
    --shm-size 32g \
    -p 30000:30000 \
    -v ~/.cache/huggingface:/root/.cache/huggingface \
    --env "HF_TOKEN=<secret>" \
    --ipc=host \
    lmsysorg/sglang:latest \
    python3 -m sglang.launch_server \
        --model-path "JetLM/SDAR-1.7B-Chat" \
        --host 0.0.0.0 \
        --port 30000
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
	-H "Content-Type: application/json" \
	--data '{
		"model": "JetLM/SDAR-1.7B-Chat",
		"messages": [
			{
				"role": "user",
				"content": "What is the capital of France?"
			}
		]
	}'

Docker Model Runner
How to use JetLM/SDAR-1.7B-Chat with Docker Model Runner:
```
docker model run hf.co/JetLM/SDAR-1.7B-Chat
```

remove LossKwargs

by kashif HF Staff - opened Jan 11

base: refs/heads/main

←

from: refs/pr/1

Discussion Files changed

+65

-23

Files changed (1) hide show

modeling_sdar.py +65 -23

modeling_sdar.py CHANGED Viewed

@@ -43,7 +43,7 @@ from transformers.modeling_outputs import (
 from transformers.modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
 from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
 from transformers.processing_utils import Unpack
-from transformers.utils import LossKwargs, auto_docstring, can_return_tuple, is_torch_flex_attn_available, logging
 from .configuration_sdar import SDARConfig
 from flash_attn.ops.triton.layer_norm import rms_norm_fn as flash_rms_norm
@@ -261,22 +261,41 @@ class SDARAttention(nn.Module):
         query_states, key_states = apply_rotary_pos_emb(
             query_states, key_states, cos, sin)
-        if past_key_value is not None and kwargs.get("store_kv", False):
-            # sin and cos are specific to RoPE models; cache_position needed for the static cache
             key_states, value_states = past_key_value.update(
                 key_states, value_states, self.layer_idx)
-        elif past_key_value is not None and not kwargs.get("store_kv", False) and len(past_key_value) > self.layer_idx:
-            # only retrieve, do not store kv
-            past_key_states, past_value_states = past_key_value[self.layer_idx]
-            key_states = torch.cat(
-                [past_key_states, key_states], dim=-2
-                )
-            value_states = torch.cat(
-                [past_value_states, value_states], dim=-2
-                )
         attention_mask = attention_mask.bool() if attention_mask is not None else None
-        if torch.all(attention_mask):  # decoding
             query_states = query_states.transpose(1, 2)
             key_states = key_states.transpose(1, 2)
             value_states = value_states.transpose(1, 2)
@@ -329,7 +348,6 @@ class SDARDecoderLayer(GradientCheckpointingLayer):
         past_key_value: Optional[Cache] = None,
         output_attentions: Optional[bool] = False,
         use_cache: Optional[bool] = False,
-        store_kv: Optional[bool] = False,
         cache_position: Optional[torch.LongTensor] = None,
         # necessary, but kept here for BC
         position_embeddings: Optional[Tuple[torch.Tensor,
@@ -347,7 +365,6 @@ class SDARDecoderLayer(GradientCheckpointingLayer):
             past_key_value=past_key_value,
             output_attentions=output_attentions,
             use_cache=use_cache,
-            store_kv=store_kv,
             cache_position=cache_position,
             position_embeddings=position_embeddings,
             **kwargs,
@@ -394,9 +411,27 @@ class SDARPreTrainedModel(PreTrainedModel):
                 module.weight.data[module.padding_idx].zero_()
         elif isinstance(module, SDARRMSNorm):
             module.weight.data.fill_(1.0)
 class SDARRotaryEmbedding(nn.Module):
     def __init__(self, config: SDARConfig, device=None):
         super().__init__()
         # BC: "rope_type" was originally "type"
@@ -409,12 +444,18 @@ class SDARRotaryEmbedding(nn.Module):
         self.original_max_seq_len = config.max_position_embeddings
         self.config = config
-        self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]
-        inv_freq, self.attention_scaling = self.rope_init_fn(
-            self.config, device)
         self.register_buffer("inv_freq", inv_freq, persistent=False)
-        self.original_inv_freq = self.inv_freq
     @torch.no_grad()
     # power user: used with advanced RoPE types (e.g. dynamic rope)
@@ -440,7 +481,10 @@ class SDARRotaryEmbedding(nn.Module):
 class SDARModel(SDARPreTrainedModel):
     def __init__(self, config: SDARConfig):
         super().__init__(config)
-        self.padding_idx = config.pad_token_id
         self.vocab_size = config.vocab_size
         self.embed_tokens = nn.Embedding(
@@ -472,7 +516,6 @@ class SDARModel(SDARPreTrainedModel):
         past_key_values: Optional[Cache] = None,
         inputs_embeds: Optional[torch.FloatTensor] = None,
         use_cache: Optional[bool] = None,
-        store_kv: Optional[bool] = None,
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         cache_position: Optional[torch.LongTensor] = None,
@@ -539,7 +582,6 @@ class SDARModel(SDARPreTrainedModel):
                 past_key_value=past_key_values,
                 output_attentions=output_attentions,
                 use_cache=use_cache,
-                store_kv=store_kv,
                 cache_position=cache_position,
                 position_embeddings=position_embeddings,
                 **flash_attn_kwargs,
@@ -734,7 +776,7 @@ class SDARModel(SDARPreTrainedModel):
         return causal_mask
-class KwargsForCausalLM(FlashAttentionKwargs, LossKwargs):
     ...

 from transformers.modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
 from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
 from transformers.processing_utils import Unpack
+from transformers.utils import auto_docstring, can_return_tuple, is_torch_flex_attn_available, logging
 from .configuration_sdar import SDARConfig
 from flash_attn.ops.triton.layer_norm import rms_norm_fn as flash_rms_norm
         query_states, key_states = apply_rotary_pos_emb(
             query_states, key_states, cos, sin)
+        # Standard transformers v5 cache convention: when a cache is provided, always `.update()` it.
+        # Callers that want a read-only forward should pass `past_key_values=None`, or use
+        # `DynamicCache.crop(prev_seq_len)` to roll back the append after reading the logits.
+        if past_key_value is not None:
             key_states, value_states = past_key_value.update(
                 key_states, value_states, self.layer_idx)
         attention_mask = attention_mask.bool() if attention_mask is not None else None
+        # I-DLM / strict-causal mode: call PyTorch SDPA directly so GQA broadcasting
+        # (`enable_gqa=True`) works cleanly with a KV cache. When q_len == k_len the built-in
+        # `is_causal=True` path is used; when a cache makes q_len ≠ k_len, an explicit boolean
+        # mask is built instead so that query row `i` attends to key positions
+        # `0..i + offset` with `offset = k_len - q_len`, matching the Dream-shifted
+        # causal-LM convention.
+        use_regular_causal = bool(getattr(self.config, "use_regular_causal", False))
+        if use_regular_causal:
+            q_len = query_states.shape[-2]
+            k_len = key_states.shape[-2]
+            if q_len == k_len:
+                attn_output = F.scaled_dot_product_attention(
+                    query=query_states, key=key_states, value=value_states,
+                    is_causal=True, scale=self.scaling, enable_gqa=True,
+                )
+            else:
+                # Non-square causal: build a (q_len, k_len) mask where row `i` attends to key
+                # positions `0..k_len - q_len + i`. Works for any cache state.
+                offset = k_len - q_len
+                rows = torch.arange(q_len, device=query_states.device).unsqueeze(1)
+                cols = torch.arange(k_len, device=query_states.device).unsqueeze(0)
+                causal_mask = cols <= rows + offset  # [q_len, k_len]
+                attn_output = F.scaled_dot_product_attention(
+                    query=query_states, key=key_states, value=value_states,
+                    attn_mask=causal_mask, is_causal=False, scale=self.scaling, enable_gqa=True,
+                )
+            attn_output = attn_output.transpose(1, 2).contiguous()
+        elif attention_mask is not None and torch.all(attention_mask):  # decoding
             query_states = query_states.transpose(1, 2)
             key_states = key_states.transpose(1, 2)
             value_states = value_states.transpose(1, 2)
         past_key_value: Optional[Cache] = None,
         output_attentions: Optional[bool] = False,
         use_cache: Optional[bool] = False,
         cache_position: Optional[torch.LongTensor] = None,
         # necessary, but kept here for BC
         position_embeddings: Optional[Tuple[torch.Tensor,
             past_key_value=past_key_value,
             output_attentions=output_attentions,
             use_cache=use_cache,
             cache_position=cache_position,
             position_embeddings=position_embeddings,
             **kwargs,
                 module.weight.data[module.padding_idx].zero_()
         elif isinstance(module, SDARRMSNorm):
             module.weight.data.fill_(1.0)
+        # Delegate rotary-embedding buffer re-init to the base PreTrainedModel, which handles
+        # transformers v5's meta-device load by recomputing inv_freq via compute_default_rope_parameters.
+        else:
+            super()._init_weights(module)
 class SDARRotaryEmbedding(nn.Module):
+    inv_freq: torch.Tensor  # fix linting for `register_buffer`
+    @staticmethod
+    def compute_default_rope_parameters(config, device=None, seq_len=None):
+        # transformers v5 removed "default" from ROPE_INIT_FUNCTIONS; match the Qwen3 implementation.
+        base = getattr(config, "rope_theta", None)
+        if base is None:
+            base = config.rope_parameters["rope_theta"]
+        dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads
+        inv_freq = 1.0 / (
+            base ** (torch.arange(0, dim, 2, dtype=torch.int64).to(device=device, dtype=torch.float) / dim)
+        )
+        return inv_freq, 1.0
     def __init__(self, config: SDARConfig, device=None):
         super().__init__()
         # BC: "rope_type" was originally "type"
         self.original_max_seq_len = config.max_position_embeddings
         self.config = config
+        if self.rope_type == "default":
+            inv_freq, self.attention_scaling = self.compute_default_rope_parameters(config, device)
+        else:
+            self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]
+            inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device)
+        # Register both as buffers — transformers v5's `_move_missing_keys_from_meta_to_device`
+        # replaces non-persistent buffers with `torch.empty_like` (uninitialized / zeros); the base
+        # `_init_weights` then re-copies into them IF they're buffers with `original_inv_freq` present.
         self.register_buffer("inv_freq", inv_freq, persistent=False)
+        self.register_buffer("original_inv_freq", inv_freq.clone(), persistent=False)
     @torch.no_grad()
     # power user: used with advanced RoPE types (e.g. dynamic rope)
 class SDARModel(SDARPreTrainedModel):
     def __init__(self, config: SDARConfig):
         super().__init__(config)
+        # transformers v5 configs may not have pad_token_id; fall back to eos_token_id.
+        self.padding_idx = getattr(config, "pad_token_id", None)
+        if self.padding_idx is None:
+            self.padding_idx = getattr(config, "eos_token_id", None)
         self.vocab_size = config.vocab_size
         self.embed_tokens = nn.Embedding(
         past_key_values: Optional[Cache] = None,
         inputs_embeds: Optional[torch.FloatTensor] = None,
         use_cache: Optional[bool] = None,
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         cache_position: Optional[torch.LongTensor] = None,
                 past_key_value=past_key_values,
                 output_attentions=output_attentions,
                 use_cache=use_cache,
                 cache_position=cache_position,
                 position_embeddings=position_embeddings,
                 **flash_attn_kwargs,
         return causal_mask
+class KwargsForCausalLM(FlashAttentionKwargs):
     ...