benjamin committed
Commit d2bb4be · verified · 1 Parent(s): 66763b8

Upload FlaxTPULlamaForCausalLM

config.json CHANGED
@@ -1,4 +1,5 @@
 {
+  "add_qk_norm": false,
   "architectures": [
     "TPULlamaForCausalLM"
   ],
@@ -36,7 +37,7 @@
   "torch_dtype": "float32",
   "transformers_version": "4.52.3",
   "use_cache": true,
+  "use_qk_norm": true,
   "use_sliding_window": false,
-  "vocab_size": 151936,
-  "add_qk_norm": true
+  "vocab_size": 151936
 }
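For context, a minimal sketch of how the two new JSON keys surface when this config is loaded through transformers. The repo id is a placeholder, and trust_remote_code=True assumes the repository maps its custom TPULlamaConfig class via auto_map; extra keys such as use_qk_norm are normally kept as plain attributes on the loaded config.

from transformers import AutoConfig

# Hypothetical repo id; substitute the Hub repository this commit belongs to.
config = AutoConfig.from_pretrained("user/flax-tpu-llama", trust_remote_code=True)

print(config.add_qk_norm)   # False in this commit's config.json
print(config.use_qk_norm)   # True (unrecognized keys become config attributes)
print(config.vocab_size)    # 151936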
configuration_tpu_llama.py CHANGED
@@ -147,6 +147,7 @@ class TPULlamaConfig(PretrainedConfig):
         attention_dropout=0.0,
         mlp_bias=False,
         head_dim=None,
+        add_qk_norm=False, # Qwen3 compatibility
         expand_input_ids=False, # Transformers-native PyTorch generation support
         expand_input_ids_maxlen=None,
         expand_input_ids_vocab_size=None,
@@ -183,6 +184,8 @@ class TPULlamaConfig(PretrainedConfig):
             self.rope_scaling["rope_type"] = self.rope_scaling["type"]
         rope_config_validation(self)
 
+        self.add_qk_norm = add_qk_norm # Qwen3 compatibility
+
         self.expand_input_ids = expand_input_ids
         self.expand_input_ids_maxlen = expand_input_ids_maxlen
         self.expand_input_ids_vocab_size = expand_input_ids_vocab_size
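A quick sketch of the new constructor argument in isolation, assuming configuration_tpu_llama.py is importable from the working directory; the flag defaults to False and is stored on the config instance as shown in the hunk above.

from configuration_tpu_llama import TPULlamaConfig

# Hypothetical direct construction; all other fields fall back to their defaults.
cfg = TPULlamaConfig(add_qk_norm=True)
assert cfg.add_qk_norm is True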
modelling_flax_tpu_llama.py CHANGED
@@ -273,10 +273,16 @@ def apply_rotary_pos_emb(q, k, cos, sin, unsqueeze_dim=1):
 class FlaxTPULlamaRMSNorm(nn.Module):
     config: TPULlamaConfig
     dtype: jnp.dtype = jnp.float32
+    override_dim: int = None
 
     def setup(self):
+        if self.override_dim is not None:
+            dim = self.override_dim
+        else:
+            dim = self.config.hidden_size
+
         self.epsilon = self.config.rms_norm_eps
-        self.weight = self.param("weight", lambda _, shape: jnp.ones(shape), self.config.hidden_size)
+        self.weight = self.param("weight", lambda _, shape: jnp.ones(shape), dim)
 
     def __call__(self, hidden_states):
         variance = jnp.asarray(hidden_states, dtype=jnp.float32)
@@ -350,6 +356,11 @@ class FlaxTPULlamaAttention(nn.Module):
         self.k_proj = dense(self.num_key_value_heads * self.head_dim)
         self.v_proj = dense(self.num_key_value_heads * self.head_dim)
         self.o_proj = dense(self.embed_dim)
+
+        if self.config.add_qk_norm:
+            self.q_norm = FlaxTPULlamaRMSNorm(self.config, dtype=self.dtype, override_dim=self.head_dim)
+            self.k_norm = FlaxTPULlamaRMSNorm(self.config, dtype=self.dtype, override_dim=self.head_dim)
+
         self.causal_mask = make_causal_mask(
             jnp.ones(
                 (1, getattr(config, "max_length", config.max_position_embeddings)),
@@ -357,7 +368,6 @@ class FlaxTPULlamaAttention(nn.Module):
             ),
             dtype="bool",
         )
-        self.rotary_emb = FlaxTPULlamaRotaryEmbedding(config, dtype=self.dtype)
 
     def _split_heads(self, hidden_states, num_heads):
         return hidden_states.reshape(hidden_states.shape[:2] + (num_heads, self.head_dim))
@@ -401,6 +411,7 @@ class FlaxTPULlamaAttention(nn.Module):
     def __call__(
         self,
         hidden_states,
+        position_embeddings,
         attention_mask,
         position_ids,
         deterministic: bool = True,
@@ -415,9 +426,19 @@ class FlaxTPULlamaAttention(nn.Module):
         key = self._split_heads(raw_key, self.num_key_value_heads)
         value = self._split_heads(raw_value, self.num_key_value_heads)
 
-        cos, sin = self.rotary_emb(value, position_ids)
+        if self.config.add_qk_norm:
+            query = self.q_norm(query)
+            key = self.k_norm(key)
+
+        print(query.sum(), key.sum(), value.sum())
+
+        cos, sin = position_embeddings
         query, key = apply_rotary_pos_emb(query, key, cos, sin)
 
+        print(query.sum(), key.sum())
+        print()
+        print()
+
         query_length, key_length = query.shape[1], key.shape[1]
 
         if self.has_variable("cache", "cached_key"):
@@ -519,6 +540,7 @@ class FlaxTPULlamaFlashAttention(FlaxTPULlamaAttention):
     def __call__(
         self,
         hidden_states,
+        position_embeddings,
         attention_mask,
         position_ids,
         deterministic: bool = True,
@@ -533,7 +555,7 @@ class FlaxTPULlamaFlashAttention(FlaxTPULlamaAttention):
         key = self._split_heads(raw_key, self.num_key_value_heads)
         value = self._split_heads(raw_value, self.num_key_value_heads)
 
-        cos, sin = self.rotary_emb(value, position_ids)
+        cos, sin = position_embeddings
         query, key = apply_rotary_pos_emb(query, key, cos, sin)
 
         query_length, key_length = query.shape[1], key.shape[1]
@@ -647,6 +669,7 @@ class FlaxTPULlamaDecoderLayer(nn.Module):
     def __call__(
         self,
         hidden_states,
+        position_embeddings,
         attention_mask=None,
         position_ids=None,
         deterministic: bool = True,
@@ -660,6 +683,7 @@ class FlaxTPULlamaDecoderLayer(nn.Module):
         hidden_states = self.input_layernorm(hidden_states)
         outputs = self.self_attn(
             hidden_states,
+            position_embeddings,
             attention_mask=attention_mask,
             position_ids=position_ids,
             deterministic=deterministic,
@@ -865,8 +889,10 @@ class FlaxTPULlamaLayerCollection(nn.Module):
     gradient_checkpointing: bool = False
 
     def setup(self):
+        self.rotary_emb = FlaxTPULlamaRotaryEmbedding(self.config, dtype=self.dtype)
+
         if self.gradient_checkpointing:
-            FlaxTPULlamaDecoderCheckpointLayer = remat(FlaxTPULlamaDecoderLayer, static_argnums=(3, 4, 5))
+            FlaxTPULlamaDecoderCheckpointLayer = remat(FlaxTPULlamaDecoderLayer, static_argnums=(4, 5, 6))
             self.blocks = [
                 FlaxTPULlamaDecoderCheckpointLayer(self.config, dtype=self.dtype, name=str(i))
                 for i in range(self.config.num_hidden_layers)
@@ -891,6 +917,8 @@ class FlaxTPULlamaLayerCollection(nn.Module):
         all_attentions = () if output_attentions else None
         all_hidden_states = [(), ()] if output_hidden_states else None
 
+        position_embeddings = self.rotary_emb(hidden_states, position_ids)
+
         if output_hidden_states:
             all_hidden_states[0] += (hidden_states,)
             all_hidden_states[1] += (hidden_states,)
@@ -898,6 +926,7 @@ class FlaxTPULlamaLayerCollection(nn.Module):
         for block_idx, block in enumerate(self.blocks):
             layer_outputs = block(
                 hidden_states,
+                position_embeddings,
                 attention_mask,
                 position_ids,
                 deterministic,
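To make the modelling change concrete, here is a self-contained sketch (not the repository's module) of the per-head RMS normalization this diff wires in: when add_qk_norm is set, the split query and key heads are RMS-normalized over head_dim before the rotary embedding is applied. Class name, shapes, and eps below are illustrative assumptions.

import jax
import jax.numpy as jnp
import flax.linen as nn


class HeadRMSNorm(nn.Module):
    """RMSNorm over the trailing head_dim axis, with one learned scale per channel."""
    dim: int            # head_dim
    eps: float = 1e-6

    @nn.compact
    def __call__(self, x):
        weight = self.param("weight", nn.initializers.ones, (self.dim,))
        x32 = x.astype(jnp.float32)
        variance = jnp.mean(jnp.square(x32), axis=-1, keepdims=True)
        normed = x32 * jax.lax.rsqrt(variance + self.eps)
        return (weight * normed).astype(x.dtype)


# Usage on split heads of shape (batch, seq, num_heads, head_dim), mirroring where
# the diff applies q_norm/k_norm just before apply_rotary_pos_emb.
query = jnp.ones((1, 4, 8, 128))
norm = HeadRMSNorm(dim=128)
params = norm.init(jax.random.PRNGKey(0), query)
query = norm.apply(params, query)   # same shape, per-head unit RMS up to the learned scale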