Instructions to use Johnblick187/SmartCoderMoE with libraries, inference providers, notebooks, and local apps. Follow these links to get started.

Libraries

How to use Johnblick187/SmartCoderMoE with Transformers:

# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("text-generation", model="Johnblick187/SmartCoderMoE", trust_remote_code=True)
messages = [
    {"role": "user", "content": "Who are you?"},
]
pipe(messages)

# Load model directly
from transformers import AutoModelForCausalLM
model = AutoModelForCausalLM.from_pretrained("Johnblick187/SmartCoderMoE", trust_remote_code=True, dtype="auto")

Notebooks
Google Colab
Kaggle
Local Apps Settings

vLLM

How to use Johnblick187/SmartCoderMoE with vLLM:

Install from pip and serve model

# Install vLLM from pip:
pip install vllm
# Start the vLLM server:
vllm serve "Johnblick187/SmartCoderMoE"
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:8000/v1/chat/completions" \
	-H "Content-Type: application/json" \
	--data '{
		"model": "Johnblick187/SmartCoderMoE",
		"messages": [
			{
				"role": "user",
				"content": "What is the capital of France?"
			}
		]
	}'

Use Docker

docker model run hf.co/Johnblick187/SmartCoderMoE

SGLang

How to use Johnblick187/SmartCoderMoE with SGLang:

Install from pip and serve model

# Install SGLang from pip:
pip install sglang
# Start the SGLang server:
python3 -m sglang.launch_server \
    --model-path "Johnblick187/SmartCoderMoE" \
    --host 0.0.0.0 \
    --port 30000
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
	-H "Content-Type: application/json" \
	--data '{
		"model": "Johnblick187/SmartCoderMoE",
		"messages": [
			{
				"role": "user",
				"content": "What is the capital of France?"
			}
		]
	}'

Use Docker images

docker run --gpus all \
    --shm-size 32g \
    -p 30000:30000 \
    -v ~/.cache/huggingface:/root/.cache/huggingface \
    --env "HF_TOKEN=<secret>" \
    --ipc=host \
    lmsysorg/sglang:latest \
    python3 -m sglang.launch_server \
        --model-path "Johnblick187/SmartCoderMoE" \
        --host 0.0.0.0 \
        --port 30000
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
	-H "Content-Type: application/json" \
	--data '{
		"model": "Johnblick187/SmartCoderMoE",
		"messages": [
			{
				"role": "user",
				"content": "What is the capital of France?"
			}
		]
	}'

Docker Model Runner
How to use Johnblick187/SmartCoderMoE with Docker Model Runner:
```
docker model run hf.co/Johnblick187/SmartCoderMoE
```

Johnblick187 committed 5 days ago

Commit

9eb4e00

verified ·

1 Parent(s): c8a1ec0

Upload modeling_smartcoder_moe.py

Browse files

Files changed (1) hide show

modeling_smartcoder_moe.py +27 -20

modeling_smartcoder_moe.py CHANGED Viewed

@@ -168,6 +168,29 @@ class SmartCoderMoEMLP(nn.Module):
         self.experts_proj = nn.Parameter(torch.empty(NE, H, EI))
         self.router      = nn.Linear(H, NE, bias=False)
     def forward(self, x):
         B, T, H = x.shape
@@ -254,14 +277,6 @@ class SmartCoderMoEForCausalLM(PreTrainedModel, GenerationMixin):
         self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
         self.post_init()
-    def _load_from_state_dict(self, state_dict, prefix, *args, **kwargs):
-        remapped = {}
-        for k, v in state_dict.items():
-            k = k.replace('experts_fc.weight', 'experts_fc')
-            k = k.replace('experts_proj.weight', 'experts_proj')
-            remapped[k] = v
-        super()._load_from_state_dict(remapped, prefix, *args, **kwargs)
     def get_input_embeddings(self): return self.model.embed_tokens
     def get_output_embeddings(self): return self.lm_head
@@ -328,17 +343,9 @@ def load_smartcoder_moe(model_id="Johnblick187/SmartCoderMoE", dtype=torch.bfloa
     for f in sf_files:
         state_dict.update(load_file(str(f)))
-    # Remap expert keys — safetensors has .weight suffix, our params don't
-    remapped = {}
-    for k, v in state_dict.items():
-        if 'experts_fc.weight' in k:
-            remapped[k.replace('experts_fc.weight', 'experts_fc')] = v
-        elif 'experts_proj.weight' in k:
-            remapped[k.replace('experts_proj.weight', 'experts_proj')] = v
-        else:
-            remapped[k] = v
-    state_dict = remapped
     missing, unexpected = model.load_state_dict(state_dict, strict=False)
     if missing:
         print(f"Missing: {missing[:3]}{'...' if len(missing)>3 else ''}")
@@ -351,4 +358,4 @@ def load_smartcoder_moe(model_id="Johnblick187/SmartCoderMoE", dtype=torch.bfloa
 from transformers import AutoConfig, AutoModelForCausalLM
 AutoConfig.register("smartcoder_moe", SmartCoderMoEConfig)
-AutoModelForCausalLM.register(SmartCoderMoEConfig, SmartCoderMoEForCausalLM)

         self.experts_proj = nn.Parameter(torch.empty(NE, H, EI))
         self.router      = nn.Linear(H, NE, bias=False)
+    def _load_from_state_dict(self, state_dict, prefix, *args, **kwargs):
+        # Checkpoint stores expert weights with a '.weight' suffix (as if
+        # experts_fc/experts_proj were nn.Linear submodules), but they're
+        # raw nn.Parameter tensors here (no '.weight' child -- needed for
+        # batched bmm across all experts at once, see forward() below).
+        # PyTorch's load_state_dict() recursion calls _load_from_state_dict
+        # on EVERY submodule in the tree directly (using each module's own
+        # class method, not a parent class's override) -- so the remap has
+        # to live on THIS class, not on SmartCoderMoEForCausalLM. The
+        # previous override sat on the top-level CausalLM class and only
+        # ever fired for its own direct params/buffers (it has none), never
+        # for this module's recursive call -- silently skipping every
+        # expert tensor. That's the actual bug.
+        remapped = {}
+        for k, v in state_dict.items():
+            if k == prefix + "experts_fc.weight":
+                remapped[prefix + "experts_fc"] = v
+            elif k == prefix + "experts_proj.weight":
+                remapped[prefix + "experts_proj"] = v
+            else:
+                remapped[k] = v
+        super()._load_from_state_dict(remapped, prefix, *args, **kwargs)
     def forward(self, x):
         B, T, H = x.shape
         self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
         self.post_init()
     def get_input_embeddings(self): return self.model.embed_tokens
     def get_output_embeddings(self): return self.lm_head
     for f in sf_files:
         state_dict.update(load_file(str(f)))
+    # Expert key remap (.weight suffix in the checkpoint vs raw Parameter
+    # here) is now handled by SmartCoderMoEMLP._load_from_state_dict
+    # itself, so load_state_dict() needs no manual remapping here anymore.
     missing, unexpected = model.load_state_dict(state_dict, strict=False)
     if missing:
         print(f"Missing: {missing[:3]}{'...' if len(missing)>3 else ''}")
 from transformers import AutoConfig, AutoModelForCausalLM
 AutoConfig.register("smartcoder_moe", SmartCoderMoEConfig)
+AutoModelForCausalLM.register(SmartCoderMoEConfig, SmartCoderMoEForCausalLM)