vllm==0.18.0 RuntimeError: Failed to load the tokenizer. If the tokenizer is a custom tokenizer not yet available in the HuggingFace transformers library, consider setting `trust_remote_code=True`
CUDA_VISIBLE_DEVICES=0 vllm serve lainlives/Qwen3.5-9B-bnb-4bit --kv-cache-dtype fp8 --mm-processor-cache-gb 0 --limit-mm-per-prompt.video 0 --limit-mm-per-prompt.image 1 -dp 1 --port 10014 --gpu-memory-utilization 0.976 --served-model-name qwen3.5-9b --dtype half --max_num_seqs 4 --max-model-len 8192 --mamba-cache-dtype float16 --mamba-ssm-cache-dtype float16 --kv-cache-memory-bytes 100M --async-scheduling --default-chat-template-kwargs '{"enable_thinking": false}'
It should work with Qwen3_5Model just fine; I can try re-uploading it later.
CUDA_VISIBLE_DEVICES=0 vllm serve lainlives/Qwen3.5-9B-bnb-4bit --kv-cache-dtype fp8 --mm-processor-cache-gb 0 --limit-mm-per-prompt.video 0 --limit-mm-per-prompt.image 1 -dp 1 --port 10014 --gpu-memory-utilization 0.976 --served-model-name qwen3.5-9b --dtype half --max_num_seqs 4 --max-model-len 8192 --mamba-cache-dtype float16 --mamba-ssm-cache-dtype float16 --kv-cache-memory-bytes 100M --async-scheduling --default-chat-template-kwargs '{"enable_thinking": false}'
Sorry, I forgot to mention that the model class had to be wrapped — here's the wrapper I used to get bitsandbytes (bnb) to load it.
import contextlib
import torch
from transformers.models.qwen3_5.modeling_qwen3_5 import Qwen3_5ForConditionalGeneration
class AbliteratedQwen3_5ForCausalLM(Qwen3_5ForConditionalGeneration):
    """Qwen3.5 wrapper that ablates an embedded "refusal direction" from the
    hidden states during each forward pass.

    The checkpoint's config is expected to carry three extra fields:
      * ``refusal_direction`` -- the direction vector (sequence of floats).
      * ``ablation_layers``   -- indices of decoder layers to hook.
      * ``ablation_strength`` -- scale of the projection removed (default 1.0).

    Raises:
        ValueError: if ``refusal_direction`` is absent from the config.
    """

    def __init__(self, config):
        # Some loaders (e.g. bitsandbytes quantized loading) look up
        # text-model fields on the top-level config; mirror any missing
        # text_config entries there without overwriting existing ones.
        text_cfg = getattr(config, "text_config", None)
        if text_cfg is not None:
            for key, value in text_cfg.to_dict().items():
                if not hasattr(config, key):
                    setattr(config, key, value)
        super().__init__(config)

        direction = getattr(config, "refusal_direction", None)
        if direction is None:
            raise ValueError(
                "Missing `refusal_direction` in config. "
                "This checkpoint expects an embedded refusal direction."
            )
        # Normalize once, in float32, for numerical stability -- the hook
        # previously re-normalized in the activation dtype (often fp16) on
        # every call.  Non-persistent so it never lands in the state_dict.
        direction_t = torch.tensor(direction, dtype=torch.float32)
        direction_t = direction_t / (direction_t.norm() + 1e-8)
        self.register_buffer("_refusal_direction", direction_t, persistent=False)
        self.ablation_layers = [int(x) for x in getattr(config, "ablation_layers", [])]
        self.ablation_strength = float(getattr(config, "ablation_strength", 1.0))

    def _decoder_layers(self):
        """Locate the decoder layer list across the possible model layouts."""
        model = self.model
        if hasattr(model, "language_model") and hasattr(model.language_model, "layers"):
            return model.language_model.layers
        if hasattr(model, "layers"):
            return model.layers
        raise AttributeError("Could not find decoder layers in model.")

    def _make_hook(self):
        """Build a forward-pre-hook that removes the refusal component."""

        def hook_fn(module, inputs):
            # NOTE(review): assumes hidden states arrive as the first
            # positional argument with shape (batch, seq, dim) -- confirm
            # against the installed transformers version; some code paths
            # pass hidden states via kwargs, which a pre-hook on `inputs`
            # would not see.
            hidden = inputs[0]
            d = self._refusal_direction.to(hidden.device, hidden.dtype)
            # Subtract the (scaled) projection of each token state onto d.
            projection = torch.einsum("bsd,d->bs", hidden, d).unsqueeze(-1) * d
            hidden = hidden - float(self.ablation_strength) * projection
            return (hidden, *inputs[1:])

        return hook_fn

    @contextlib.contextmanager
    def _temporary_hooks(self):
        """Register ablation hooks for the duration of one forward pass."""
        layers = self._decoder_layers()
        # Deduplicate while keeping order: a repeated index would otherwise
        # register two hooks on the same layer and ablate the direction twice.
        layer_ids = list(dict.fromkeys(
            i for i in self.ablation_layers if 0 <= i < len(layers)
        ))
        if not layer_ids or self.ablation_strength <= 0:
            yield
            return
        handle_list = []
        try:
            hook_fn = self._make_hook()
            for layer_id in layer_ids:
                handle_list.append(layers[layer_id].register_forward_pre_hook(hook_fn))
            yield
        finally:
            # Always detach the hooks, even if the forward pass raised.
            for h in handle_list:
                h.remove()

    def forward(self, *args, **kwargs):
        """Run the base model's forward with the refusal direction ablated.

        Strength and layer indices are re-read from ``self.config`` on every
        call so they can be tuned at runtime without re-instantiating.
        """
        self.ablation_strength = float(
            getattr(self.config, "ablation_strength", self.ablation_strength)
        )
        self.ablation_layers = [
            int(x) for x in getattr(self.config, "ablation_layers", self.ablation_layers)
        ]
        with self._temporary_hooks():
            return super().forward(*args, **kwargs)
It's worth mentioning that, depending on what's loading it, some layers might be missing. You will have to work around that yourself. Ask Gemini or a similar assistant — it's pretty good at that.