Feature Extraction
sentence-transformers
Safetensors
Transformers
qwen3
text-generation
splade
sparse-encoder
code
custom_code
text-embeddings-inference
Instructions to use tomaarsen/naver-splade-code-8B with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- sentence-transformers
How to use tomaarsen/naver-splade-code-8B with sentence-transformers:
```python
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("tomaarsen/naver-splade-code-8B", trust_remote_code=True)

sentences = [
    "The weather is lovely today.",
    "It's so sunny outside!",
    "He drove to the stadium."
]
embeddings = model.encode(sentences)

similarities = model.similarity(embeddings, embeddings)
print(similarities.shape)
# [3, 3]
```
- Transformers
How to use tomaarsen/naver-splade-code-8B with Transformers:
```python
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("feature-extraction", model="tomaarsen/naver-splade-code-8B", trust_remote_code=True)
```
```python
# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("tomaarsen/naver-splade-code-8B", trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained("tomaarsen/naver-splade-code-8B", trust_remote_code=True)
```
- Notebooks
- Google Colab
- Kaggle
| """ | |
| Compared to standard Qwen3, we're using bidirectional attention and not causal attention, but it's specified | |
| with `is_causal=False` in the config. | |
| This file supports two loading paths: | |
| 1. Sentence Transformers: `SparseEncoder("naver/splade-code-8B", trust_remote_code=True)` via AutoModelForMaskedLM -> Qwen3ForCausalLM | |
| 2. Transformers: `AutoModelForCausalLM.from_pretrained("naver/splade-code-8B", trust_remote_code=True)` -> Splade | |
| The checkpoint is distributed as a LoRA adapter on top of Qwen/Qwen3-8B in the `lora/` subfolder; | |
| `Qwen3ForCausalLM.from_pretrained` loads the base model and applies the adapter. | |
| """ | |
| import os | |
| import torch | |
| from transformers import Qwen3ForCausalLM as TransformersQwen3ForCausalLM | |
| from transformers import PretrainedConfig, PreTrainedModel, AutoConfig | |
| from transformers.utils import is_flash_attn_2_available | |
| from .utils import prepare_tokenizer, splade_max, similarity, encode | |
# Name of the repo subfolder that holds the LoRA adapter files.
#
# The adapter lives in this subfolder rather than at the repo root so that
# `find_adapter_config_file` doesn't trigger transformers' auto-PEFT path,
# which would otherwise redirect hub loads to `Qwen/Qwen3-8B` and lose the
# `auto_map` routing to the classes in this file.
ADAPTER_SUBFOLDER = "lora"
class Qwen3ForCausalLM(TransformersQwen3ForCausalLM):
    """Qwen3 causal LM that loads a LoRA adapter from the `lora/` subfolder.

    Differences from the stock transformers class, as implemented below:

    * `tie_weights` re-ties `lm_head` to `embed_tokens` explicitly when the
      config requests tied word embeddings.
    * `_init_weights` skips `lm_head` when it is going to be tied.
    * `from_pretrained` looks for `<repo>/lora/adapter_config.json`; when it
      exists, the base model named in the adapter config is loaded and the
      adapter is applied on top of it with PEFT.
    """

    def tie_weights(self, *args, **kwargs):
        """Explicitly re-tie lm_head to embed_tokens to hopefully avoid meta tensor errors.

        When tying is enabled and both `lm_head` and `model` exist, the
        `lm_head` weight is pointed at the embedding weight and
        `"lm_head.weight"` is removed from the optional `missing_keys`
        keyword argument. In every other case the parent implementation
        is used unchanged.
        """
        if (
            self.config.tie_word_embeddings
            and hasattr(self, "lm_head")
            and hasattr(self, "model")
        ):
            self.lm_head.weight = self.model.embed_tokens.weight
            # The tied head is not really "missing" from the checkpoint, so
            # drop it from the caller's bookkeeping when that is provided.
            missing_keys = kwargs.get("missing_keys")
            if missing_keys is not None:
                missing_keys.discard("lm_head.weight")
        else:
            super().tie_weights(*args, **kwargs)

    def _init_weights(self, module):
        """Skip lm_head init when it will be tied to embed_tokens later."""
        if module is getattr(self, "lm_head", None) and self.config.tie_word_embeddings:
            return
        super()._init_weights(module)

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
        """Load the base model and apply the LoRA adapter stored under `lora/`.

        Args:
            pretrained_model_name_or_path: Local directory or hub repo id.
            *model_args: Forwarded to the parent `from_pretrained`.
            **kwargs: Forwarded to the parent `from_pretrained`. `token` is
                also used for the adapter download and config loading, and
                `config` (if it is a `PretrainedConfig`) is used as the base
                model config.

        Returns:
            The result of `PeftModel.from_pretrained` when an adapter config
            is found, otherwise whatever the parent `from_pretrained` returns
            for `pretrained_model_name_or_path`.
        """
        # Imported lazily so the module can be imported without these packages.
        from huggingface_hub import snapshot_download
        from peft import PeftConfig, PeftModel

        token = kwargs.get("token")

        # Resolve the adapter to a local path before handing it to PEFT.
        # PEFT's `subfolder=` kwarg uses `os.path.join` on Windows, producing
        # backslashed hub paths that break the safetensors-vs-bin fallback.
        if os.path.isdir(pretrained_model_name_or_path):
            adapter_path = os.path.join(pretrained_model_name_or_path, ADAPTER_SUBFOLDER)
        else:
            # Only the adapter subfolder is fetched from the hub.
            local_repo = snapshot_download(
                pretrained_model_name_or_path,
                allow_patterns=[f"{ADAPTER_SUBFOLDER}/*"],
                token=token,
            )
            adapter_path = os.path.join(local_repo, ADAPTER_SUBFOLDER)

        # No adapter config: behave like the regular transformers loader.
        if not os.path.isfile(os.path.join(adapter_path, "adapter_config.json")):
            return super().from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)

        peft_config = PeftConfig.from_pretrained(adapter_path, token=token)

        # Use provided splade config (has is_causal=False) or load it from the adapter repo
        config = kwargs.pop("config", None)
        if not isinstance(config, PretrainedConfig):
            config = AutoConfig.from_pretrained(pretrained_model_name_or_path, token=token)

        base_model = super().from_pretrained(
            peft_config.base_model_name_or_path,
            *model_args,
            config=config,
            **kwargs,
        )
        return PeftModel.from_pretrained(base_model, adapter_path, token=token)
class SpladeConfig(PretrainedConfig):
    """Configuration holding the SPLADE-specific options of the `Splade` wrapper.

    Stores the four options below as plain attributes on top of whatever the
    parent config class handles through `**kwargs`.
    """

    model_type = "qwen3"

    def __init__(
        self,
        model_name_or_path: str = "Qwen/Qwen3-8B",
        attn_implementation: str = "flash_attention_2",
        bidirectional: bool = True,  # only for decoder models
        padding_side: str = "left",
        **kwargs,
    ):
        # Let the parent consume the generic config arguments first.
        super().__init__(**kwargs)

        # Record the SPLADE options in declaration order.
        splade_options = {
            "model_name_or_path": model_name_or_path,
            "attn_implementation": attn_implementation,
            "bidirectional": bidirectional,
            "padding_side": padding_side,
        }
        for option_name, option_value in splade_options.items():
            setattr(self, option_name, option_value)
class Splade(PreTrainedModel):
    """SPLADE wrapper around `Qwen3ForCausalLM` that returns sparse representations.

    The wrapped model and its tokenizer are both loaded from `weights_path`
    in `__init__`; `forward` pools the language-model logits with
    `splade_max`.
    """

    config_class = SpladeConfig

    # methods for MTEB's interface
    similarity = similarity
    encode = encode

    def __init__(self, config, weights_path=None, token=None):
        """Load the tokenizer and the wrapped language model.

        Args:
            config: Config object providing `attn_implementation` and
                `padding_side`. Its `attn_implementation` is overwritten
                with the backend selected below.
            weights_path: Location passed to the config, tokenizer and
                model loaders.
            token: Access token forwarded to the config and model loaders.
        """
        super().__init__(config)
        self.name = "splade"

        # Choose the attention backend first, so the base config and the
        # model load below both receive the same, available implementation.
        if is_flash_attn_2_available():
            config.attn_implementation = "flash_attention_2"
        else:
            config.attn_implementation = "sdpa"

        base_cfg = AutoConfig.from_pretrained(
            weights_path,
            attn_implementation=config.attn_implementation,
            torch_dtype="auto",
            token=token,
        )
        self.tokenizer = prepare_tokenizer(
            weights_path, padding_side=config.padding_side
        )
        self.model = Qwen3ForCausalLM.from_pretrained(
            weights_path,
            config=base_cfg,
            torch_dtype=torch.bfloat16,
            attn_implementation=config.attn_implementation,
            token=token,
        )

    def save_pretrained(self, save_directory, *args, **kwargs):
        """Save the wrapped model under `<save_directory>/lora` and the config at the root.

        Extra positional and keyword arguments are accepted but not used.
        """
        self.model.save_pretrained(os.path.join(save_directory, ADAPTER_SUBFOLDER))
        self.config.save_pretrained(save_directory)

    @classmethod
    def from_pretrained(cls, model_name_or_path, *args, **kwargs):
        """Build a `Splade` model from a directory or hub repo id.

        Only the `token` keyword argument is read; other arguments are
        accepted but not used.

        Returns:
            The constructed model, with `reverse_voc` set to the inverse of
            the tokenizer's `vocab` mapping.
        """
        token = kwargs.get("token")
        config = SpladeConfig.from_pretrained(
            model_name_or_path,
            token=token,
        )
        model = cls(config, weights_path=model_name_or_path, token=token)
        # Inverse vocabulary lookup (presumably id -> token string; depends
        # on what `prepare_tokenizer` returns).
        model.reverse_voc = {v: k for k, v in model.tokenizer.vocab.items()}
        return model

    def forward(self, **tokens):
        """Run the wrapped model and pool its logits with `splade_max`.

        Args:
            **tokens: Model inputs; must include `attention_mask`.

        Returns:
            A one-element tuple holding the first value returned by
            `splade_max`.
        """
        output = self.model(**tokens)
        splade_reps, _ = splade_max(output.logits, tokens["attention_mask"])
        return (splade_reps,)

    def get_width(self):
        """Return the vocabulary size from the wrapped model's config."""
        return self.model.config.vocab_size

    def create_batch_dict(self, input_texts, max_length):
        """Tokenize `input_texts` into a padded, truncated batch of PyTorch tensors."""
        return self.tokenizer(
            input_texts,
            add_special_tokens=True,
            padding="longest",
            truncation=True,
            max_length=max_length,
            return_attention_mask=True,
            return_tensors="pt",
        )
# Public names exported by this module: the two model classes.
__all__ = ["Qwen3ForCausalLM", "Splade"]