Fix model_type/config compatibility for MLX + SentenceTransformers

Browse files

Files changed (14) hide show

.gitattributes +1 -0
1_Pooling/config.json +10 -0
README.md +188 -4
added_tokens.json +28 -0
assets/diag.png +3 -0
assets/logo.svg +593 -0
config.json +69 -69
configuration.py +1 -1
merges.txt +0 -0
model.safetensors +2 -2
modules.json +21 -0
special_tokens_map.json +45 -0
tokenizer_config.json +236 -3
vocab.json +0 -0

.gitattributes CHANGED Viewed

@@ -34,3 +34,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
 tokenizer.json filter=lfs diff=lfs merge=lfs -text

 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
 tokenizer.json filter=lfs diff=lfs merge=lfs -text
+assets/diag.png filter=lfs diff=lfs merge=lfs -text

1_Pooling/config.json ADDED Viewed

	@@ -0,0 +1,10 @@

+{
+    "word_embedding_dimension": 1024,
+    "pooling_mode_cls_token": false,
+    "pooling_mode_mean_tokens": true,
+    "pooling_mode_max_tokens": false,
+    "pooling_mode_mean_sqrt_len_tokens": false,
+    "pooling_mode_weightedmean_tokens": false,
+    "pooling_mode_lasttoken": false,
+    "include_prompt": true
+}

README.md CHANGED Viewed

@@ -1,7 +1,191 @@
 ---
-language: en
-pipeline_tag: text-generation
-library_name: mlx
 tags:
-- mlx
 ---

 ---
+license: mit
+pipeline_tag: feature-extraction
 tags:
+- feature-extraction
+- sentence-similarity
+- mteb
+- sentence-transformers
+language:
+- multilingual
 ---
+<p align="center">
+  <img src="assets/logo.svg" alt="Perplexity Logo" width="400">
+</p>
+<p align="center">pplx-embed-v1: Diffusion-Pretrained Dense and Contextual Embeddings</p>
+`pplx-embed-v1` and `pplx-embed-context-v1` are state-of-the-art text embedding models optimized for real-world, web-scale retrieval tasks.
+- Use **`pplx-embed-v1`** for independent text embedding (queries, documents, semantic search)
+- Use **`pplx-embed-context-v1`** for document chunks in RAG systems where surrounding context matters
+> [!IMPORTANT]
+> `pplx-embed-v1` and `pplx-embed-context-v1` natively produce *unnormalized* int8-quantized embeddings. Ensure that you compare them via *cosine similarity*.
+![diag.png](assets/diag.png)
+## Models
+| Model | Dimensions | Context | MRL | Quantization | Instruction | Pooling |
+|:-----:|:----------:|:-------:|:---:|:------------:|:-----------:|:-------:|
+| `pplx-embed-v1-0.6B` | 1024 | 32K | Yes | INT8/BINARY | No | Mean |
+| `pplx-embed-v1-4B` | 2560 | 32K | Yes | INT8/BINARY | No | Mean |
+| `pplx-embed-context-v1-0.6B` | 1024 | 32K | Yes | INT8/BINARY | No | Mean |
+| `pplx-embed-context-v1-4B` | 2560 | 32K | Yes | INT8/BINARY | No | Mean |
+<sub>All models are built on Qwen3 base models that underwent diffusion-based continued pre-training at Perplexity AI.</sub>
+<sub>Many modern embedding models rely on instruction tuning, where users prepend an instruction string to the text being embedded. This can yield a 2%-3% lift on benchmarks, but it also introduces prompt-selection overhead and can make indexing pipelines brittle (small instruction changes can shift embedding space). We deliberately **avoid** this requirement: you can embed the text you want to index directly, without having to choose or maintain an instruction prefix.</sub>
+## Usage
+<details>
+<summary>Via API</summary>
+```bash
+curl -X POST https://api.perplexity.ai/v1/embeddings \
+  -H "Authorization: Bearer YOUR_API_KEY" \
+  -H "Content-Type: application/json" \
+  -d '{
+    "input": [
+      "Scientists explore the universe driven by curiosity.",
+      "Children learn through curious exploration.",
+      "Historical discoveries began with curious questions.",
+      "Animals use curiosity to adapt and survive.",
+      "Philosophy examines the nature of curiosity."
+    ],
+    "model": "pplx-embed-v1-0.6b"
+  }'
+```
+</details>
+<details>
+<summary>Using SentenceTransformers</summary>
+```python
+from sentence_transformers import SentenceTransformer
+model = SentenceTransformer(
+    "perplexity-ai/pplx-embed-v1-0.6B",
+    trust_remote_code=True
+)
+texts = [
+    "Scientists explore the universe driven by curiosity.",
+    "Children learn through curious exploration.",
+    "Historical discoveries began with curious questions.",
+    "Animals use curiosity to adapt and survive.",
+    "Philosophy examines the nature of curiosity.",
+]
+embeddings = model.encode(texts) # Shape: (5, 1024), quantized to int8
+embeddings = model.encode(texts, quantization="binary") # Shape: (5, 1024), quantized to binary
+```
+</details>
+<details>
+<summary>Using ONNX models</summary>
+```python
+import onnxruntime as ort
+from transformers import AutoTokenizer
+import numpy as np
+tokenizer = AutoTokenizer.from_pretrained("perplexity-ai/pplx-embed-v1-0.6b", trust_remote_code=True)
+session = ort.InferenceSession("onnx/model.onnx")
+texts = [
+    "Scientists explore the universe driven by curiosity.",
+    "Children learn through curious exploration.",
+    "Historical discoveries began with curious questions.",
+    "Animals use curiosity to adapt and survive.",
+    "Philosophy examines the nature of curiosity.",
+]
+tokenized = tokenizer(
+    texts,
+    padding=True,
+    truncation=True,
+    return_tensors="np"
+)
+onnx_inputs = {
+    "input_ids": tokenized["input_ids"].astype(np.int64),
+    "attention_mask": tokenized["attention_mask"].astype(np.int64),
+}
+# Run inference
+onnx_embeddings = session.run([out.name for out in session.get_outputs()], onnx_inputs)
+# ONNX produces both int8 and binary precision embeddings:
+int8_embeddings = onnx_embeddings[2]
+binary_embeddings = onnx_embeddings[3]
+packed_embeddings = np.packbits(binary_embeddings != -1, axis=-1)
+```
+</details>
+<details>
+<summary>Using Text Embeddings Inference (TEI)</summary>
+> [!NOTE]
+> Text Embeddings Inference v1.9.2+ is required.
+> [!IMPORTANT]
+> Currently, only int8-quantized embeddings are available via TEI. Remember to use cosine similarity with unnormalized int8 embeddings.
+- CPU w/ Candle:
+```bash
+docker run -p 8080:80 ghcr.io/huggingface/text-embeddings-inference:cpu-1.9 --model-id perplexity-ai/pplx-embed-v1-0.6B --dtype float32
+```
+- CPU w/ ORT (ONNX Runtime):
+```bash
+docker run -p 8080:80 ghcr.io/huggingface/text-embeddings-inference:cpu-1.9 --model-id onnx-community/pplx-embed-v1-0.6B --dtype float32
+```
+- GPU w/ CUDA:
+```bash
+docker run --gpus all --shm-size 1g -p 8080:80 ghcr.io/huggingface/text-embeddings-inference:cuda-1.9 --model-id perplexity-ai/pplx-embed-v1-0.6B --dtype float32
+```
+> If you hit OOM during warmup, lower `--max-batch-tokens` and `--max-client-batch-size`. Set `--max-batch-tokens` to `max_sequence_length × batch_size` (e.g., 2048 tokens × 8 sequences = 16384).
+> Alternatively, when running on CUDA you can use an architecture-specific (compute-capability-specific)
+> container instead of `cuda-1.9`. The `cuda-1.9` image bundles the binaries for Turing, Ampere, Hopper and
+> Blackwell, so a dedicated container, e.g. `ampere-1.9`, will be lighter.
+You can then send requests to the `/embed` endpoint via cURL:
+```bash
+curl http://0.0.0.0:8080/embed \
+  -H "Content-Type: application/json" \
+  -d '{
+    "inputs": [
+      "Scientists explore the universe driven by curiosity.",
+      "Children learn through curious exploration.",
+      "Historical discoveries began with curious questions.",
+      "Animals use curiosity to adapt and survive.",
+      "Philosophy examines the nature of curiosity."
+    ],
+    "normalize": false
+  }'
+```
+</details>
+## Technical Details
+For comprehensive technical details and evaluation results, see our paper on arXiv: https://arxiv.org/abs/2602.11151.

added_tokens.json ADDED Viewed

	@@ -0,0 +1,28 @@

+{
+  "</think>": 151668,
+  "</tool_call>": 151658,
+  "</tool_response>": 151666,
+  "<think>": 151667,
+  "<tool_call>": 151657,
+  "<tool_response>": 151665,
+  "<|box_end|>": 151649,
+  "<|box_start|>": 151648,
+  "<|endoftext|>": 151643,
+  "<|file_sep|>": 151664,
+  "<|fim_middle|>": 151660,
+  "<|fim_pad|>": 151662,
+  "<|fim_prefix|>": 151659,
+  "<|fim_suffix|>": 151661,
+  "<|im_end|>": 151645,
+  "<|im_start|>": 151644,
+  "<|image_pad|>": 151655,
+  "<|object_ref_end|>": 151647,
+  "<|object_ref_start|>": 151646,
+  "<|quad_end|>": 151651,
+  "<|quad_start|>": 151650,
+  "<|repo_name|>": 151663,
+  "<|video_pad|>": 151656,
+  "<|vision_end|>": 151653,
+  "<|vision_pad|>": 151654,
+  "<|vision_start|>": 151652
+}

assets/diag.png ADDED Viewed

Git LFS Details

SHA256: 63a6e460d1c0aab08674a944c2778403077c4ef27ccbe3a046057dfb4984d2ff
Pointer size: 132 Bytes
Size of remote file: 1.03 MB

assets/logo.svg ADDED Viewed

config.json CHANGED Viewed

@@ -1,71 +1,71 @@
 {
-    "architectures": [
-        "PPLXQwen3Model"
-    ],
-    "attention_bias": false,
-    "attention_dropout": 0.0,
-    "attn_implementation": "sdpa",
-    "auto_map": {
-        "AutoConfig": "configuration.PPLXQwen3Config",
-        "AutoModel": "modeling.PPLXQwen3Model"
-    },
-    "bos_token_id": 151643,
-    "dtype": "float32",
-    "eos_token_id": 151643,
-    "head_dim": 128,
-    "hidden_act": "silu",
-    "hidden_size": 1024,
-    "initializer_range": 0.02,
-    "intermediate_size": 3072,
-    "layer_types": [
-        "full_attention",
-        "full_attention",
-        "full_attention",
-        "full_attention",
-        "full_attention",
-        "full_attention",
-        "full_attention",
-        "full_attention",
-        "full_attention",
-        "full_attention",
-        "full_attention",
-        "full_attention",
-        "full_attention",
-        "full_attention",
-        "full_attention",
-        "full_attention",
-        "full_attention",
-        "full_attention",
-        "full_attention",
-        "full_attention",
-        "full_attention",
-        "full_attention",
-        "full_attention",
-        "full_attention",
-        "full_attention",
-        "full_attention",
-        "full_attention",
-        "full_attention"
-    ],
-    "max_position_embeddings": 32768,
-    "max_window_layers": 28,
-    "model_file": "mlx_pplx_qwen3.py",
-    "model_type": "bidirectional_pplx_qwen3",
-    "num_attention_heads": 16,
-    "num_hidden_layers": 28,
-    "num_key_value_heads": 8,
-    "rms_norm_eps": 1e-06,
-    "rope_parameters": {
-        "rope_theta": 1000000,
-        "rope_type": "default"
-    },
     "rope_theta": 1000000,
-    "sliding_window": null,
-    "source_model_type": "bidirectional_pplx_qwen3",
-    "tie_word_embeddings": true,
-    "transformers_version": "5.0.0.dev0",
-    "use_bidirectional_attention": true,
-    "use_cache": false,
-    "use_sliding_window": false,
-    "vocab_size": 151936
-}

 {
+  "architectures": [
+    "PPLXQwen3Model"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "auto_map": {
+    "AutoConfig": "configuration.PPLXQwen3Config",
+    "AutoModel": "modeling.PPLXQwen3Model"
+  },
+  "bos_token_id": 151643,
+  "dtype": "float32",
+  "eos_token_id": 151643,
+  "head_dim": 128,
+  "hidden_act": "silu",
+  "hidden_size": 1024,
+  "initializer_range": 0.02,
+  "intermediate_size": 3072,
+  "layer_types": [
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention"
+  ],
+  "max_position_embeddings": 32768,
+  "max_window_layers": 28,
+  "model_type": "qwen3",
+  "num_attention_heads": 16,
+  "num_hidden_layers": 28,
+  "num_key_value_heads": 8,
+  "rms_norm_eps": 1e-06,
+  "rope_parameters": {
     "rope_theta": 1000000,
+    "rope_type": "default"
+  },
+  "rope_theta": 1000000,
+  "sliding_window": null,
+  "tie_word_embeddings": true,
+  "transformers_version": "5.0.0.dev0",
+  "use_cache": false,
+  "use_sliding_window": false,
+  "vocab_size": 151936,
+  "attn_implementation": "sdpa",
+  "use_bidirectional_attention": true,
+  "source_model_type": "bidirectional_pplx_qwen3",
+  "model_file": "mlx_pplx_qwen3.py"
+}

configuration.py CHANGED Viewed

@@ -2,4 +2,4 @@ from transformers.models.qwen3.configuration_qwen3 import Qwen3Config
 class PPLXQwen3Config(Qwen3Config):
-    model_type = "bidirectional_pplx_qwen3"


2
3
4	class PPLXQwen3Config(Qwen3Config):
5	+ model_type = "qwen3"

merges.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

model.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:1d5733cb2cd3b0351ab37e22c7b2b894bab3a8a90a44fcedc5208068ac58fb65
-size 1192132841

 version https://git-lfs.github.com/spec/v1
+oid sha256:2c8d2f64f8268ccd5383b7f9bea8e660349aa6a151bd68a5a47f4c129f2a4974
+size 2384233112

modules.json ADDED Viewed

	@@ -0,0 +1,21 @@

+[
+  {
+    "idx": 0,
+    "name": "0",
+    "path": "",
+    "type": "sentence_transformers.models.Transformer"
+  },
+  {
+    "idx": 1,
+    "name": "1",
+    "path": "1_Pooling",
+    "type": "sentence_transformers.models.Pooling"
+  },
+  {
+    "idx": 2,
+    "name": "2",
+    "path": "",
+    "type": "st_quantize.FlexibleQuantizer",
+    "kwargs": ["quantization"]
+  }
+]

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,45 @@

+{
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>",
+    "<|object_ref_start|>",
+    "<|object_ref_end|>",
+    "<|box_start|>",
+    "<|box_end|>",
+    "<|quad_start|>",
+    "<|quad_end|>",
+    "<|vision_start|>",
+    "<|vision_end|>",
+    "<|vision_pad|>",
+    "<|image_pad|>",
+    "<|video_pad|>"
+  ],
+  "eos_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "mask_token": {
+    "content": "â½Ĺ",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "sep_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

tokenizer_config.json CHANGED Viewed

@@ -1,16 +1,249 @@
 {
   "add_prefix_space": false,
-  "backend": "tokenizers",
   "bos_token": null,
   "clean_up_tokenization_spaces": false,
   "eos_token": "<|endoftext|>",
   "errors": "replace",
-  "is_local": true,
   "mask_token": "â½Ĺ",
   "model_max_length": 131072,
   "pad_token": "<|endoftext|>",
   "sep_token": "<|endoftext|>",
   "split_special_tokens": false,
-  "tokenizer_class": "TokenizersBackend",
   "unk_token": null
 }

 {
+  "add_bos_token": false,
   "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "151642": {
+      "content": "â½Ĺ",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151643": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151644": {
+      "content": "<|im_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151645": {
+      "content": "<|im_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151646": {
+      "content": "<|object_ref_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151647": {
+      "content": "<|object_ref_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151648": {
+      "content": "<|box_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151649": {
+      "content": "<|box_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151650": {
+      "content": "<|quad_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151651": {
+      "content": "<|quad_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151652": {
+      "content": "<|vision_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151653": {
+      "content": "<|vision_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151654": {
+      "content": "<|vision_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151655": {
+      "content": "<|image_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151656": {
+      "content": "<|video_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151657": {
+      "content": "<tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151658": {
+      "content": "</tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151659": {
+      "content": "<|fim_prefix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151660": {
+      "content": "<|fim_middle|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151661": {
+      "content": "<|fim_suffix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151662": {
+      "content": "<|fim_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151663": {
+      "content": "<|repo_name|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151664": {
+      "content": "<|file_sep|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151665": {
+      "content": "<tool_response>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151666": {
+      "content": "</tool_response>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151667": {
+      "content": "<think>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151668": {
+      "content": "</think>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    }
+  },
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>",
+    "<|object_ref_start|>",
+    "<|object_ref_end|>",
+    "<|box_start|>",
+    "<|box_end|>",
+    "<|quad_start|>",
+    "<|quad_end|>",
+    "<|vision_start|>",
+    "<|vision_end|>",
+    "<|vision_pad|>",
+    "<|image_pad|>",
+    "<|video_pad|>"
+  ],
   "bos_token": null,
   "clean_up_tokenization_spaces": false,
   "eos_token": "<|endoftext|>",
   "errors": "replace",
+  "extra_special_tokens": {},
   "mask_token": "â½Ĺ",
   "model_max_length": 131072,
   "pad_token": "<|endoftext|>",
   "sep_token": "<|endoftext|>",
   "split_special_tokens": false,
+  "tokenizer_class": "Qwen2Tokenizer",
   "unk_token": null
 }

vocab.json ADDED Viewed

The diff for this file is too large to render. See raw diff