perplexity-ai
/

pplx-embed-context-v1-0.6b

@@ -1,5 +1,5 @@
 ---
-license: mit
 pipeline_tag: feature-extraction
 tags:
 - feature-extraction
@@ -7,8 +7,7 @@ tags:
 - conteb
 - contextual-embeddings
 language:
-- multilingual
-library_name: transformers
 ---
@@ -16,7 +15,7 @@ library_name: transformers
   <img src="assets/logo.svg" alt="Perplexity Logo" width="400">
 </p>
-<p align="center">pplx-embed-v1: Diffusion-Pretrained Dense and Contextual Embeddings</p>
 `pplx-embed-v1` and `pplx-embed-context-v1` are state-of-the-art text embedding models optimized for real-world, web-scale retrieval tasks.
@@ -52,7 +51,7 @@ curl -X POST https://api.perplexity.ai/v1/contextualizedembeddings \
   -H "Authorization: Bearer YOUR_API_KEY" \
   -H "Content-Type: application/json" \
   -d '{
-    "input": [
       [
         "Curiosity begins in childhood with endless questions about the world.",
         "As we grow, curiosity drives us to explore new ideas and challenge assumptions.",
@@ -63,7 +62,7 @@ curl -X POST https://api.perplexity.ai/v1/contextualizedembeddings \
         "Each discovery on Mars sparks new questions about our place in the universe."
       ]
     ],
-    "model": "pplx-embed-context-v1-0.6b"
   }'
 ```
@@ -254,14 +253,15 @@ batch_chunk_embeddings = [
 int8_embeddings = [quantize_int8_tanh(x) for x in batch_chunk_embeddings]
 binary_embeddings = [quantize_binary(x) for x in batch_chunk_embeddings]
-bits = [np.where(doc.numpy() >=  0, True, False) for doc in binary_embeddings]
-packed_embeddings = [np.packbits(b, axis=-1) for b in bits]
 ```
 </details>
 ## Technical Details
-For comprehensive technical details and evaluation results, see our paper on arXiv: https://arxiv.org/abs/2602.11151.

 ---
+license: apache-2.0
 pipeline_tag: feature-extraction
 tags:
 - feature-extraction
 - conteb
 - contextual-embeddings
 language:
+  - multilingual
 ---
   <img src="assets/logo.svg" alt="Perplexity Logo" width="400">
 </p>
+<p align="center">pplx-embed-v1: Diffusion-LM for Dense and Contextual Retrieval</p>
 `pplx-embed-v1` and `pplx-embed-context-v1` are state-of-the-art text embedding models optimized for real-world, web-scale retrieval tasks.
   -H "Authorization: Bearer YOUR_API_KEY" \
   -H "Content-Type: application/json" \
   -d '{
+    "inputs": [
       [
         "Curiosity begins in childhood with endless questions about the world.",
         "As we grow, curiosity drives us to explore new ideas and challenge assumptions.",
         "Each discovery on Mars sparks new questions about our place in the universe."
       ]
     ],
+    "model": "pplx-embed-context-v1-0.6B"
   }'
 ```
 int8_embeddings = [quantize_int8_tanh(x) for x in batch_chunk_embeddings]
 binary_embeddings = [quantize_binary(x) for x in batch_chunk_embeddings]
 ```
 </details>
 ## Technical Details
+For comprehensive technical details and evaluation results, see our paper on arXiv: https://arxiv.org/abs/2602.11151.
+## Contact
+- Website: https://perplexity.ai
+- API Support: api-support@perplexity.ai

modeling.py CHANGED Viewed

@@ -142,7 +142,7 @@ class PPLXQwen3ContextualModel(PPLXQwen3Model):
         device: str | torch.device | None = None,
         normalize_embeddings: bool = False,
         convert_to_numpy: bool = True,
-        quantization: Literal["int8", "binary", "ubinary"] = "int8",
     ) -> list[np.ndarray] | list[torch.Tensor]:
         """
         Encode documents with late chunking (contextual embeddings).
@@ -168,17 +168,15 @@ class PPLXQwen3ContextualModel(PPLXQwen3Model):
             convert_to_numpy: If True, returns list[np.ndarray], otherwise list[torch.Tensor]
             quantization: Quantization type to apply. Options:
                 - "int8": Int8 tanh quantization (default)
-                - "binary": Binary tanh quantization (-1.0 or 1.0)
-                - "ubinary": Unsigned packed binary (uint8, 8x compression)
         Returns:
             List of numpy arrays or tensors (preserves document structure).
-            Each element has shape (n_chunks, hidden_dim) or (n_chunks, hidden_dim // 8) for ubinary.
-            Example: embeddings[0].shape = (2, 1024), embeddings[1].shape = (3, 1024)
             Output type depends on quantization method:
-            - "int8": int8 dtype, values in range [-128, 127], shape (..., hidden_dim)
-            - "binary": float32 dtype, values -1.0 or 1.0, shape (..., hidden_dim)
-            - "ubinary": uint8 dtype, packed bits (8x smaller), shape (..., hidden_dim // 8)
         """
         if not isinstance(documents, list) or not all(
@@ -188,21 +186,13 @@ class PPLXQwen3ContextualModel(PPLXQwen3Model):
                 "Input 'documents' must be a list of lists of strings for contextual encoding."
             )
-        if quantization not in ["int8", "binary", "ubinary"]:
             raise ValueError(
                 f"Unsupported quantization type: '{quantization}'. "
-                f"Supported types are: 'int8', 'binary', 'ubinary'. "
                 f"Got: {type(quantization).__name__} = '{quantization}'"
             )
-        if normalize_embeddings and quantization == "ubinary":
-            raise ValueError(
-                "normalize_embeddings=True is incompatible with quantization='ubinary'. "
-                "Packed binary embeddings (uint8) cannot be normalized because each byte "
-                "represents 8 packed bits, not a single dimension. "
-                "Either set normalize_embeddings=False or use 'binary' quantization instead."
-            )
         self.eval()
         if device is None:

         device: str | torch.device | None = None,
         normalize_embeddings: bool = False,
         convert_to_numpy: bool = True,
+        quantization: Literal["int8", "binary"] = "int8",
     ) -> list[np.ndarray] | list[torch.Tensor]:
         """
         Encode documents with late chunking (contextual embeddings).
             convert_to_numpy: If True, returns list[np.ndarray], otherwise list[torch.Tensor]
             quantization: Quantization type to apply. Options:
                 - "int8": Int8 tanh quantization (default)
+                - "binary": Binary tanh quantization
         Returns:
             List of numpy arrays or tensors (preserves document structure).
+            Each element has shape (n_chunks, hidden_dim).
+            embeddings[0].shape = (2, 1024), embeddings[1].shape = (3, 1024)
             Output type depends on quantization method:
+            - Int8: int8 values in range [-128, 127]
+            - Binary: float values -1.0 or 1.0
         """
         if not isinstance(documents, list) or not all(
                 "Input 'documents' must be a list of lists of strings for contextual encoding."
             )
+        if quantization not in ["int8", "binary"]:
             raise ValueError(
                 f"Unsupported quantization type: '{quantization}'. "
+                f"Supported types are: 'int8', 'binary'. "
                 f"Got: {type(quantization).__name__} = '{quantization}'"
             )
         self.eval()
         if device is None:

st_quantize.py CHANGED Viewed

@@ -1,5 +1,4 @@
 import torch
-import numpy as np
 from typing import Literal
 from sentence_transformers.models import Module
@@ -67,46 +66,17 @@ class BinaryTanhQuantizer(Quantizer):
         return torch.where(x >= 0, 1.0, -1.0)
-class PackedBinaryQuantizer:
-    """
-    Packs binary embeddings into uint8 format for efficient storage.
-    This quantizer applies a binary threshold (x >= 0) and packs 8 consecutive
-    bits into a single uint8 byte using numpy.packbits. This reduces memory
-    usage by 8x compared to float32 and by 4x compared to int8.
-    IMPORTANT: This is an inference-only quantizer - it is not differentiable
-    and should only be used for encoding/inference, not during training.
-    Args:
-        x: Input tensor of any float dtype, shape (..., embedding_dim)
-    Returns:
-        Packed binary tensor of dtype uint8, shape (..., embedding_dim // 8)
-    Example:
-        >>> quantizer = PackedBinaryQuantizer()
-        >>> embeddings = torch.randn(2, 1024)  # float32
-        >>> packed = quantizer(embeddings)     # uint8, shape (2, 128)
-    """
-    def __call__(self, x: torch.Tensor) -> torch.Tensor:
-        bits = np.where(x.cpu().numpy() >= 0, True, False)
-        packed = np.packbits(bits, axis=-1)
-        return torch.from_numpy(packed).to(x.device)
 class FlexibleQuantizer(Module):
     def __init__(self):
         super().__init__()
         self._int8_quantizer = Int8TanhQuantizer()
         self._binary_quantizer = BinaryTanhQuantizer()
-        self._packed_binary_quantizer = PackedBinaryQuantizer()
     def forward(
         self,
         features: dict[str, torch.Tensor],
-        quantization: Literal["int8", "binary", "ubinary"] = "int8",
-        **kwargs,
     ) -> dict[str, torch.Tensor]:
         if quantization == "int8":
             features["sentence_embedding"] = self._int8_quantizer(
@@ -116,13 +86,9 @@ class FlexibleQuantizer(Module):
             features["sentence_embedding"] = self._binary_quantizer(
                 features["sentence_embedding"]
             )
-        elif quantization == "ubinary":
-            features["sentence_embedding"] = self._packed_binary_quantizer(
-                features["sentence_embedding"]
-            )
         else:
             raise ValueError(
-                f"Invalid quantization type: {quantization}. Must be 'binary', 'ubinary', or 'int8'."
             )
         return features
@@ -138,6 +104,6 @@ class FlexibleQuantizer(Module):
         **kwargs,
     ):
         return cls()
-    def save(self, output_path: str, *args, **kwargs) -> None:
         return

 import torch
 from typing import Literal
 from sentence_transformers.models import Module
         return torch.where(x >= 0, 1.0, -1.0)
 class FlexibleQuantizer(Module):
     def __init__(self):
         super().__init__()
         self._int8_quantizer = Int8TanhQuantizer()
         self._binary_quantizer = BinaryTanhQuantizer()
     def forward(
         self,
         features: dict[str, torch.Tensor],
+        quantization: Literal["binary", "int8"] = "int8",
+        **kwargs
     ) -> dict[str, torch.Tensor]:
         if quantization == "int8":
             features["sentence_embedding"] = self._int8_quantizer(
             features["sentence_embedding"] = self._binary_quantizer(
                 features["sentence_embedding"]
             )
         else:
             raise ValueError(
+                f"Invalid quantization type: {quantization}. Must be 'binary' or 'int8'."
             )
         return features
         **kwargs,
     ):
         return cls()
+    def save(self, output_path: str, *args, **kwargs) -> None:
         return