Files changed (3) hide show
  1. README.md +2 -2
  2. modeling.py +19 -8
  3. st_quantize.py +39 -5
README.md CHANGED
@@ -34,8 +34,8 @@ language:
34
  |:-----:|:----------:|:-------:|:---:|:------------:|:-----------:|:-------:|
35
  | `pplx-embed-v1-0.6B` | 1024 | 32K | Yes | INT8/BINARY | No | Mean |
36
  | `pplx-embed-v1-4B` | 2560 | 32K | Yes | INT8/BINARY | No | Mean |
37
- | `pplx-embed-context-v1-0.6B` | 1024 | 32K | Yes | INT8/BINARY | No | Mean |
38
- | `pplx-embed-context-v1-4B` | 2560 | 32K | Yes | INT8/BINARY | No | Mean |
39
 
40
  <sub>All models are built on diffusion continued pre-trained Qwen3 at Perplexity AI.</sub>
41
 
 
34
  |:-----:|:----------:|:-------:|:---:|:------------:|:-----------:|:-------:|
35
  | `pplx-embed-v1-0.6B` | 1024 | 32K | Yes | INT8/BINARY | No | Mean |
36
  | `pplx-embed-v1-4B` | 2560 | 32K | Yes | INT8/BINARY | No | Mean |
37
+ | `pplx-embed-context-v1-0.6B` | 1024 | 32K | Yes | INT8/BINARY/UBINARY | No | Mean |
38
+ | `pplx-embed-context-v1-4B` | 2560 | 32K | Yes | INT8/BINARY/UBINARY | No | Mean |
39
 
40
  <sub>All models are built on diffusion continued pre-trained Qwen3 at Perplexity AI.</sub>
41
 
modeling.py CHANGED
@@ -12,6 +12,7 @@ from transformers import AutoTokenizer
12
  from .st_quantize import FlexibleQuantizer
13
 
14
 
 
15
  def bidirectional_mask_function(attention_mask: torch.Tensor | None) -> Callable:
16
  """
17
  This creates bidirectional attention mask.
@@ -141,7 +142,7 @@ class PPLXQwen3ContextualModel(PPLXQwen3Model):
141
  device: str | torch.device | None = None,
142
  normalize_embeddings: bool = False,
143
  convert_to_numpy: bool = True,
144
- quantization: Literal["int8", "binary"] = "int8",
145
  ) -> list[np.ndarray] | list[torch.Tensor]:
146
  """
147
  Encode documents with late chunking (contextual embeddings).
@@ -167,15 +168,17 @@ class PPLXQwen3ContextualModel(PPLXQwen3Model):
167
  convert_to_numpy: If True, returns list[np.ndarray], otherwise list[torch.Tensor]
168
  quantization: Quantization type to apply. Options:
169
  - "int8": Int8 tanh quantization (default)
170
- - "binary": Binary tanh quantization
 
171
 
172
  Returns:
173
  List of numpy arrays or tensors (preserves document structure).
174
- Each element has shape (n_chunks, hidden_dim).
175
- embeddings[0].shape = (2, 1024), embeddings[1].shape = (3, 1024)
176
  Output type depends on quantization method:
177
- - Int8: int8 values in range [-128, 127]
178
- - Binary: float values -1.0 or 1.0
 
179
  """
180
 
181
  if not isinstance(documents, list) or not all(
@@ -185,13 +188,21 @@ class PPLXQwen3ContextualModel(PPLXQwen3Model):
185
  "Input 'documents' must be a list of lists of strings for contextual encoding."
186
  )
187
 
188
- if quantization not in ["int8", "binary"]:
189
  raise ValueError(
190
  f"Unsupported quantization type: '{quantization}'. "
191
- f"Supported types are: 'int8', 'binary'. "
192
  f"Got: {type(quantization).__name__} = '{quantization}'"
193
  )
194
 
 
 
 
 
 
 
 
 
195
  self.eval()
196
 
197
  if device is None:
 
12
  from .st_quantize import FlexibleQuantizer
13
 
14
 
15
+ # From modeling_t5gemma.py
16
  def bidirectional_mask_function(attention_mask: torch.Tensor | None) -> Callable:
17
  """
18
  This creates bidirectional attention mask.
 
142
  device: str | torch.device | None = None,
143
  normalize_embeddings: bool = False,
144
  convert_to_numpy: bool = True,
145
+ quantization: Literal["int8", "binary", "ubinary"] = "int8",
146
  ) -> list[np.ndarray] | list[torch.Tensor]:
147
  """
148
  Encode documents with late chunking (contextual embeddings).
 
168
  convert_to_numpy: If True, returns list[np.ndarray], otherwise list[torch.Tensor]
169
  quantization: Quantization type to apply. Options:
170
  - "int8": Int8 tanh quantization (default)
171
+ - "binary": Binary tanh quantization (-1.0 or 1.0)
172
+ - "ubinary": Unsigned packed binary (uint8, 8x compression)
173
 
174
  Returns:
175
  List of numpy arrays or tensors (preserves document structure).
176
+ Each element has shape (n_chunks, hidden_dim) or (n_chunks, hidden_dim // 8) for ubinary.
177
+ Example: embeddings[0].shape = (2, 1024), embeddings[1].shape = (3, 1024)
178
  Output type depends on quantization method:
179
+ - "int8": int8 dtype, values in range [-128, 127], shape (..., hidden_dim)
180
+ - "binary": float32 dtype, values -1.0 or 1.0, shape (..., hidden_dim)
181
+ - "ubinary": uint8 dtype, packed bits (8x smaller), shape (..., hidden_dim // 8)
182
  """
183
 
184
  if not isinstance(documents, list) or not all(
 
188
  "Input 'documents' must be a list of lists of strings for contextual encoding."
189
  )
190
 
191
+ if quantization not in ["int8", "binary", "ubinary"]:
192
  raise ValueError(
193
  f"Unsupported quantization type: '{quantization}'. "
194
+ f"Supported types are: 'int8', 'binary', 'ubinary'. "
195
  f"Got: {type(quantization).__name__} = '{quantization}'"
196
  )
197
 
198
+ if normalize_embeddings and quantization == "ubinary":
199
+ raise ValueError(
200
+ "normalize_embeddings=True is incompatible with quantization='ubinary'. "
201
+ "Packed binary embeddings (uint8) cannot be normalized because each byte "
202
+ "represents 8 packed bits, not a single dimension. "
203
+ "Either set normalize_embeddings=False or use 'binary' quantization instead."
204
+ )
205
+
206
  self.eval()
207
 
208
  if device is None:
st_quantize.py CHANGED
@@ -1,4 +1,5 @@
1
  import torch
 
2
  from typing import Literal
3
  from sentence_transformers.models import Module
4
 
@@ -66,17 +67,46 @@ class BinaryTanhQuantizer(Quantizer):
66
  return torch.where(x >= 0, 1.0, -1.0)
67
 
68
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
69
  class FlexibleQuantizer(Module):
70
  def __init__(self):
71
  super().__init__()
72
  self._int8_quantizer = Int8TanhQuantizer()
73
  self._binary_quantizer = BinaryTanhQuantizer()
 
74
 
75
  def forward(
76
  self,
77
  features: dict[str, torch.Tensor],
78
- quantization: Literal["binary", "int8"] = "int8",
79
- **kwargs
80
  ) -> dict[str, torch.Tensor]:
81
  if quantization == "int8":
82
  features["sentence_embedding"] = self._int8_quantizer(
@@ -86,9 +116,13 @@ class FlexibleQuantizer(Module):
86
  features["sentence_embedding"] = self._binary_quantizer(
87
  features["sentence_embedding"]
88
  )
 
 
 
 
89
  else:
90
  raise ValueError(
91
- f"Invalid quantization type: {quantization}. Must be 'binary' or 'int8'."
92
  )
93
  return features
94
 
@@ -104,6 +138,6 @@ class FlexibleQuantizer(Module):
104
  **kwargs,
105
  ):
106
  return cls()
107
-
108
- def save(self, output_path: str, *args, **kwargs) -> None:
109
  return
 
1
  import torch
2
+ import numpy as np
3
  from typing import Literal
4
  from sentence_transformers.models import Module
5
 
 
67
  return torch.where(x >= 0, 1.0, -1.0)
68
 
69
 
70
+ class PackedBinaryQuantizer:
71
+ """
72
+ Packs binary embeddings into uint8 format for efficient storage.
73
+
74
+ This quantizer applies a binary threshold (x >= 0) and packs 8 consecutive
75
+ bits into a single uint8 byte using numpy.packbits. This reduces memory
76
+ usage by 32x compared to float32 and by 8x compared to int8.
77
+
78
+ IMPORTANT: This is an inference-only quantizer - it is not differentiable
79
+ and should only be used for encoding/inference, not during training.
80
+
81
+ Args:
82
+ x: Input tensor of any float dtype, shape (..., embedding_dim)
83
+
84
+ Returns:
85
+ Packed binary tensor of dtype uint8, shape (..., embedding_dim // 8)
86
+
87
+ Example:
88
+ >>> quantizer = PackedBinaryQuantizer()
89
+ >>> embeddings = torch.randn(2, 1024) # float32
90
+ >>> packed = quantizer(embeddings) # uint8, shape (2, 128)
91
+ """
92
+ def __call__(self, x: torch.Tensor) -> torch.Tensor:
93
+ bits = np.where(x.cpu().numpy() >= 0, True, False)
94
+ packed = np.packbits(bits, axis=-1)
95
+ return torch.from_numpy(packed).to(x.device)
96
+
97
+
98
  class FlexibleQuantizer(Module):
99
  def __init__(self):
100
  super().__init__()
101
  self._int8_quantizer = Int8TanhQuantizer()
102
  self._binary_quantizer = BinaryTanhQuantizer()
103
+ self._packed_binary_quantizer = PackedBinaryQuantizer()
104
 
105
  def forward(
106
  self,
107
  features: dict[str, torch.Tensor],
108
+ quantization: Literal["int8", "binary", "ubinary"] = "int8",
109
+ **kwargs,
110
  ) -> dict[str, torch.Tensor]:
111
  if quantization == "int8":
112
  features["sentence_embedding"] = self._int8_quantizer(
 
116
  features["sentence_embedding"] = self._binary_quantizer(
117
  features["sentence_embedding"]
118
  )
119
+ elif quantization == "ubinary":
120
+ features["sentence_embedding"] = self._packed_binary_quantizer(
121
+ features["sentence_embedding"]
122
+ )
123
  else:
124
  raise ValueError(
125
+ f"Invalid quantization type: {quantization}. Must be 'binary', 'ubinary', or 'int8'."
126
  )
127
  return features
128
 
 
138
  **kwargs,
139
  ):
140
  return cls()
141
+
142
+ def save(self, output_path: str, *args, **kwargs) -> None:
143
  return