perplexity-ai
/

pplx-embed-v1-4b

@@ -15,7 +15,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from typing import Optional, Tuple
 import numpy as np
 import torch
@@ -26,7 +26,7 @@ from transformers.modeling_utils import PreTrainedModel
 from transformers.modeling_outputs import BaseModelOutputWithPast
 from .configuration import PPLXQwen3Config
-from .st_quantize import Int8TanhQuantizer
 # Activation functions mapping
@@ -553,7 +553,7 @@ class PPLXQwen3ContextualModel(PPLXQwen3PreTrainedModel):
         super().__init__(config)
         self.model = PPLXQwen3Model(config)
         self.tokenizer = AutoTokenizer.from_pretrained(config._name_or_path)
-        self.quantizer = Int8TanhQuantizer(hard=True)
         self.post_init()
     def forward(
@@ -594,6 +594,7 @@ class PPLXQwen3ContextualModel(PPLXQwen3PreTrainedModel):
         device: str | torch.device | None = None,
         normalize_embeddings: bool = False,
         convert_to_numpy: bool = True,
     ) -> list[np.ndarray] | list[torch.Tensor]:
         """
         Encode documents with late chunking (contextual embeddings).
@@ -605,8 +606,9 @@ class PPLXQwen3ContextualModel(PPLXQwen3PreTrainedModel):
         1. Concatenate chunks with separator tokens
         2. Run forward pass to get token embeddings
         3. Extract and pool individual chunk embeddings (late chunking)
-        4. Apply quantization (Int8 tanh quantization)
-        5. Convert to numpy or return as tensors
         Args:
             documents: List of documents, where each document is a list of text chunks.
@@ -614,14 +616,19 @@ class PPLXQwen3ContextualModel(PPLXQwen3PreTrainedModel):
             batch_size: Batch size for encoding
             show_progress_bar: Show progress bar during encoding
             device: Device to use for computation (defaults to model's device)
-            normalize_embeddings: Normalize embeddings to unit length (applied before quantization)
             convert_to_numpy: If True, returns list[np.ndarray], otherwise list[torch.Tensor]
         Returns:
             List of numpy arrays or tensors (preserves document structure).
             Each element has shape (n_chunks, hidden_dim).
             embeddings[0].shape = (2, 1024), embeddings[1].shape = (3, 1024)
-            With quantization, embeddings are int8 values in range [-128, 127].
         """
         if not isinstance(documents, list) or not all(
@@ -631,6 +638,13 @@ class PPLXQwen3ContextualModel(PPLXQwen3PreTrainedModel):
                 "Input 'documents' must be a list of lists of strings for contextual encoding."
             )
         self.eval()
         if device is None:
@@ -676,10 +690,12 @@ class PPLXQwen3ContextualModel(PPLXQwen3PreTrainedModel):
                 for doc_chunks in batch_chunk_embeddings
             ]
-            if self.quantizer is not None:
-                batch_chunk_embeddings = [
-                    self.quantizer(emb) for emb in batch_chunk_embeddings
-                ]
             if normalize_embeddings:
                 batch_chunk_embeddings = [
@@ -691,7 +707,6 @@ class PPLXQwen3ContextualModel(PPLXQwen3PreTrainedModel):
             all_embeddings.extend(batch_chunk_embeddings)
-        # Convert to numpy if requested
         if convert_to_numpy:
             all_embeddings = [emb.numpy() for emb in all_embeddings]

 # See the License for the specific language governing permissions and
 # limitations under the License.
+from typing import Optional, Tuple, Literal
 import numpy as np
 import torch
 from transformers.modeling_outputs import BaseModelOutputWithPast
 from .configuration import PPLXQwen3Config
+from .st_quantize import FlexibleQuantizer
 # Activation functions mapping
         super().__init__(config)
         self.model = PPLXQwen3Model(config)
         self.tokenizer = AutoTokenizer.from_pretrained(config._name_or_path)
+        self._flexible_quantizer = FlexibleQuantizer()
         self.post_init()
     def forward(
         device: str | torch.device | None = None,
         normalize_embeddings: bool = False,
         convert_to_numpy: bool = True,
+        quantization: Literal["int8", "binary"] = "int8",
     ) -> list[np.ndarray] | list[torch.Tensor]:
         """
         Encode documents with late chunking (contextual embeddings).
         1. Concatenate chunks with separator tokens
         2. Run forward pass to get token embeddings
         3. Extract and pool individual chunk embeddings (late chunking)
+        4. Apply quantization (Int8 or binary, always enabled)
+        5. Normalize embeddings if requested (applied after quantization)
+        6. Convert to numpy or return as tensors
         Args:
             documents: List of documents, where each document is a list of text chunks.
             batch_size: Batch size for encoding
             show_progress_bar: Show progress bar during encoding
             device: Device to use for computation (defaults to model's device)
+            normalize_embeddings: Normalize embeddings to unit length (applied after quantization)
             convert_to_numpy: If True, returns list[np.ndarray], otherwise list[torch.Tensor]
+            quantization: Quantization type to apply. Options:
+                - "int8": Int8 tanh quantization (default)
+                - "binary": Binary tanh quantization
         Returns:
             List of numpy arrays or tensors (preserves document structure).
             Each element has shape (n_chunks, hidden_dim).
             embeddings[0].shape = (2, 1024), embeddings[1].shape = (3, 1024)
+            Output type depends on quantization method:
+            - Int8: int8 values in range [-128, 127]
+            - Binary: float values -1.0 or 1.0
         """
         if not isinstance(documents, list) or not all(
                 "Input 'documents' must be a list of lists of strings for contextual encoding."
             )
+        if quantization not in ["int8", "binary"]:
+            raise ValueError(
+                f"Unsupported quantization type: '{quantization}'. "
+                f"Supported types are: 'int8', 'binary'. "
+                f"Got: {type(quantization).__name__} = '{quantization}'"
+            )
         self.eval()
         if device is None:
                 for doc_chunks in batch_chunk_embeddings
             ]
+            batch_chunk_embeddings = [
+                self._flexible_quantizer(
+                    {"sentence_embedding": emb}, quantization=quantization
+                )["sentence_embedding"]
+                for emb in batch_chunk_embeddings
+            ]
             if normalize_embeddings:
                 batch_chunk_embeddings = [
             all_embeddings.extend(batch_chunk_embeddings)
         if convert_to_numpy:
             all_embeddings = [emb.numpy() for emb in all_embeddings]

st_quantize.py CHANGED Viewed

@@ -24,9 +24,7 @@ class Quantizer(torch.nn.Module):
             result = soft
         else:
             result = (
-                self._hard_quantize(x, *args, **kwargs).detach()
-                + soft
-                - soft.detach()
             )
         return result
@@ -53,13 +51,13 @@ class Int8TanhQuantizer(Quantizer):
 class BinaryTanhQuantizer(Quantizer):
     def __init__(
-        self,
         hard: bool = True,
         scale: float = 1.0,
     ):
         # `hard` is handed unchanged to the Quantizer base constructor.
         # NOTE(review): presumably it selects hard vs. soft quantization
         # in the base forward pass - confirm against Quantizer.
         super().__init__(hard)
         # Multiplier applied to the input before tanh in _soft_quantize.
         self._scale = scale
     def _soft_quantize(self, x, *args, **kwargs):
         """Return tanh of the input multiplied by ``self._scale``.

         Extra positional and keyword arguments are accepted but not used here.
         """
         return torch.tanh(self._scale * x)
@@ -73,7 +71,11 @@ class FlexibleQuantizer(torch.nn.Module):
         self._int8_quantizer = Int8TanhQuantizer()
         self._binary_quantizer = BinaryTanhQuantizer()
-    def forward(self, features: dict[str, torch.Tensor], quantization: Literal["binary", "int8"] = "int8") -> dict[str, torch.Tensor]:
         if quantization == "int8":
             features["sentence_embedding"] = self._int8_quantizer(
                 features["sentence_embedding"]
@@ -83,10 +85,11 @@ class FlexibleQuantizer(torch.nn.Module):
                 features["sentence_embedding"]
             )
         else:
-            raise ValueError(f"Invalid quantization type: {quantization}. Must be 'binary' or 'int8'.")
         return features
     @classmethod
     def load(cls, input_path: str):
         """Build an instance of ``cls`` by calling it with no arguments.

         ``input_path`` is accepted but never read: nothing is loaded from
         disk by this method.
         NOTE(review): the signature presumably mirrors a module-loading
         hook expected by the caller - confirm.
         """
         return cls()

             result = soft
         else:
             result = (
+                self._hard_quantize(x, *args, **kwargs).detach() + soft - soft.detach()
             )
         return result
 class BinaryTanhQuantizer(Quantizer):
     def __init__(
+        self,
         hard: bool = True,
         scale: float = 1.0,
     ):
         # `hard` is handed unchanged to the Quantizer base constructor.
         # NOTE(review): presumably it selects hard vs. soft quantization
         # in the base forward pass - confirm against Quantizer.
         super().__init__(hard)
         # Multiplier applied to the input before tanh in _soft_quantize.
         self._scale = scale
     def _soft_quantize(self, x, *args, **kwargs):
         """Return tanh of the input multiplied by ``self._scale``.

         Extra positional and keyword arguments are accepted but not used here.
         """
         return torch.tanh(self._scale * x)
         self._int8_quantizer = Int8TanhQuantizer()
         self._binary_quantizer = BinaryTanhQuantizer()
+    def forward(
+        self,
+        features: dict[str, torch.Tensor],
+        quantization: Literal["binary", "int8"] = "int8",
+    ) -> dict[str, torch.Tensor]:
         if quantization == "int8":
             features["sentence_embedding"] = self._int8_quantizer(
                 features["sentence_embedding"]
                 features["sentence_embedding"]
             )
         else:
+            raise ValueError(
+                f"Invalid quantization type: {quantization}. Must be 'binary' or 'int8'."
+            )
         return features
     @classmethod
     def load(cls, input_path: str):
         """Build an instance of ``cls`` by calling it with no arguments.

         ``input_path`` is accepted but never read: nothing is loaded from
         disk by this method.
         NOTE(review): the signature presumably mirrors a module-loading
         hook expected by the caller - confirm.
         """
         return cls()