perplexity-ai
/

pplx-embed-v1-0.6b

Feature Extraction

sentence-transformers

bidirectional_pplx_qwen3

sentence-similarity

text-embeddings-inference

Model card Files Files and versions

mkrimmel-pplx committed on Feb 4

Commit

2958ec7

·

1 Parent(s): ab9dcdc

feat: add quantization

Files changed (1) hide show

st_quantize.py +14 -1

st_quantize.py CHANGED Viewed

@@ -1,4 +1,5 @@
 import torch
 from typing import Literal
 from sentence_transformers.models import Module
@@ -64,6 +65,13 @@ class BinaryTanhQuantizer(Quantizer):
     def _hard_quantize(self, x, *args, **kwargs):
         return torch.where(x >= 0, 1.0, -1.0)
 class FlexibleQuantizer(Module):
@@ -71,11 +79,12 @@ class FlexibleQuantizer(Module):
         super().__init__()
         self._int8_quantizer = Int8TanhQuantizer()
         self._binary_quantizer = BinaryTanhQuantizer()
     def forward(
         self,
         features: dict[str, torch.Tensor],
-        quantization: Literal["binary", "int8"] = "int8",
         **kwargs
     ) -> dict[str, torch.Tensor]:
         if quantization == "int8":
@@ -86,6 +95,10 @@ class FlexibleQuantizer(Module):
             features["sentence_embedding"] = self._binary_quantizer(
                 features["sentence_embedding"]
             )
         else:
             raise ValueError(
                 f"Invalid quantization type: {quantization}. Must be 'binary' or 'int8'."

 import torch
+import numpy as np
 from typing import Literal
 from sentence_transformers.models import Module
     def _hard_quantize(self, x, *args, **kwargs):
         return torch.where(x >= 0, 1.0, -1.0)
class PackedBinaryQuantizer:
    """Quantize a tensor to its sign bits and pack them eight per byte."""

    def __call__(self, x: torch.Tensor) -> torch.Tensor:
        """Return the bit-packed sign pattern of ``x``.

        Each element becomes one bit: 1 where the value is ``>= 0`` and 0
        otherwise. Bits are packed along the last axis with
        ``numpy.packbits`` (most-significant bit first; a trailing partial
        byte is zero-padded), so a last axis of length ``d`` shrinks to
        ``ceil(d / 8)``.

        Args:
            x: Input tensor; packing runs along its last axis.

        Returns:
            A ``torch.uint8`` tensor placed on ``x.device``.
        """
        # Threshold in torch before handing off to NumPy: the resulting bool
        # tensor carries no autograd history and has a NumPy-compatible dtype,
        # so inputs that require grad or use dtypes NumPy lacks (e.g.
        # bfloat16) no longer make ``.numpy()`` raise.
        bits = (x.detach() >= 0).cpu().numpy()
        packed = np.packbits(bits, axis=-1)
        return torch.from_numpy(packed).to(x.device)
 class FlexibleQuantizer(Module):
         super().__init__()
         self._int8_quantizer = Int8TanhQuantizer()
         self._binary_quantizer = BinaryTanhQuantizer()
+        self._packed_binary_quantizer = PackedBinaryQuantizer()
     def forward(
         self,
         features: dict[str, torch.Tensor],
+        quantization: Literal["int8", "binary", "ubinary"] = "int8",
         **kwargs
     ) -> dict[str, torch.Tensor]:
         if quantization == "int8":
             features["sentence_embedding"] = self._binary_quantizer(
                 features["sentence_embedding"]
             )
+        elif quantization == "ubinary":
+            features["sentence_embedding"] = self._packed_binary_quantizer(
+                features["sentence_embedding"]
+            )
         else:
             raise ValueError(
                 f"Invalid quantization type: {quantization}. Must be 'binary' or 'int8'."