Add new modeling with contextual encoding

#2
Files changed (6) hide show
  1. config.json +5 -7
  2. configuration.py +5 -0
  3. configuration_qwen3.py +0 -206
  4. modeling.py +336 -0
  5. modules.json +3 -2
  6. st_quantize.py +50 -62
config.json CHANGED
@@ -1,13 +1,12 @@
1
  {
2
  "architectures": [
3
- "Qwen3Model"
4
  ],
5
  "attention_bias": false,
6
  "attention_dropout": 0.0,
7
  "auto_map": {
8
- "AutoConfig": "configuration_qwen3.Qwen3Config",
9
- "AutoModel": "perplexity-ai/bidirectional-qwen3-implementation--modeling_qwen3.Qwen3Model",
10
- "AutoModelForMaskedLM": "modeling_qwen3.Qwen3ForMaskedLM"
11
  },
12
  "bos_token_id": 151643,
13
  "dtype": "float32",
@@ -49,8 +48,7 @@
49
  ],
50
  "max_position_embeddings": 32768,
51
  "max_window_layers": 28,
52
- "mlm_loss_variant": "elbo_normalize",
53
- "model_type": "qwen3",
54
  "num_attention_heads": 16,
55
  "num_hidden_layers": 28,
56
  "num_key_value_heads": 8,
@@ -65,6 +63,6 @@
65
  "transformers_version": "5.0.0.dev0",
66
  "use_cache": false,
67
  "use_sliding_window": false,
68
- "variant": "bidirectional",
69
  "vocab_size": 151936
70
  }
 
1
  {
2
  "architectures": [
3
+ "PPLXQwen3Model"
4
  ],
5
  "attention_bias": false,
6
  "attention_dropout": 0.0,
7
  "auto_map": {
8
+ "AutoConfig": "configuration.PPLXQwen3Config",
9
+ "AutoModel": "modeling.PPLXQwen3ContextualModel"
 
10
  },
11
  "bos_token_id": 151643,
12
  "dtype": "float32",
 
48
  ],
49
  "max_position_embeddings": 32768,
50
  "max_window_layers": 28,
51
+ "model_type": "bidirectional_pplx_qwen3",
 
52
  "num_attention_heads": 16,
53
  "num_hidden_layers": 28,
54
  "num_key_value_heads": 8,
 
63
  "transformers_version": "5.0.0.dev0",
64
  "use_cache": false,
65
  "use_sliding_window": false,
66
+ "attn_implementation": "sdpa",
67
  "vocab_size": 151936
68
  }
configuration.py ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ from transformers.models.qwen3.configuration_qwen3 import Qwen3Config
2
+
3
+
4
class PPLXQwen3Config(Qwen3Config):
    """Configuration for the bidirectional PPLX Qwen3 encoder.

    Identical to ``Qwen3Config`` except for ``model_type``, which is renamed
    so that ``AutoConfig``/``AutoModel`` resolve to the custom classes listed
    in this repository's ``auto_map``.
    """

    # Must match the "model_type" value in config.json.
    model_type = "bidirectional_pplx_qwen3"
configuration_qwen3.py DELETED
@@ -1,206 +0,0 @@
1
- # coding=utf-8
2
- # Copyright 2024 The Qwen team, Alibaba Group and the HuggingFace Inc. team. All rights reserved.
3
- #
4
- # Licensed under the Apache License, Version 2.0 (the "License");
5
- # you may not use this file except in compliance with the License.
6
- # You may obtain a copy of the License at
7
- #
8
- # http://www.apache.org/licenses/LICENSE-2.0
9
- #
10
- # Unless required by applicable law or agreed to in writing, software
11
- # distributed under the License is distributed on an "AS IS" BASIS,
12
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
- # See the License for the specific language governing permissions and
14
- # limitations under the License.
15
- """Qwen3 model configuration"""
16
-
17
- from typing import Optional, Literal
18
-
19
- import warnings
20
-
21
- from transformers.configuration_utils import PreTrainedConfig, layer_type_validation
22
- from transformers.modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params
23
- from transformers.utils import logging
24
-
25
-
26
- logger = logging.get_logger(__name__)
27
-
28
-
29
- class Qwen3Config(PreTrainedConfig):
30
- r"""
31
- This is the configuration class to store the configuration of a [`Qwen3Model`]. It is used to instantiate a
32
- Qwen3 model according to the specified arguments, defining the model architecture. Instantiating a configuration
33
- with the defaults will yield a similar configuration to that of
34
- Qwen3-8B [Qwen/Qwen3-8B](https://huggingface.co/Qwen/Qwen3-8B).
35
-
36
- Configuration objects inherit from [`PreTrainedConfig`] and can be used to control the model outputs. Read the
37
- documentation from [`PreTrainedConfig`] for more information.
38
-
39
-
40
- Args:
41
- vocab_size (`int`, *optional*, defaults to 151936):
42
- Vocabulary size of the Qwen3 model. Defines the number of different tokens that can be represented by the
43
- `inputs_ids` passed when calling [`Qwen3Model`]
44
- hidden_size (`int`, *optional*, defaults to 4096):
45
- Dimension of the hidden representations.
46
- intermediate_size (`int`, *optional*, defaults to 22016):
47
- Dimension of the MLP representations.
48
- num_hidden_layers (`int`, *optional*, defaults to 32):
49
- Number of hidden layers in the Transformer encoder.
50
- num_attention_heads (`int`, *optional*, defaults to 32):
51
- Number of attention heads for each attention layer in the Transformer encoder.
52
- num_key_value_heads (`int`, *optional*, defaults to 32):
53
- This is the number of key_value heads that should be used to implement Grouped Query Attention. If
54
- `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
55
- `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
56
- converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
57
- by meanpooling all the original heads within that group. For more details, check out [this
58
- paper](https://huggingface.co/papers/2305.13245). If it is not specified, will default to `32`.
59
- head_dim (`int`, *optional*, defaults to 128):
60
- The attention head dimension.
61
- hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
62
- The non-linear activation function (function or string) in the decoder.
63
- max_position_embeddings (`int`, *optional*, defaults to 32768):
64
- The maximum sequence length that this model might ever be used with.
65
- initializer_range (`float`, *optional*, defaults to 0.02):
66
- The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
67
- rms_norm_eps (`float`, *optional*, defaults to 1e-06):
68
- The epsilon used by the rms normalization layers.
69
- use_cache (`bool`, *optional*, defaults to `True`):
70
- Whether or not the model should return the last key/values attentions (not used by all models). Only
71
- relevant if `config.is_decoder=True`.
72
- tie_word_embeddings (`bool`, *optional*, defaults to `False`):
73
- Whether the model's input and output word embeddings should be tied.
74
- rope_parameters (`RopeParameters`, *optional*):
75
- Dictionary containing the configuration parameters for the RoPE embeddings. The dictionaty should contain
76
- a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE
77
- with longer `max_position_embeddings`.
78
- attention_bias (`bool`, defaults to `False`, *optional*, defaults to `False`):
79
- Whether to use a bias in the query, key, value and output projection layers during self-attention.
80
- use_sliding_window (`bool`, *optional*, defaults to `False`):
81
- Whether to use sliding window attention.
82
- sliding_window (`int`, *optional*, defaults to 4096):
83
- Sliding window attention (SWA) window size. If not specified, will default to `4096`.
84
- max_window_layers (`int`, *optional*, defaults to 28):
85
- The number of layers using full attention. The first `max_window_layers` layers will use full attention, while any
86
- additional layer afterwards will use SWA (Sliding Window Attention).
87
- layer_types (`list`, *optional*):
88
- Attention pattern for each layer.
89
- attention_dropout (`float`, *optional*, defaults to 0.0):
90
- The dropout ratio for the attention probabilities.
91
-
92
- ```python
93
- >>> from transformers import Qwen3Model, Qwen3Config
94
-
95
- >>> # Initializing a Qwen3 style configuration
96
- >>> configuration = Qwen3Config()
97
-
98
- >>> # Initializing a model from the Qwen3-8B style configuration
99
- >>> model = Qwen3Model(configuration)
100
-
101
- >>> # Accessing the model configuration
102
- >>> configuration = model.config
103
- ```"""
104
-
105
- model_type = "qwen3"
106
- keys_to_ignore_at_inference = ["past_key_values"]
107
-
108
- # Default tensor parallel plan for base model `Qwen3`
109
- base_model_tp_plan = {
110
- "layers.*.self_attn.q_proj": "colwise",
111
- "layers.*.self_attn.k_proj": "colwise",
112
- "layers.*.self_attn.v_proj": "colwise",
113
- "layers.*.self_attn.o_proj": "rowwise",
114
- "layers.*.mlp.gate_proj": "colwise",
115
- "layers.*.mlp.up_proj": "colwise",
116
- "layers.*.mlp.down_proj": "rowwise",
117
- }
118
- base_model_pp_plan = {
119
- "embed_tokens": (["input_ids"], ["inputs_embeds"]),
120
- "layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
121
- "norm": (["hidden_states"], ["hidden_states"]),
122
- }
123
-
124
- def __init__(
125
- self,
126
- vocab_size: Optional[int] = 151936,
127
- hidden_size: Optional[int] = 4096,
128
- intermediate_size: Optional[int] = 22016,
129
- num_hidden_layers: Optional[int] = 32,
130
- num_attention_heads: Optional[int] = 32,
131
- num_key_value_heads: Optional[int] = 32,
132
- head_dim: Optional[int] = 128,
133
- hidden_act: Optional[str] = "silu",
134
- max_position_embeddings: Optional[int] = 32768,
135
- initializer_range: Optional[float] = 0.02,
136
- rms_norm_eps: Optional[int] = 1e-6,
137
- use_cache: Optional[bool] = True,
138
- tie_word_embeddings: Optional[bool] = False,
139
- rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None,
140
- attention_bias: Optional[bool] = False,
141
- use_sliding_window: Optional[bool] = False,
142
- sliding_window: Optional[int] = 4096,
143
- max_window_layers: Optional[int] = 28,
144
- layer_types: Optional[list[str]] = None,
145
- attention_dropout: Optional[float] = 0.0,
146
- variant: Literal["causal", "bidirectional", "causal_dropout"] = "causal",
147
- mlm_loss_variant: Literal["simple", "masked_normalize", "elbo_normalize", "flat_cart"] = "simple",
148
- **kwargs,
149
- ):
150
- self.vocab_size = vocab_size
151
- self.max_position_embeddings = max_position_embeddings
152
- self.hidden_size = hidden_size
153
- self.intermediate_size = intermediate_size
154
- self.num_hidden_layers = num_hidden_layers
155
- self.num_attention_heads = num_attention_heads
156
- self.use_sliding_window = use_sliding_window
157
- self.sliding_window = sliding_window if self.use_sliding_window else None
158
- self.max_window_layers = max_window_layers
159
-
160
- # for backward compatibility
161
- if num_key_value_heads is None:
162
- num_key_value_heads = num_attention_heads
163
-
164
- self.num_key_value_heads = num_key_value_heads
165
- self.head_dim = head_dim
166
- self.hidden_act = hidden_act
167
- self.initializer_range = initializer_range
168
- self.rms_norm_eps = rms_norm_eps
169
- self.use_cache = use_cache
170
- self.attention_bias = attention_bias
171
- self.attention_dropout = attention_dropout
172
- # Try to set `rope_scaling` if available, otherwise use `rope_parameters`
173
- rope_scaling = kwargs.pop("rope_scaling", None)
174
- self.rope_parameters = rope_scaling or rope_parameters
175
-
176
- self.layer_types = layer_types
177
- if self.layer_types is None:
178
- self.layer_types = [
179
- "sliding_attention"
180
- if self.sliding_window is not None and i >= self.max_window_layers
181
- else "full_attention"
182
- for i in range(self.num_hidden_layers)
183
- ]
184
- layer_type_validation(self.layer_types, self.num_hidden_layers)
185
-
186
- # Validate the correctness of rotary position embeddings parameters
187
- rope_theta = kwargs.get("rope_theta", 10000.0)
188
- standardize_rope_params(self, rope_theta=rope_theta)
189
- rope_config_validation(self)
190
-
191
- self.variant = variant
192
- self.mlm_loss_variant = mlm_loss_variant
193
-
194
- if mlm_loss_variant not in ["simple", "masked_normalize", "elbo_normalize", "flat_cart"]:
195
- raise NotImplementedError(f"Loss variant {mlm_loss_variant} unknown")
196
-
197
- if variant != "causal" and use_cache:
198
- warnings.warn("Cannot use cache (use_cache) and bidirectional attention (is_causal=False)")
199
-
200
- super().__init__(
201
- tie_word_embeddings=tie_word_embeddings,
202
- **kwargs,
203
- )
204
-
205
-
206
- __all__ = ["Qwen3Config"]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
modeling.py ADDED
@@ -0,0 +1,336 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Callable, Literal
2
+ import numpy as np
3
+ import torch
4
+ from transformers import Qwen3Model
5
+ from transformers.cache_utils import Cache
6
+ from transformers.masking_utils import create_causal_mask
7
+ from transformers.modeling_outputs import BaseModelOutputWithPooling
8
+ from transformers.processing_utils import Unpack
9
+ from transformers.utils import TransformersKwargs
10
+ from .configuration import PPLXQwen3Config
11
+ from transformers import AutoTokenizer
12
+ from .st_quantize import FlexibleQuantizer
13
+
14
+
15
+ # From modeling_t5gemma.py
16
+ def bidirectional_mask_function(attention_mask: torch.Tensor | None) -> Callable:
17
+ """
18
+ This creates bidirectional attention mask.
19
+ """
20
+
21
+ def inner_mask(batch_idx: int, head_idx: int, q_idx: int, kv_idx: int) -> bool:
22
+ if attention_mask is None:
23
+ return torch.ones((), dtype=torch.bool)
24
+ return attention_mask[batch_idx, kv_idx].to(torch.bool)
25
+
26
+ return inner_mask
27
+
28
+
29
class PPLXQwen3Model(Qwen3Model):
    """Qwen3 encoder variant whose self-attention is fully bidirectional."""

    _supports_flash_attn = True
    _supports_sdpa = True

    config_class = PPLXQwen3Config

    def __init__(self, config):
        super().__init__(config)
        self.post_init()

    def post_init(self):
        super().post_init()
        # Disable causal masking on every layer. This works together with
        # attn_implementation="flash_attention_2" or "sdpa".
        for decoder_layer in self.layers:
            decoder_layer.self_attn.is_causal = False

    def forward(
        self,
        input_ids: torch.LongTensor | None = None,
        attention_mask: torch.Tensor | None = None,
        position_ids: torch.LongTensor | None = None,
        past_key_values: Cache | None = None,
        inputs_embeds: torch.FloatTensor | None = None,
        use_cache: bool | None = None,
        cache_position: torch.LongTensor | None = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> BaseModelOutputWithPooling:
        """Run the encoder with a bidirectional (non-causal) attention mask."""
        if inputs_embeds is None:
            inputs_embeds = self.embed_tokens(input_ids)
            input_ids = None

        # Dummy positions imitating a fresh, cache-free forward pass.
        dummy_cache_position = torch.arange(
            inputs_embeds.shape[1], device=inputs_embeds.device, dtype=torch.long
        )
        # OR-ing the bidirectional callback into the causal mask lifts the
        # causality constraint while still hiding padding key positions.
        full_mask = create_causal_mask(
            config=self.config,
            input_embeds=inputs_embeds,
            attention_mask=attention_mask,
            cache_position=dummy_cache_position,
            past_key_values=None,
            position_ids=position_ids,
            or_mask_function=bidirectional_mask_function(attention_mask),
        )
        attention_mask = {"full_attention": full_mask}

        return super().forward(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            cache_position=cache_position,
            **kwargs,
        )
87
+
88
+
89
class PPLXQwen3ContextualModel(PPLXQwen3Model):
    """
    Qwen3 model with contextual encoding support for late chunking.

    This model extends PPLXQwen3Model with an encode() method that performs
    contextual encoding of list[list[str]] documents via late chunking.

    IMPORTANT: This model MUST be loaded with trust_remote_code=True:

        from transformers import AutoModel

        model = AutoModel.from_pretrained(
            "path/to/model",
            trust_remote_code=True  # REQUIRED!
        )

        embeddings = model.encode([["chunk1", "chunk2"]])

    Loading without trust_remote_code=True will fail to load this custom model class.
    """

    config_class = PPLXQwen3Config

    def __init__(self, config):
        super().__init__(config)

        if not isinstance(config, PPLXQwen3Config):
            raise TypeError(
                f"PPLXQwen3ContextualModel requires PPLXQwen3Config, got {type(config).__name__}. "
                f"Did you forget to load with trust_remote_code=True?"
            )

        # Tokenizer is loaded from the same checkpoint directory as the model.
        self.tokenizer = AutoTokenizer.from_pretrained(config._name_or_path)
        self._flexible_quantizer = FlexibleQuantizer()

    @staticmethod
    def mean_pooling(
        token_embeddings: torch.Tensor, attention_mask: torch.Tensor
    ) -> torch.Tensor:
        """Mean-pool token embeddings over valid (mask == 1) positions.

        Args:
            token_embeddings: (batch, seq_len, hidden_dim) token vectors.
            attention_mask: (batch, seq_len) 0/1 validity mask.

        Returns:
            (batch, hidden_dim) pooled embeddings. The denominator is clamped
            so fully-masked rows do not divide by zero.
        """
        input_mask_expanded = (
            attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        )
        return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(
            input_mask_expanded.sum(1), min=1e-9
        )

    @torch.inference_mode()
    def encode(
        self,
        documents: list[list[str]],
        batch_size: int = 32,
        show_progress_bar: bool = False,
        device: str | torch.device | None = None,
        normalize_embeddings: bool = False,
        convert_to_numpy: bool = True,
        quantization: Literal["int8", "binary"] = "int8",
    ) -> list[np.ndarray] | list[torch.Tensor]:
        """
        Encode documents with late chunking (contextual embeddings).

        This model is designed specifically for contextual encoding and always
        expects documents as nested lists where each document is a list of
        text chunks.

        The encoding process:
        1. Concatenate chunks with separator tokens
        2. Run forward pass to get token embeddings
        3. Extract and pool individual chunk embeddings (late chunking)
        4. Apply quantization (int8 or binary, always enabled)
        5. Normalize embeddings if requested (applied after quantization)
        6. Convert to numpy or return as tensors

        Args:
            documents: List of documents, where each document is a list of
                text chunks, e.g. [["chunk1", "chunk2"], ["c1", "c2", "c3"]].
            batch_size: Number of documents encoded per forward pass.
            show_progress_bar: Show a tqdm progress bar if tqdm is installed.
            device: Device for computation (defaults to the model's device).
            normalize_embeddings: L2-normalize embeddings after quantization.
            convert_to_numpy: If True return list[np.ndarray], else tensors.
            quantization: "int8" (tanh int8, default) or "binary" (tanh sign).

        Returns:
            List of arrays/tensors preserving document structure; element i has
            shape (n_chunks_i, hidden_dim). Int8 output holds integer values in
            [-128, 127]; binary output holds -1.0 / 1.0.

        Raises:
            TypeError: If ``documents`` is not a list of lists.
            ValueError: If ``quantization`` is unsupported, or the tokenizer
                defines no separator token.
        """
        if not isinstance(documents, list) or not all(
            isinstance(doc, list) for doc in documents
        ):
            raise TypeError(
                "Input 'documents' must be a list of lists of strings for contextual encoding."
            )

        if quantization not in ["int8", "binary"]:
            raise ValueError(
                f"Unsupported quantization type: '{quantization}'. "
                f"Supported types are: 'int8', 'binary'. "
                f"Got: {type(quantization).__name__} = '{quantization}'"
            )

        # Fail fast with an actionable message instead of the opaque
        # "'NoneType' object has no attribute 'join'" raised below when the
        # tokenizer defines no separator token (Qwen tokenizers often don't).
        if self.tokenizer.sep_token is None or self.tokenizer.sep_token_id is None:
            raise ValueError(
                "Contextual encoding requires the tokenizer to define a sep_token. "
                "Set `tokenizer.sep_token` before calling encode()."
            )

        self.eval()

        if device is None:
            device = next(self.parameters()).device

        all_embeddings = []

        range_iter = range(0, len(documents), batch_size)
        if show_progress_bar:
            try:
                from tqdm import tqdm

                range_iter = tqdm(range_iter, desc="Encoding documents")
            except ImportError:
                # Progress bar is best-effort; encoding proceeds without it.
                pass

        for i in range_iter:
            batch_docs = documents[i : i + batch_size]

            # 1. Concatenate each document's chunks with the separator token.
            doc_strings = [
                self.tokenizer.sep_token.join(chunks) for chunks in batch_docs
            ]

            inputs = self.tokenizer(
                doc_strings,
                padding=True,
                truncation=True,
                return_tensors="pt",
            )
            inputs = {k: v.to(device) for k, v in inputs.items()}

            # 2. One forward pass over the whole concatenated document, so
            #    every token is contextualized by the full document.
            outputs = self.forward(**inputs)
            token_embeddings = outputs.last_hidden_state

            # 3. Late chunking: split token embeddings back into per-chunk
            #    pooled vectors using the SEP token positions.
            batch_chunk_embeddings = self._extract_chunks_from_concatenated(
                input_ids=inputs["input_ids"],
                token_embeddings=token_embeddings,
                attention_mask=inputs["attention_mask"],
            )

            batch_chunk_embeddings = [
                torch.stack(doc_chunks, dim=0)
                for doc_chunks in batch_chunk_embeddings
            ]

            # 4. Quantization is always applied.
            batch_chunk_embeddings = [
                self._flexible_quantizer(
                    {"sentence_embedding": emb}, quantization=quantization
                )["sentence_embedding"]
                for emb in batch_chunk_embeddings
            ]

            # 5. Optional L2 normalization, deliberately after quantization.
            if normalize_embeddings:
                batch_chunk_embeddings = [
                    torch.nn.functional.normalize(emb, p=2, dim=-1)
                    for emb in batch_chunk_embeddings
                ]

            batch_chunk_embeddings = [emb.cpu() for emb in batch_chunk_embeddings]

            all_embeddings.extend(batch_chunk_embeddings)

        # 6. Convert to numpy if requested.
        if convert_to_numpy:
            all_embeddings = [emb.numpy() for emb in all_embeddings]

        return all_embeddings

    def _extract_chunks_from_concatenated(
        self,
        input_ids: torch.Tensor,
        token_embeddings: torch.Tensor,
        attention_mask: torch.Tensor,
    ) -> list[list[torch.Tensor]]:
        """
        Extract individual chunk embeddings from a concatenated sequence.

        Splits sequences of the form "[chunk1][SEP][chunk2][SEP]..." back into
        per-chunk mean-pooled embeddings by locating SEP token positions.

        Args:
            input_ids: Token IDs (batch_size, seq_len)
            token_embeddings: Token embeddings (batch_size, seq_len, hidden_dim)
            attention_mask: Attention mask (batch_size, seq_len)

        Returns:
            list[list[torch.Tensor]]: One inner list per document, containing
            one (hidden_dim,) embedding per chunk.

        Note:
            The sep_token_id comes from self.tokenizer.sep_token_id and varies
            by tokenizer. If the SEP text occurs naturally inside a chunk it
            will be treated as a boundary — a known limitation of this scheme.
        """
        sep_token_id = self.tokenizer.sep_token_id
        batch_size = input_ids.shape[0]

        all_doc_chunks = []

        for batch_idx in range(batch_size):
            # SEP tokens at padded positions must be ignored.
            valid_positions = attention_mask[batch_idx].bool()
            sep_positions = (
                (input_ids[batch_idx] == sep_token_id) & valid_positions
            ).nonzero(as_tuple=True)[0]

            chunk_embeddings = []
            start_pos = 0

            for sep_pos in sep_positions.tolist():
                chunk_tokens = token_embeddings[batch_idx, start_pos:sep_pos]
                chunk_mask = attention_mask[batch_idx, start_pos:sep_pos]

                chunk_emb = self.mean_pooling(
                    chunk_tokens.unsqueeze(0), chunk_mask.unsqueeze(0)
                ).squeeze(0)

                chunk_embeddings.append(chunk_emb)

                start_pos = sep_pos + 1

            # Handle the last chunk (after the last SEP token).
            last_valid_pos = attention_mask[batch_idx].sum().item()

            chunk_tokens = token_embeddings[batch_idx, start_pos:last_valid_pos]
            chunk_mask = attention_mask[batch_idx, start_pos:last_valid_pos]

            if chunk_mask.sum() > 0:
                chunk_emb = self.mean_pooling(
                    chunk_tokens.unsqueeze(0), chunk_mask.unsqueeze(0)
                ).squeeze(0)
            else:
                # Empty trailing chunk (e.g. document ended with SEP):
                # represent it with a zero embedding.
                chunk_emb = torch.zeros(
                    token_embeddings.shape[-1],
                    device=token_embeddings.device,
                    dtype=token_embeddings.dtype,
                )

            chunk_embeddings.append(chunk_emb)

            all_doc_chunks.append(chunk_embeddings)

        return all_doc_chunks
modules.json CHANGED
@@ -15,6 +15,7 @@
15
  "idx": 2,
16
  "name": "2",
17
  "path": "",
18
- "type": "st_quantize.UnnormalizedInt8TanhQuantizer"
 
19
  }
20
- ]
 
15
  "idx": 2,
16
  "name": "2",
17
  "path": "",
18
+ "type": "st_quantize.FlexibleQuantizer",
19
+ "kwargs": ["quantization"]
20
  }
21
+ ]
st_quantize.py CHANGED
@@ -1,7 +1,6 @@
1
  import torch
2
- from torch import nn
3
- from typing import Optional
4
  from typing import Literal
 
5
 
6
 
7
  class Quantizer(torch.nn.Module):
@@ -26,9 +25,7 @@ class Quantizer(torch.nn.Module):
26
  result = soft
27
  else:
28
  result = (
29
- self._hard_quantize(x, *args, **kwargs).detach()
30
- + soft
31
- - soft.detach()
32
  )
33
 
34
  return result
@@ -37,85 +34,76 @@ class Quantizer(torch.nn.Module):
37
  class Int8TanhQuantizer(Quantizer):
38
  def __init__(
39
  self,
40
- normalize: bool = False,
41
  hard: bool = True,
42
  ):
43
  super().__init__(hard=hard)
44
  self.qmin = -128
45
  self.qmax = 127
46
- self._normalize = normalize
47
 
48
  def _soft_quantize(self, x, *args, **kwargs):
49
- if self._normalize:
50
- x = (x - x.mean(dim=-1, keepdim=True)) / (
51
- x.std(dim=-1, keepdim=True) + 1e-8
52
- )
53
-
54
  return torch.tanh(x)
55
 
56
  def _hard_quantize(self, x, *args, **kwargs):
57
  soft = self._soft_quantize(x)
58
  int_x = torch.round(soft * self.qmax)
59
  int_x = torch.clamp(int_x, self.qmin, self.qmax)
60
- return int_x / self.qmax
61
-
62
-
63
- class UnnormalizedInt8TanhQuantizer(Int8TanhQuantizer):
64
- def __init__(self):
65
- super().__init__()
66
- self.quantizer = Int8TanhQuantizer(normalize=False)
67
-
68
- def forward(self, features: dict[str, torch.Tensor]) -> dict[str, torch.Tensor]:
69
- features["sentence_embedding"] = self.quantizer(
70
- features["sentence_embedding"]
71
- )
72
- return features
73
-
74
- @classmethod
75
- def load(cls, input_path: str) -> "PoolAndQuantize":
76
- return cls()
77
-
78
-
79
- class NormalizedInt8TanhQuantizer(Int8TanhQuantizer):
80
- def __init__(self):
81
- super().__init__()
82
- self.quantizer = Int8TanhQuantizer(normalize=True)
83
-
84
- def forward(self, features: dict[str, torch.Tensor]) -> dict[str, torch.Tensor]:
85
- features["sentence_embedding"] = self.quantizer(
86
- features["sentence_embedding"]
87
- )
88
- return features
89
-
90
- @classmethod
91
- def load(cls, input_path: str) -> "PoolAndQuantize":
92
- return cls()
93
 
94
 
95
- class Binarizer(Quantizer):
96
- def __init__(self, tanh_scale: float = 1.0, **kwargs):
97
- super().__init__(**kwargs)
98
- self._tanh_scale = tanh_scale
 
 
 
 
99
 
100
- def _hard_quantize(self, x, *args, **kwargs) -> torch.Tensor:
101
- return torch.where(x > 0, 1.0, -1.0)
102
 
103
- def _soft_quantize(self, x, *args, **kwargs) -> torch.Tensor:
104
- return torch.tanh(x * self._tanh_scale)
105
 
106
 
107
- class UnnormalizedBinarizer(nn.Module):
108
- def __init__(self, tanh_scale: float = 1.0, hard: bool = True):
109
  super().__init__()
110
- self.quantizer = Binarizer(tanh_scale=tanh_scale, hard=hard)
 
111
 
112
- def forward(self, features: dict[str, torch.Tensor]) -> dict[str, torch.Tensor]:
113
- features["sentence_embedding"] = self.quantizer(
114
- features["sentence_embedding"]
115
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
116
  return features
117
 
118
  @classmethod
119
- def load(cls, input_path: str) -> "UnnormalizedBinarizer":
 
 
 
 
 
 
 
 
 
120
  return cls()
121
-
 
 
 
1
  import torch
 
 
2
  from typing import Literal
3
+ from sentence_transformers.models import Module
4
 
5
 
6
  class Quantizer(torch.nn.Module):
 
25
  result = soft
26
  else:
27
  result = (
28
+ self._hard_quantize(x, *args, **kwargs).detach() + soft - soft.detach()
 
 
29
  )
30
 
31
  return result
 
34
class Int8TanhQuantizer(Quantizer):
    """Quantize embeddings toward the signed 8-bit range via tanh squashing.

    The soft path maps values into (-1, 1) with tanh (differentiable); the
    hard path scales those values to [-128, 127] and snaps them to whole
    numbers.
    """

    def __init__(
        self,
        hard: bool = True,
    ):
        super().__init__(hard=hard)
        # Bounds of a signed 8-bit integer.
        self.qmin = -128
        self.qmax = 127

    def _soft_quantize(self, x, *args, **kwargs):
        # Differentiable surrogate: squash values into (-1, 1).
        return torch.tanh(x)

    def _hard_quantize(self, x, *args, **kwargs):
        # Scale the squashed values to int8 range, snap to integers, and
        # clamp defensively to the representable bounds.
        scaled = self._soft_quantize(x) * self.qmax
        return torch.round(scaled).clamp(self.qmin, self.qmax)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
51
 
52
 
53
class BinaryTanhQuantizer(Quantizer):
    """Quantize embeddings to ±1.0 with a scaled tanh as the soft surrogate."""

    def __init__(
        self,
        hard: bool = True,
        scale: float = 1.0,
    ):
        super().__init__(hard)
        # Sharpness of the tanh surrogate; larger values approach sign().
        self._scale = scale

    def _soft_quantize(self, x, *args, **kwargs):
        # Differentiable approximation of the sign function.
        return torch.tanh(x * self._scale)

    def _hard_quantize(self, x, *args, **kwargs):
        # Non-negative inputs map to +1.0, negative inputs to -1.0.
        return torch.where(x >= 0, 1.0, -1.0)
67
 
68
 
69
class FlexibleQuantizer(Module):
    """Sentence-transformers module applying int8 or binary quantization.

    The quantization scheme is chosen at call time via the ``quantization``
    keyword, so one saved module supports both output formats. The module is
    stateless: ``load`` just constructs a fresh instance and ``save`` writes
    nothing.
    """

    def __init__(self):
        super().__init__()
        self._int8_quantizer = Int8TanhQuantizer()
        self._binary_quantizer = BinaryTanhQuantizer()

    def forward(
        self,
        features: dict[str, torch.Tensor],
        quantization: Literal["binary", "int8"] = "int8",
        **kwargs
    ) -> dict[str, torch.Tensor]:
        """Quantize ``features["sentence_embedding"]`` in place and return features."""
        if quantization == "int8":
            quantizer = self._int8_quantizer
        elif quantization == "binary":
            quantizer = self._binary_quantizer
        else:
            raise ValueError(
                f"Invalid quantization type: {quantization}. Must be 'binary' or 'int8'."
            )
        features["sentence_embedding"] = quantizer(features["sentence_embedding"])
        return features

    @classmethod
    def load(
        cls,
        model_name_or_path: str,
        subfolder: str = "",
        token: bool | str | None = None,
        cache_folder: str | None = None,
        revision: str | None = None,
        local_files_only: bool = False,
        **kwargs,
    ):
        # Stateless: no files to read, just build a new instance.
        return cls()

    def save(self, output_path: str, *args, **kwargs) -> None:
        # Stateless: nothing to persist.
        return