feat: new model implementation (#1)

- fix: added new implementation (ff0893c1cd4f25a76e5392e7995d9219a9aed482)
- feat: updated context model (8d01c688fc64bae72a41575570dd514b7454a033)
- refactor: new modeling code (12fc1ef7c6890644f5fc6a691fc24bd001442d95)

Files changed:
- config.json             +6  -8
- configuration.py        +5  -0
- configuration_qwen3.py  +0  -206
- modeling.py             +83 -0
- modules.json            +2  -1
- st_quantize.py          +50 -62
config.json CHANGED

@@ -1,13 +1,12 @@
 {
   "architectures": [
-    "
+    "PPLXQwen3Model"
   ],
   "attention_bias": false,
   "attention_dropout": 0.0,
   "auto_map": {
-    "AutoConfig": "
-    "AutoModel": "
-    "AutoModelForMaskedLM": "modeling_qwen3.Qwen3ForMaskedLM"
+    "AutoConfig": "configuration.PPLXQwen3Config",
+    "AutoModel": "modeling.PPLXQwen3Model"
   },
   "bos_token_id": 151643,
   "dtype": "float32",

@@ -57,8 +56,7 @@
   ],
   "max_position_embeddings": 32768,
   "max_window_layers": 36,
-  "
-  "model_type": "qwen3",
+  "model_type": "bidirectional_pplx_qwen3",
   "num_attention_heads": 32,
   "num_hidden_layers": 36,
   "num_key_value_heads": 8,

@@ -73,6 +71,6 @@
   "transformers_version": "5.0.0.dev0",
   "use_cache": false,
   "use_sliding_window": false,
-  "
-  "
+  "vocab_size": 151936,
+  "attn_implementation": "sdpa"
 }
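With auto_map now pointing at configuration.PPLXQwen3Config and modeling.PPLXQwen3Model, the repository can be loaded through the Auto classes with trust_remote_code=True. A minimal loading sketch; the repo id below is a placeholder, not something defined by this commit:

from transformers import AutoConfig, AutoModel

repo_id = "your-namespace/your-model"  # placeholder repo id, assumed for illustration

config = AutoConfig.from_pretrained(repo_id, trust_remote_code=True)  # resolves configuration.PPLXQwen3Config
model = AutoModel.from_pretrained(repo_id, trust_remote_code=True)    # resolves modeling.PPLXQwen3Model
print(type(config).__name__, type(model).__name__)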
configuration.py ADDED

@@ -0,0 +1,5 @@
+from transformers.models.qwen3.configuration_qwen3 import Qwen3Config
+
+
+class PPLXQwen3Config(Qwen3Config):
+    model_type = "bidirectional_pplx_qwen3"
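The new config class only overrides model_type; every other field falls back to the Qwen3Config defaults shipped with transformers. A quick sketch of what that implies, assuming the repo root is on sys.path so the file can be imported directly:

from configuration import PPLXQwen3Config  # the file added in this commit

cfg = PPLXQwen3Config(num_hidden_layers=36, num_attention_heads=32, num_key_value_heads=8)
print(cfg.model_type)  # "bidirectional_pplx_qwen3"
print(cfg.head_dim)    # 128, inherited from the upstream Qwen3Config defaults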
configuration_qwen3.py DELETED

@@ -1,206 +0,0 @@
-# coding=utf-8
-# Copyright 2024 The Qwen team, Alibaba Group and the HuggingFace Inc. team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Qwen3 model configuration"""
-
-from typing import Optional, Literal
-
-import warnings
-
-from transformers.configuration_utils import PreTrainedConfig, layer_type_validation
-from transformers.modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params
-from transformers.utils import logging
-
-
-logger = logging.get_logger(__name__)
-
-
-class Qwen3Config(PreTrainedConfig):
-    r"""
-    This is the configuration class to store the configuration of a [`Qwen3Model`]. It is used to instantiate a
-    Qwen3 model according to the specified arguments, defining the model architecture. Instantiating a configuration
-    with the defaults will yield a similar configuration to that of
-    Qwen3-8B [Qwen/Qwen3-8B](https://huggingface.co/Qwen/Qwen3-8B).
-
-    Configuration objects inherit from [`PreTrainedConfig`] and can be used to control the model outputs. Read the
-    documentation from [`PreTrainedConfig`] for more information.
-
-
-    Args:
-        vocab_size (`int`, *optional*, defaults to 151936):
-            Vocabulary size of the Qwen3 model. Defines the number of different tokens that can be represented by the
-            `inputs_ids` passed when calling [`Qwen3Model`]
-        hidden_size (`int`, *optional*, defaults to 4096):
-            Dimension of the hidden representations.
-        intermediate_size (`int`, *optional*, defaults to 22016):
-            Dimension of the MLP representations.
-        num_hidden_layers (`int`, *optional*, defaults to 32):
-            Number of hidden layers in the Transformer encoder.
-        num_attention_heads (`int`, *optional*, defaults to 32):
-            Number of attention heads for each attention layer in the Transformer encoder.
-        num_key_value_heads (`int`, *optional*, defaults to 32):
-            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
-            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
-            `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
-            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
-            by meanpooling all the original heads within that group. For more details, check out [this
-            paper](https://huggingface.co/papers/2305.13245). If it is not specified, will default to `32`.
-        head_dim (`int`, *optional*, defaults to 128):
-            The attention head dimension.
-        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
-            The non-linear activation function (function or string) in the decoder.
-        max_position_embeddings (`int`, *optional*, defaults to 32768):
-            The maximum sequence length that this model might ever be used with.
-        initializer_range (`float`, *optional*, defaults to 0.02):
-            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
-        rms_norm_eps (`float`, *optional*, defaults to 1e-06):
-            The epsilon used by the rms normalization layers.
-        use_cache (`bool`, *optional*, defaults to `True`):
-            Whether or not the model should return the last key/values attentions (not used by all models). Only
-            relevant if `config.is_decoder=True`.
-        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
-            Whether the model's input and output word embeddings should be tied.
-        rope_parameters (`RopeParameters`, *optional*):
-            Dictionary containing the configuration parameters for the RoPE embeddings. The dictionaty should contain
-            a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE
-            with longer `max_position_embeddings`.
-        attention_bias (`bool`, defaults to `False`, *optional*, defaults to `False`):
-            Whether to use a bias in the query, key, value and output projection layers during self-attention.
-        use_sliding_window (`bool`, *optional*, defaults to `False`):
-            Whether to use sliding window attention.
-        sliding_window (`int`, *optional*, defaults to 4096):
-            Sliding window attention (SWA) window size. If not specified, will default to `4096`.
-        max_window_layers (`int`, *optional*, defaults to 28):
-            The number of layers using full attention. The first `max_window_layers` layers will use full attention, while any
-            additional layer afterwards will use SWA (Sliding Window Attention).
-        layer_types (`list`, *optional*):
-            Attention pattern for each layer.
-        attention_dropout (`float`, *optional*, defaults to 0.0):
-            The dropout ratio for the attention probabilities.
-
-    ```python
-    >>> from transformers import Qwen3Model, Qwen3Config
-
-    >>> # Initializing a Qwen3 style configuration
-    >>> configuration = Qwen3Config()
-
-    >>> # Initializing a model from the Qwen3-8B style configuration
-    >>> model = Qwen3Model(configuration)
-
-    >>> # Accessing the model configuration
-    >>> configuration = model.config
-    ```"""
-
-    model_type = "qwen3"
-    keys_to_ignore_at_inference = ["past_key_values"]
-
-    # Default tensor parallel plan for base model `Qwen3`
-    base_model_tp_plan = {
-        "layers.*.self_attn.q_proj": "colwise",
-        "layers.*.self_attn.k_proj": "colwise",
-        "layers.*.self_attn.v_proj": "colwise",
-        "layers.*.self_attn.o_proj": "rowwise",
-        "layers.*.mlp.gate_proj": "colwise",
-        "layers.*.mlp.up_proj": "colwise",
-        "layers.*.mlp.down_proj": "rowwise",
-    }
-    base_model_pp_plan = {
-        "embed_tokens": (["input_ids"], ["inputs_embeds"]),
-        "layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
-        "norm": (["hidden_states"], ["hidden_states"]),
-    }
-
-    def __init__(
-        self,
-        vocab_size: Optional[int] = 151936,
-        hidden_size: Optional[int] = 4096,
-        intermediate_size: Optional[int] = 22016,
-        num_hidden_layers: Optional[int] = 32,
-        num_attention_heads: Optional[int] = 32,
-        num_key_value_heads: Optional[int] = 32,
-        head_dim: Optional[int] = 128,
-        hidden_act: Optional[str] = "silu",
-        max_position_embeddings: Optional[int] = 32768,
-        initializer_range: Optional[float] = 0.02,
-        rms_norm_eps: Optional[int] = 1e-6,
-        use_cache: Optional[bool] = True,
-        tie_word_embeddings: Optional[bool] = False,
-        rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None,
-        attention_bias: Optional[bool] = False,
-        use_sliding_window: Optional[bool] = False,
-        sliding_window: Optional[int] = 4096,
-        max_window_layers: Optional[int] = 28,
-        layer_types: Optional[list[str]] = None,
-        attention_dropout: Optional[float] = 0.0,
-        variant: Literal["causal", "bidirectional", "causal_dropout"] = "causal",
-        mlm_loss_variant: Literal["simple", "masked_normalize", "elbo_normalize", "flat_cart"] = "simple",
-        **kwargs,
-    ):
-        self.vocab_size = vocab_size
-        self.max_position_embeddings = max_position_embeddings
-        self.hidden_size = hidden_size
-        self.intermediate_size = intermediate_size
-        self.num_hidden_layers = num_hidden_layers
-        self.num_attention_heads = num_attention_heads
-        self.use_sliding_window = use_sliding_window
-        self.sliding_window = sliding_window if self.use_sliding_window else None
-        self.max_window_layers = max_window_layers
-
-        # for backward compatibility
-        if num_key_value_heads is None:
-            num_key_value_heads = num_attention_heads
-
-        self.num_key_value_heads = num_key_value_heads
-        self.head_dim = head_dim
-        self.hidden_act = hidden_act
-        self.initializer_range = initializer_range
-        self.rms_norm_eps = rms_norm_eps
-        self.use_cache = use_cache
-        self.attention_bias = attention_bias
-        self.attention_dropout = attention_dropout
-        # Try to set `rope_scaling` if available, otherwise use `rope_parameters`
-        rope_scaling = kwargs.pop("rope_scaling", None)
-        self.rope_parameters = rope_scaling or rope_parameters
-
-        self.layer_types = layer_types
-        if self.layer_types is None:
-            self.layer_types = [
-                "sliding_attention"
-                if self.sliding_window is not None and i >= self.max_window_layers
-                else "full_attention"
-                for i in range(self.num_hidden_layers)
-            ]
-        layer_type_validation(self.layer_types, self.num_hidden_layers)
-
-        # Validate the correctness of rotary position embeddings parameters
-        rope_theta = kwargs.get("rope_theta", 10000.0)
-        standardize_rope_params(self, rope_theta=rope_theta)
-        rope_config_validation(self)
-
-        self.variant = variant
-        self.mlm_loss_variant = mlm_loss_variant
-
-        if mlm_loss_variant not in ["simple", "masked_normalize", "elbo_normalize", "flat_cart"]:
-            raise NotImplementedError(f"Loss variant {mlm_loss_variant} unknown")
-
-        if variant != "causal" and use_cache:
-            warnings.warn("Cannot use cache (use_cache) and bidirectional attention (is_causal=False)")
-
-        super().__init__(
-            tie_word_embeddings=tie_word_embeddings,
-            **kwargs,
-        )
-
-
-__all__ = ["Qwen3Config"]
modeling.py ADDED

@@ -0,0 +1,83 @@
+from typing import Callable
+import torch
+from transformers import Qwen3Model
+from transformers.cache_utils import Cache
+from transformers.masking_utils import create_causal_mask
+from transformers.modeling_outputs import BaseModelOutputWithPooling
+from transformers.processing_utils import Unpack
+from transformers.utils import TransformersKwargs
+from .configuration import PPLXQwen3Config
+
+
+# From modeling_t5gemma.py
+def bidirectional_mask_function(attention_mask: torch.Tensor | None) -> Callable:
+    """
+    This creates bidirectional attention mask.
+    """
+
+    def inner_mask(batch_idx: int, head_idx: int, q_idx: int, kv_idx: int) -> bool:
+        if attention_mask is None:
+            return torch.ones((), dtype=torch.bool)
+        return attention_mask[batch_idx, kv_idx].to(torch.bool)
+
+    return inner_mask
+
+
+class PPLXQwen3Model(Qwen3Model):
+    _supports_flash_attn = True
+    _supports_sdpa = True
+
+    config_class = PPLXQwen3Config
+
+    def __init__(self, config):
+        super().__init__(config)
+        self.post_init()
+
+    def post_init(self):
+        super().post_init()
+        # Override to set all layers to non-causal attention. This'll work with attn_implementation="flash_attention_2" or "sdpa"
+        for layer in self.layers:
+            layer.self_attn.is_causal = False
+
+    def forward(
+        self,
+        input_ids: torch.LongTensor | None = None,
+        attention_mask: torch.Tensor | None = None,
+        position_ids: torch.LongTensor | None = None,
+        past_key_values: Cache | None = None,
+        inputs_embeds: torch.FloatTensor | None = None,
+        use_cache: bool | None = None,
+        cache_position: torch.LongTensor | None = None,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> BaseModelOutputWithPooling:
+        if inputs_embeds is None:
+            inputs_embeds = self.embed_tokens(input_ids)
+            input_ids = None
+
+        # We construct a dummy tensor imitating initial positions
+        dummy_cache_position = torch.arange(
+            inputs_embeds.shape[1], device=inputs_embeds.device, dtype=torch.long
+        )
+        attention_mask = {
+            "full_attention": create_causal_mask(
+                config=self.config,
+                input_embeds=inputs_embeds,
+                attention_mask=attention_mask,
+                cache_position=dummy_cache_position,
+                past_key_values=None,
+                position_ids=position_ids,
+                or_mask_function=bidirectional_mask_function(attention_mask),
+            )
+        }
+
+        outputs = super().forward(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            cache_position=cache_position,
+            **kwargs,
+        )
+        return outputs
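To make the mask semantics concrete: post_init() turns off causal masking in every attention layer, and forward() passes bidirectional_mask_function as an or_mask_function, so a key/value position is visible to every query position as long as it is not padding, regardless of order. A small standalone re-creation of the inner mask logic, independent of this repo's classes:

import torch

attention_mask = torch.tensor([[1, 1, 1, 0]])  # one padded position at the end

def inner_mask(batch_idx, head_idx, q_idx, kv_idx):
    # Visibility depends only on whether the key/value token is padding,
    # never on whether it comes before or after the query.
    return attention_mask[batch_idx, kv_idx].to(torch.bool)

visible = [[bool(inner_mask(0, 0, q, kv)) for kv in range(4)] for q in range(4)]
print(visible)
# Every row is [True, True, True, False]: earlier queries still see later keys,
# which is what makes the attention bidirectional.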
modules.json CHANGED

@@ -15,6 +15,7 @@
     "idx": 2,
     "name": "2",
     "path": "",
-    "type": "st_quantize.
+    "type": "st_quantize.FlexibleQuantizer",
+    "kwargs": ["quantization"]
   }
 ]
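The added "kwargs": ["quantization"] entry declares an encode-time forward kwarg for this module, which the module-kwargs routing in recent sentence-transformers releases passes from encode() through to the module's forward(). A usage sketch under that assumption; the repo id is a placeholder:

from sentence_transformers import SentenceTransformer

model = SentenceTransformer("your-namespace/your-model", trust_remote_code=True)  # placeholder repo id

int8_vecs = model.encode(["a query"], quantization="int8")      # default Int8TanhQuantizer path
binary_vecs = model.encode(["a query"], quantization="binary")  # BinaryTanhQuantizer path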
st_quantize.py CHANGED

@@ -1,7 +1,6 @@
 import torch
-from torch import nn
-from typing import Optional
 from typing import Literal
+from sentence_transformers.models import Module
 
 
 class Quantizer(torch.nn.Module):

@@ -26,9 +25,7 @@ class Quantizer(torch.nn.Module):
             result = soft
         else:
             result = (
-                self._hard_quantize(x, *args, **kwargs).detach()
-                + soft
-                - soft.detach()
+                self._hard_quantize(x, *args, **kwargs).detach() + soft - soft.detach()
             )
 
         return result

@@ -37,85 +34,76 @@ class Quantizer(torch.nn.Module):
 class Int8TanhQuantizer(Quantizer):
     def __init__(
         self,
-        normalize: bool = False,
         hard: bool = True,
     ):
         super().__init__(hard=hard)
         self.qmin = -128
         self.qmax = 127
-        self._normalize = normalize
 
     def _soft_quantize(self, x, *args, **kwargs):
-        if self._normalize:
-            x = (x - x.mean(dim=-1, keepdim=True)) / (
-                x.std(dim=-1, keepdim=True) + 1e-8
-            )
-
         return torch.tanh(x)
 
     def _hard_quantize(self, x, *args, **kwargs):
         soft = self._soft_quantize(x)
         int_x = torch.round(soft * self.qmax)
         int_x = torch.clamp(int_x, self.qmin, self.qmax)
         return int_x
-
-
-class UnnormalizedInt8TanhQuantizer(Int8TanhQuantizer):
-    def __init__(self):
-        super().__init__()
-        self.quantizer = Int8TanhQuantizer(normalize=False)
-
-    def forward(self, features: dict[str, torch.Tensor]) -> dict[str, torch.Tensor]:
-        features["sentence_embedding"] = self.quantizer(
-            features["sentence_embedding"]
-        )
-        return features
-
-    @classmethod
-    def load(cls, input_path: str) -> "PoolAndQuantize":
-        return cls()
-
-
-class NormalizedInt8TanhQuantizer(Int8TanhQuantizer):
-    def __init__(self):
-        super().__init__()
-        self.quantizer = Int8TanhQuantizer(normalize=True)
-
-    def forward(self, features: dict[str, torch.Tensor]) -> dict[str, torch.Tensor]:
-        features["sentence_embedding"] = self.quantizer(
-            features["sentence_embedding"]
-        )
-        return features
-
-    @classmethod
-    def load(cls, input_path: str) -> "PoolAndQuantize":
-        return cls()
 
 
-class 
-    def __init__(
-
+class BinaryTanhQuantizer(Quantizer):
+    def __init__(
+        self,
+        hard: bool = True,
+        scale: float = 1.0,
+    ):
+        super().__init__(hard)
+        self._scale = scale
 
-    def 
-        return torch.
+    def _soft_quantize(self, x, *args, **kwargs):
+        return torch.tanh(self._scale * x)
 
-    def 
-        return torch.
+    def _hard_quantize(self, x, *args, **kwargs):
+        return torch.where(x >= 0, 1.0, -1.0)
 
 
-class 
-    def __init__(self
+class FlexibleQuantizer(Module):
+    def __init__(self):
         super().__init__()
-        self.
+        self._int8_quantizer = Int8TanhQuantizer()
+        self._binary_quantizer = BinaryTanhQuantizer()
 
-    def forward(
-
-
-
+    def forward(
+        self,
+        features: dict[str, torch.Tensor],
+        quantization: Literal["binary", "int8"] = "int8",
+        **kwargs
+    ) -> dict[str, torch.Tensor]:
+        if quantization == "int8":
+            features["sentence_embedding"] = self._int8_quantizer(
+                features["sentence_embedding"]
+            )
+        elif quantization == "binary":
+            features["sentence_embedding"] = self._binary_quantizer(
+                features["sentence_embedding"]
+            )
+        else:
+            raise ValueError(
+                f"Invalid quantization type: {quantization}. Must be 'binary' or 'int8'."
+            )
         return features
 
     @classmethod
-    def load(
+    def load(
+        cls,
+        model_name_or_path: str,
+        subfolder: str = "",
+        token: bool | str | None = None,
+        cache_folder: str | None = None,
+        revision: str | None = None,
+        local_files_only: bool = False,
+        **kwargs,
+    ):
         return cls()
-
+
+    def save(self, output_path: str, *args, **kwargs) -> None:
+        return
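The expression hard.detach() + soft - soft.detach() in Quantizer.forward is a straight-through estimator: the forward value equals the hard-quantized tensor, while gradients flow through the soft tanh path. A small standalone check of the trick, not tied to this repo's classes:

import torch

x = torch.randn(4, requires_grad=True)

soft = torch.tanh(x)
hard = torch.round(soft * 127).clamp(-128, 127)

# Straight-through: the value is the hard quantization, the gradient is the soft one.
st = hard.detach() + soft - soft.detach()

print(torch.equal(st, hard))  # True: the forward pass sees the quantized values
st.sum().backward()
print(x.grad)                 # 1 - tanh(x)**2, the gradient of the soft path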