mkrimmel-pplx committed on
Commit
4fdc645
·
1 Parent(s): 581ccac

refactor: new modeling code

Files changed (3)
  1. configuration.py +2 -125
  2. modeling.py +52 -774
  3. st_quantize.py +16 -2
configuration.py CHANGED
@@ -1,128 +1,5 @@
1
- # coding=utf-8
2
- # Copyright 2024 The Qwen team, Alibaba Group and the HuggingFace Inc. team. All rights reserved.
3
- #
4
- # This file has been modified from the original Qwen3 implementation.
5
- #
6
- # Licensed under the Apache License, Version 2.0 (the "License");
7
- # you may not use this file except in compliance with the License.
8
- # You may obtain a copy of the License at
9
- #
10
- # http://www.apache.org/licenses/LICENSE-2.0
11
- #
12
- # Unless required by applicable law or agreed to in writing, software
13
- # distributed under the License is distributed on an "AS IS" BASIS,
14
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15
- # See the License for the specific language governing permissions and
16
- # limitations under the License.
17
 
18
- from typing import Optional
19
- from transformers import PretrainedConfig
20
-
21
-
22
- class PPLXQwen3Config(PretrainedConfig):
23
- """
24
- PPLX configuration class for Qwen3Model compatible with transformers < 5.X.
25
- This implementation only supports bidirectional attention (no causal or dropout variants).
26
-
27
- Args:
28
- vocab_size (int, optional, defaults to 151936):
29
- Vocabulary size of the Qwen3 model.
30
- hidden_size (int, optional, defaults to 4096):
31
- Dimension of the hidden representations.
32
- intermediate_size (int, optional, defaults to 22016):
33
- Dimension of the MLP representations.
34
- num_hidden_layers (int, optional, defaults to 32):
35
- Number of hidden layers in the Transformer encoder.
36
- num_attention_heads (int, optional, defaults to 32):
37
- Number of attention heads for each attention layer.
38
- num_key_value_heads (int, optional, defaults to 32):
39
- Number of key_value heads for Grouped Query Attention.
40
- head_dim (int, optional, defaults to 128):
41
- The attention head dimension.
42
- hidden_act (str, optional, defaults to "silu"):
43
- The non-linear activation function.
44
- max_position_embeddings (int, optional, defaults to 32768):
45
- The maximum sequence length.
46
- initializer_range (float, optional, defaults to 0.02):
47
- The standard deviation for weight initialization.
48
- rms_norm_eps (float, optional, defaults to 1e-06):
49
- The epsilon for rms normalization layers.
50
- attention_bias (bool, optional, defaults to False):
51
- Whether to use bias in attention projection layers.
52
- attention_dropout (float, optional, defaults to 0.0):
53
- The dropout ratio for attention probabilities.
54
- rope_theta (float, optional, defaults to 10000.0):
55
- The base period of the RoPE embeddings.
56
- pad_token_id (int, optional):
57
- The id of the padding token.
58
- bos_token_id (int, optional):
59
- The id of the beginning-of-sequence token.
60
- eos_token_id (int, optional):
61
- The id of the end-of-sequence token.
62
- attn_implementation (str, optional):
63
- The attention implementation to use. Options: "eager", "sdpa".
64
- If None, will auto-select based on availability.
65
- """
66
 
 
67
  model_type = "bidirectional_pplx_qwen3"
68
-
69
- def __init__(
70
- self,
71
- vocab_size: Optional[int] = 151936,
72
- hidden_size: Optional[int] = 4096,
73
- intermediate_size: Optional[int] = 22016,
74
- num_hidden_layers: Optional[int] = 32,
75
- num_attention_heads: Optional[int] = 32,
76
- num_key_value_heads: Optional[int] = 32,
77
- head_dim: Optional[int] = 128,
78
- hidden_act: Optional[str] = "silu",
79
- max_position_embeddings: Optional[int] = 32768,
80
- initializer_range: Optional[float] = 0.02,
81
- rms_norm_eps: Optional[float] = 1e-6,
82
- attention_bias: Optional[bool] = False,
83
- attention_dropout: Optional[float] = 0.0,
84
- rope_theta: Optional[float] = 10000.0,
85
- pad_token_id: Optional[int] = None,
86
- bos_token_id: Optional[int] = None,
87
- eos_token_id: Optional[int] = None,
88
- attn_implementation: Optional[str] = None,
89
- **kwargs,
90
- ):
91
- # Extract attn_implementation from kwargs if not explicitly provided
92
- if attn_implementation is None and 'attn_implementation' in kwargs:
93
- attn_implementation = kwargs.pop('attn_implementation')
94
-
95
- self.vocab_size = vocab_size
96
- self.max_position_embeddings = max_position_embeddings
97
- self.hidden_size = hidden_size
98
- self.intermediate_size = intermediate_size
99
- self.num_hidden_layers = num_hidden_layers
100
- self.num_attention_heads = num_attention_heads
101
- self.num_key_value_heads = num_key_value_heads if num_key_value_heads is not None else num_attention_heads
102
- self.head_dim = head_dim
103
- self.hidden_act = hidden_act
104
- self.initializer_range = initializer_range
105
- self.rms_norm_eps = rms_norm_eps
106
- self.attention_bias = attention_bias
107
- self.attention_dropout = attention_dropout
108
- self.rope_theta = rope_theta
109
-
110
- # Legacy: only bidirectional attention supported
111
- self.is_causal = False
112
-
113
- # Initialize parent class with token IDs
114
- super().__init__(
115
- pad_token_id=pad_token_id,
116
- bos_token_id=bos_token_id,
117
- eos_token_id=eos_token_id,
118
- **kwargs,
119
- )
120
-
121
- # Store attn_implementation as a regular attribute AFTER super().__init__() (will be serialized)
122
- self.attn_implementation = attn_implementation
123
-
124
-
125
- # Register for AutoConfig
126
- PPLXQwen3Config.register_for_auto_class()
127
-
128
- __all__ = ["PPLXQwen3Config"]
 
+ from transformers.models.qwen3.configuration_qwen3 import Qwen3Config


+ class PPLXQwen3Config(Qwen3Config):
      model_type = "bidirectional_pplx_qwen3"
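With the bespoke configuration class removed, PPLXQwen3Config is now a thin subclass of the stock Qwen3Config, so every Qwen3 field keeps its upstream default and only model_type changes. A minimal usage sketch (the argument values and the local import path are illustrative, not taken from this repo):

# Hypothetical sketch: any Qwen3Config keyword argument still round-trips
# through PPLXQwen3Config; only model_type differs.
from configuration import PPLXQwen3Config  # assumes this repo's module layout

cfg = PPLXQwen3Config(hidden_size=256, num_hidden_layers=2, num_attention_heads=4)
assert cfg.model_type == "bidirectional_pplx_qwen3"
cfg.save_pretrained("./tmp-config")  # serializes like any PretrainedConfig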
modeling.py CHANGED
@@ -1,805 +1,83 @@
1
- # coding=utf-8
2
- # Copyright 2025 The Qwen team, Alibaba Group and the HuggingFace Inc. team. All rights reserved.
3
- #
4
- # This file has been modified from the original Qwen3 implementation.
5
- #
6
- # Licensed under the Apache License, Version 2.0 (the "License");
7
- # you may not use this file except in compliance with the License.
8
- # You may obtain a copy of the License at
9
- #
10
- # http://www.apache.org/licenses/LICENSE-2.0
11
- #
12
- # Unless required by applicable law or agreed to in writing, software
13
- # distributed under the License is distributed on an "AS IS" BASIS,
14
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15
- # See the License for the specific language governing permissions and
16
- # limitations under the License.
17
-
18
- from typing import Optional, Tuple, Literal
19
-
20
- import numpy as np
21
  import torch
22
- from torch import nn
23
- import torch.nn.functional as F
24
- from transformers import AutoTokenizer
25
- from transformers.modeling_utils import PreTrainedModel
26
- from transformers.modeling_outputs import BaseModelOutputWithPast
27
-
28
  from .configuration import PPLXQwen3Config
29
- from .st_quantize import FlexibleQuantizer
30
-
31
-
32
- # Activation functions mapping
33
- ACT2FN = {
34
- "silu": nn.functional.silu,
35
- "gelu": nn.functional.gelu,
36
- "relu": nn.functional.relu,
37
- }
38
-
39
-
40
- class PPLXQwen3RMSNorm(nn.Module):
41
- """RMSNorm implementation compatible with transformers < 5.X"""
42
-
43
- def __init__(self, hidden_size, eps: float = 1e-6) -> None:
44
- super().__init__()
45
- self.weight = nn.Parameter(torch.ones(hidden_size))
46
- self.variance_epsilon = eps
47
-
48
- def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
49
- input_dtype = hidden_states.dtype
50
- hidden_states = hidden_states.to(torch.float32)
51
- variance = hidden_states.pow(2).mean(-1, keepdim=True)
52
- hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
53
- return self.weight * hidden_states.to(input_dtype)
54
-
55
-
56
- class PPLXQwen3MLP(nn.Module):
57
- """MLP implementation compatible with transformers < 5.X"""
58
-
59
- def __init__(self, config):
60
- super().__init__()
61
- self.config = config
62
- self.hidden_size = config.hidden_size
63
- self.intermediate_size = config.intermediate_size
64
- self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
65
- self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
66
- self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
67
- self.act_fn = ACT2FN[config.hidden_act]
68
-
69
- def forward(self, x):
70
- down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
71
- return down_proj
72
-
73
-
74
- class PPLXQwen3RotaryEmbedding(nn.Module):
75
- """Rotary Position Embedding implementation compatible with transformers < 5.X"""
76
-
77
- def __init__(self, config, device=None):
78
- super().__init__()
79
- self.max_seq_len_cached = config.max_position_embeddings
80
- self.original_max_seq_len = config.max_position_embeddings
81
- self.config = config
82
-
83
- # Check rope type and raise if not default
84
- self.rope_type = self.config.rope_parameters["rope_type"]
85
- if self.rope_type != "default":
86
- raise NotImplementedError("Only default RoPE implemented")
87
-
88
- # Compute inverse frequencies using the static method
89
- inv_freq, self.attention_scaling = self.compute_default_rope_parameters(
90
- config, device
91
- )
92
- self.register_buffer("inv_freq", inv_freq, persistent=False)
93
- self.original_inv_freq = inv_freq
94
-
95
- @staticmethod
96
- def compute_default_rope_parameters(
97
- config: Optional["PPLXQwen3Config"] = None,
98
- device: Optional[torch.device] = None,
99
- ) -> Tuple[torch.Tensor, float]:
100
- """
101
- Computes the inverse frequencies according to the original RoPE implementation
102
-
103
- Args:
104
- config: The model configuration.
105
- device: The device to use for initialization of the inverse frequencies.
106
-
107
- Returns:
108
- Tuple of (inv_freq, attention_scaling), containing the inverse frequencies
109
- for the RoPE embeddings and the post-processing scaling factor applied to
110
- the computed cos/sin.
111
- """
112
- base = config.rope_parameters["rope_theta"]
113
- dim = config.head_dim
114
 
115
- attention_factor = 1.0 # Unused in default RoPE
116
 
117
- # Compute the inverse frequencies
118
- inv_freq = 1.0 / (
119
- base ** (torch.arange(0, dim, 2, dtype=torch.int64).to(device=device, dtype=torch.float) / dim)
120
- )
121
- return inv_freq, attention_factor
122
-
123
- def forward(self, x, position_ids):
124
- # Expand inv_freq to match batch size
125
- inv_freq_expanded = (
126
- self.inv_freq[None, :, None]
127
- .float()
128
- .expand(position_ids.shape[0], -1, 1)
129
- .to(x.device)
130
- )
131
- position_ids_expanded = position_ids[:, None, :].float()
132
-
133
- # Compute frequencies
134
- device_type = (
135
- x.device.type
136
- if isinstance(x.device.type, str) and x.device.type != "mps"
137
- else "cpu"
138
- )
139
- with torch.autocast(device_type=device_type, enabled=False):
140
- freqs = (
141
- inv_freq_expanded.float() @ position_ids_expanded.float()
142
- ).transpose(1, 2)
143
- emb = torch.cat((freqs, freqs), dim=-1)
144
- cos = emb.cos() * self.attention_scaling
145
- sin = emb.sin() * self.attention_scaling
146
-
147
- return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
148
-
149
-
150
- def rotate_half(x):
151
- """Rotates half the hidden dims of the input."""
152
- x1 = x[..., : x.shape[-1] // 2]
153
- x2 = x[..., x.shape[-1] // 2 :]
154
- return torch.cat((-x2, x1), dim=-1)
155
-
156
-
157
- def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
158
- """Applies Rotary Position Embedding to the query and key tensors."""
159
- cos = cos.unsqueeze(unsqueeze_dim)
160
- sin = sin.unsqueeze(unsqueeze_dim)
161
- q_embed = (q * cos) + (rotate_half(q) * sin)
162
- k_embed = (k * cos) + (rotate_half(k) * sin)
163
- return q_embed, k_embed
164
-
165
-
166
- def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
167
  """
168
- Equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep).
169
- Hidden states go from (batch, num_key_value_heads, seqlen, head_dim)
170
- to (batch, num_attention_heads, seqlen, head_dim)
171
  """
172
- batch, num_key_value_heads, slen, head_dim = hidden_states.shape
173
- if n_rep == 1:
174
- return hidden_states
175
- hidden_states = hidden_states[:, :, None, :, :].expand(
176
- batch, num_key_value_heads, n_rep, slen, head_dim
177
- )
178
- return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
179
-
180
-
181
- def eager_attention_forward(
182
- query: torch.Tensor,
183
- key: torch.Tensor,
184
- value: torch.Tensor,
185
- attention_mask: Optional[torch.Tensor],
186
- scaling: float,
187
- dropout: float = 0.0,
188
- training: bool = False,
189
- num_key_value_groups: int = 1,
190
- **kwargs,
191
- ) -> Tuple[torch.Tensor, torch.Tensor]:
192
- """
193
- Eager (vanilla) attention implementation.
194
-
195
- Args:
196
- query: (batch, num_heads, seq_len, head_dim)
197
- key: (batch, num_kv_heads, seq_len, head_dim)
198
- value: (batch, num_kv_heads, seq_len, head_dim)
199
- attention_mask: (batch, 1, seq_len, seq_len)
200
- scaling: attention scaling factor
201
- dropout: dropout probability
202
- training: whether in training mode
203
- num_key_value_groups: number of query heads per key/value head (for GQA)
204
- """
205
- # Repeat k/v heads if using GQA
206
- key_states = repeat_kv(key, num_key_value_groups)
207
- value_states = repeat_kv(value, num_key_value_groups)
208
-
209
- # Compute attention scores
210
- attn_weights = torch.matmul(query, key_states.transpose(2, 3)) * scaling
211
-
212
- # Apply attention mask
213
- if attention_mask is not None:
214
- attn_weights = attn_weights + attention_mask
215
-
216
- # Softmax and dropout
217
- attn_weights = F.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
218
- attn_weights = F.dropout(attn_weights, p=dropout, training=training)
219
-
220
- # Compute output
221
- attn_output = torch.matmul(attn_weights, value_states)
222
-
223
- return attn_output, attn_weights
224
-
225
-
226
- def sdpa_attention_forward(
227
- query: torch.Tensor,
228
- key: torch.Tensor,
229
- value: torch.Tensor,
230
- attention_mask: Optional[torch.Tensor],
231
- scaling: float,
232
- dropout: float = 0.0,
233
- training: bool = False,
234
- num_key_value_groups: int = 1,
235
- **kwargs,
236
- ) -> Tuple[torch.Tensor, None]:
237
- """
238
- Scaled Dot Product Attention using PyTorch's native implementation.
239
-
240
- Args:
241
- query: (batch, num_heads, seq_len, head_dim)
242
- key: (batch, num_kv_heads, seq_len, head_dim)
243
- value: (batch, num_kv_heads, seq_len, head_dim)
244
- attention_mask: (batch, 1, seq_len, seq_len) or None
245
- scaling: attention scaling factor (handled internally by SDPA)
246
- dropout: dropout probability
247
- training: whether in training mode
248
- num_key_value_groups: number of query heads per key/value head (for GQA)
249
- """
250
- # Repeat k/v heads if using GQA
251
- key = repeat_kv(key, num_key_value_groups)
252
- value = repeat_kv(value, num_key_value_groups)
253
-
254
- # Convert attention mask for SDPA
255
- # SDPA expects additive mask in shape (batch, num_heads, seq_len, seq_len) or broadcastable
256
- attn_mask = None
257
- if attention_mask is not None:
258
- # attention_mask is (batch, 1, seq_len, seq_len)
259
- # Broadcast to (batch, num_heads, seq_len, seq_len) by repeating
260
- batch_size, _, seq_len, _ = attention_mask.shape
261
- num_heads = query.shape[1]
262
- # Expand to match num_heads
263
- attn_mask = attention_mask.expand(batch_size, num_heads, seq_len, seq_len)
264
-
265
- # PyTorch SDPA
266
- attn_output = F.scaled_dot_product_attention(
267
- query,
268
- key,
269
- value,
270
- attn_mask=attn_mask,
271
- dropout_p=dropout if training else 0.0,
272
- is_causal=False, # We handle masking explicitly for bidirectional
273
- scale=scaling,
274
- )
275
-
276
- return attn_output, None
277
-
278
-
279
- # Mapping of attention implementation names to functions
280
- ATTENTION_IMPLEMENTATIONS = {
281
- "eager": eager_attention_forward,
282
- "sdpa": sdpa_attention_forward,
283
- }
284
-
285
-
286
- class PPLXQwen3Attention(nn.Module):
287
- """
288
- Multi-headed attention implementation compatible with transformers < 5.X
289
- Supports multiple attention backends: eager, sdpa
290
- """
291
-
292
- def __init__(self, config, layer_idx: int):
293
- super().__init__()
294
- self.config = config
295
- self.layer_idx = layer_idx
296
- self.head_dim = config.head_dim
297
- self.num_attention_heads = config.num_attention_heads
298
- self.num_key_value_heads = config.num_key_value_heads
299
- self.num_key_value_groups = (
300
- config.num_attention_heads // config.num_key_value_heads
301
- )
302
- self.scaling = self.head_dim**-0.5
303
- self.attention_dropout = config.attention_dropout
304
-
305
- self.q_proj = nn.Linear(
306
- config.hidden_size,
307
- config.num_attention_heads * self.head_dim,
308
- bias=config.attention_bias,
309
- )
310
- self.k_proj = nn.Linear(
311
- config.hidden_size,
312
- config.num_key_value_heads * self.head_dim,
313
- bias=config.attention_bias,
314
- )
315
- self.v_proj = nn.Linear(
316
- config.hidden_size,
317
- config.num_key_value_heads * self.head_dim,
318
- bias=config.attention_bias,
319
- )
320
- self.o_proj = nn.Linear(
321
- config.num_attention_heads * self.head_dim,
322
- config.hidden_size,
323
- bias=config.attention_bias,
324
- )
325
- self.q_norm = PPLXQwen3RMSNorm(self.head_dim, eps=config.rms_norm_eps)
326
- self.k_norm = PPLXQwen3RMSNorm(self.head_dim, eps=config.rms_norm_eps)
327
-
328
- # Select attention implementation
329
- self._select_attention_implementation(config)
330
-
331
- def _select_attention_implementation(self, config):
332
- """Select the attention implementation based on config or availability."""
333
- attn_impl = getattr(config, "attn_implementation", None)
334
-
335
- if attn_impl is None:
336
- # Auto-select: prefer faster implementations
337
- if hasattr(F, "scaled_dot_product_attention"):
338
- attn_impl = "sdpa"
339
- else:
340
- attn_impl = "eager"
341
-
342
- if attn_impl not in ATTENTION_IMPLEMENTATIONS:
343
- raise ValueError(
344
- f"Unknown attention implementation: {attn_impl}. "
345
- f"Available: {list(ATTENTION_IMPLEMENTATIONS.keys())}"
346
- )
347
-
348
- # Check availability
349
- if attn_impl == "sdpa" and not hasattr(F, "scaled_dot_product_attention"):
350
- raise ImportError(
351
- "sdpa requested but not available. Please use PyTorch >= 2.0"
352
- )
353
-
354
- self.attn_implementation = attn_impl
355
- self.attn_function = ATTENTION_IMPLEMENTATIONS[attn_impl]
356
-
357
- def forward(
358
- self,
359
- hidden_states: torch.Tensor,
360
- position_embeddings: Tuple[torch.Tensor, torch.Tensor],
361
- attention_mask: Optional[torch.Tensor] = None,
362
- **kwargs,
363
- ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
364
- input_shape = hidden_states.shape[:-1]
365
- hidden_shape = (*input_shape, -1, self.head_dim)
366
-
367
- # Project and reshape
368
- query_states = self.q_norm(
369
- self.q_proj(hidden_states).view(hidden_shape)
370
- ).transpose(1, 2)
371
- key_states = self.k_norm(
372
- self.k_proj(hidden_states).view(hidden_shape)
373
- ).transpose(1, 2)
374
- value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2)
375
-
376
- # Apply rotary embeddings
377
- cos, sin = position_embeddings
378
- query_states, key_states = apply_rotary_pos_emb(
379
- query_states, key_states, cos, sin
380
- )
381
-
382
- # Call the selected attention implementation
383
- attn_output, attn_weights = self.attn_function(
384
- query=query_states,
385
- key=key_states,
386
- value=value_states,
387
- attention_mask=attention_mask,
388
- scaling=self.scaling,
389
- dropout=self.attention_dropout,
390
- training=self.training,
391
- num_key_value_groups=self.num_key_value_groups,
392
- )
393
-
394
- # Reshape and project output
395
- attn_output = attn_output.transpose(1, 2).contiguous()
396
- attn_output = attn_output.reshape(*input_shape, -1).contiguous()
397
- attn_output = self.o_proj(attn_output)
398
-
399
- return attn_output, attn_weights
400
-
401
-
402
- class PPLXQwen3DecoderLayer(nn.Module):
403
- """Decoder layer implementation compatible with transformers < 5.X"""
404
-
405
- def __init__(self, config, layer_idx: int):
406
- super().__init__()
407
- self.hidden_size = config.hidden_size
408
- self.self_attn = PPLXQwen3Attention(config=config, layer_idx=layer_idx)
409
- self.mlp = PPLXQwen3MLP(config)
410
- self.input_layernorm = PPLXQwen3RMSNorm(
411
- config.hidden_size, eps=config.rms_norm_eps
412
- )
413
- self.post_attention_layernorm = PPLXQwen3RMSNorm(
414
- config.hidden_size, eps=config.rms_norm_eps
415
- )
416
-
417
- def forward(
418
- self,
419
- hidden_states: torch.Tensor,
420
- attention_mask: Optional[torch.Tensor] = None,
421
- position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
422
- **kwargs,
423
- ) -> torch.Tensor:
424
- # Self Attention
425
- residual = hidden_states
426
- hidden_states = self.input_layernorm(hidden_states)
427
- hidden_states, _ = self.self_attn(
428
- hidden_states=hidden_states,
429
- attention_mask=attention_mask,
430
- position_embeddings=position_embeddings,
431
- )
432
- hidden_states = residual + hidden_states
433
 
434
- # MLP
435
- residual = hidden_states
436
- hidden_states = self.post_attention_layernorm(hidden_states)
437
- hidden_states = self.mlp(hidden_states)
438
- hidden_states = residual + hidden_states
439
 
440
- return hidden_states
441
 
442
 
443
- class PPLXQwen3PreTrainedModel(PreTrainedModel):
444
- """
445
- An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
446
- models.
447
- """
448
 
449
  config_class = PPLXQwen3Config
450
- base_model_prefix = "model"
451
- supports_gradient_checkpointing = False
452
- _no_split_modules = ["PPLXQwen3DecoderLayer"]
453
- _skip_keys_device_placement = ["past_key_values"]
454
-
455
-
456
- class PPLXQwen3Model(PPLXQwen3PreTrainedModel):
457
- """
458
- Qwen3 Model implementation compatible with transformers < 5.X.
459
- Only supports bidirectional attention (no causal masking or caching).
460
- """
461
 
462
  def __init__(self, config):
463
  super().__init__(config)
464
- self.padding_idx = config.pad_token_id
465
- self.vocab_size = config.vocab_size
466
-
467
- self.embed_tokens = nn.Embedding(
468
- config.vocab_size, config.hidden_size, self.padding_idx
469
- )
470
- self.layers = nn.ModuleList(
471
- [
472
- PPLXQwen3DecoderLayer(config, layer_idx)
473
- for layer_idx in range(config.num_hidden_layers)
474
- ]
475
- )
476
- self.norm = PPLXQwen3RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
477
- self.rotary_emb = PPLXQwen3RotaryEmbedding(config=config)
478
-
479
- # Initialize weights and apply final processing
480
  self.post_init()
481
 
 
 
 
 
 
 
482
  def forward(
483
  self,
484
- input_ids: Optional[torch.LongTensor] = None,
485
- attention_mask: Optional[torch.Tensor] = None,
486
- position_ids: Optional[torch.LongTensor] = None,
487
- inputs_embeds: Optional[torch.FloatTensor] = None,
488
- **kwargs,
489
- ) -> BaseModelOutputWithPast:
490
- # Get embeddings
 
 
491
  if inputs_embeds is None:
492
  inputs_embeds = self.embed_tokens(input_ids)
 
493
 
494
- batch_size, seq_length = inputs_embeds.shape[:2]
495
-
496
- # Create position IDs if not provided
497
- if position_ids is None:
498
- position_ids = (
499
- torch.arange(seq_length, device=inputs_embeds.device)
500
- .unsqueeze(0)
501
- .expand(batch_size, -1)
502
- )
503
-
504
- # Create bidirectional attention mask
505
- # Transform from (batch_size, seq_length) to (batch_size, 1, seq_length, seq_length)
506
- if attention_mask is not None:
507
- # Expand attention mask to 4D
508
- attention_mask = attention_mask[:, None, None, :].to(
509
- dtype=inputs_embeds.dtype
510
- )
511
- attention_mask = (1.0 - attention_mask) * torch.finfo(
512
- inputs_embeds.dtype
513
- ).min
514
- # Broadcast to full attention shape
515
- attention_mask = attention_mask.expand(
516
- batch_size, 1, seq_length, seq_length
517
- )
518
- else:
519
- # No masking needed for bidirectional attention with no padding
520
- attention_mask = torch.zeros(
521
- (batch_size, 1, seq_length, seq_length),
522
- dtype=inputs_embeds.dtype,
523
- device=inputs_embeds.device,
524
- )
525
-
526
- # Get rotary embeddings
527
- position_embeddings = self.rotary_emb(inputs_embeds, position_ids)
528
-
529
- # Pass through decoder layers
530
- hidden_states = inputs_embeds
531
- for decoder_layer in self.layers:
532
- hidden_states = decoder_layer(
533
- hidden_states,
534
  attention_mask=attention_mask,
535
- position_embeddings=position_embeddings,
 
 
 
536
  )
 
537
 
538
- # Final norm
539
- hidden_states = self.norm(hidden_states)
540
-
541
- return BaseModelOutputWithPast(last_hidden_state=hidden_states)
542
-
543
-
544
- class PPLXQwen3ContextualModel(PPLXQwen3PreTrainedModel):
545
- """
546
- Qwen3 model with contextual encoding support for late chunking.
547
-
548
- This model extends PPLXQwen3Model with an encode() method that supports both
549
- standard encoding (list[str]) and contextual encoding (list[list[str]]) with late chunking.
550
- """
551
-
552
- def __init__(self, config):
553
- super().__init__(config)
554
- self.model = PPLXQwen3Model(config)
555
- self.tokenizer = AutoTokenizer.from_pretrained(config._name_or_path)
556
- self._flexible_quantizer = FlexibleQuantizer()
557
- self.post_init()
558
-
559
- def forward(
560
- self,
561
- input_ids: Optional[torch.LongTensor] = None,
562
- attention_mask: Optional[torch.Tensor] = None,
563
- position_ids: Optional[torch.LongTensor] = None,
564
- inputs_embeds: Optional[torch.FloatTensor] = None,
565
- **kwargs,
566
- ) -> BaseModelOutputWithPast:
567
- """Forward pass through the model."""
568
- return self.model(
569
  input_ids=input_ids,
570
  attention_mask=attention_mask,
571
  position_ids=position_ids,
 
572
  inputs_embeds=inputs_embeds,
 
 
573
  **kwargs,
574
  )
575
-
576
- @staticmethod
577
- def mean_pooling(
578
- token_embeddings: torch.Tensor, attention_mask: torch.Tensor
579
- ) -> torch.Tensor:
580
- """Apply mean pooling to token embeddings."""
581
- input_mask_expanded = (
582
- attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
583
- )
584
- return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(
585
- input_mask_expanded.sum(1), min=1e-9
586
- )
587
-
588
- @torch.inference_mode()
589
- def encode(
590
- self,
591
- documents: list[list[str]],
592
- batch_size: int = 32,
593
- show_progress_bar: bool = False,
594
- device: str | torch.device | None = None,
595
- normalize_embeddings: bool = False,
596
- convert_to_numpy: bool = True,
597
- quantization: Literal["int8", "binary"] = "int8",
598
- ) -> list[np.ndarray] | list[torch.Tensor]:
599
- """
600
- Encode documents with late chunking (contextual embeddings).
601
-
602
- This model is designed specifically for contextual encoding and always expects
603
- documents as nested lists where each document is a list of text chunks.
604
-
605
- The encoding process:
606
- 1. Concatenate chunks with separator tokens
607
- 2. Run forward pass to get token embeddings
608
- 3. Extract and pool individual chunk embeddings (late chunking)
609
- 4. Apply quantization (Int8 or binary, always enabled)
610
- 5. Normalize embeddings if requested (applied after quantization)
611
- 6. Convert to numpy or return as tensors
612
-
613
- Args:
614
- documents: List of documents, where each document is a list of text chunks.
615
- Example: [["chunk1", "chunk2"], ["chunk1", "chunk2", "chunk3"]]
616
- batch_size: Batch size for encoding
617
- show_progress_bar: Show progress bar during encoding
618
- device: Device to use for computation (defaults to model's device)
619
- normalize_embeddings: Normalize embeddings to unit length (applied after quantization)
620
- convert_to_numpy: If True, returns list[np.ndarray], otherwise list[torch.Tensor]
621
- quantization: Quantization type to apply. Options:
622
- - "int8": Int8 tanh quantization (default)
623
- - "binary": Binary tanh quantization
624
-
625
- Returns:
626
- List of numpy arrays or tensors (preserves document structure).
627
- Each element has shape (n_chunks, hidden_dim).
628
- embeddings[0].shape = (2, 1024), embeddings[1].shape = (3, 1024)
629
- Output type depends on quantization method:
630
- - Int8: int8 values in range [-128, 127]
631
- - Binary: float values -1.0 or 1.0
632
- """
633
-
634
- if not isinstance(documents, list) or not all(
635
- isinstance(doc, list) for doc in documents
636
- ):
637
- raise TypeError(
638
- "Input 'documents' must be a list of lists of strings for contextual encoding."
639
- )
640
-
641
- if quantization not in ["int8", "binary"]:
642
- raise ValueError(
643
- f"Unsupported quantization type: '{quantization}'. "
644
- f"Supported types are: 'int8', 'binary'. "
645
- f"Got: {type(quantization).__name__} = '{quantization}'"
646
- )
647
-
648
- self.eval()
649
-
650
- if device is None:
651
- device = next(self.parameters()).device
652
-
653
- all_embeddings = []
654
-
655
- range_iter = range(0, len(documents), batch_size)
656
- if show_progress_bar:
657
- try:
658
- from tqdm import tqdm
659
-
660
- range_iter = tqdm(range_iter, desc="Encoding documents")
661
- except ImportError:
662
- pass
663
-
664
- for i in range_iter:
665
- batch_docs = documents[i : i + batch_size]
666
-
667
- doc_strings = [
668
- self.tokenizer.sep_token.join(chunks) for chunks in batch_docs
669
- ]
670
-
671
- inputs = self.tokenizer(
672
- doc_strings,
673
- padding=True,
674
- truncation=True,
675
- return_tensors="pt",
676
- )
677
- inputs = {k: v.to(device) for k, v in inputs.items()}
678
-
679
- outputs = self.forward(**inputs)
680
- token_embeddings = outputs.last_hidden_state
681
-
682
- batch_chunk_embeddings = self._extract_chunks_from_concatenated(
683
- input_ids=inputs["input_ids"],
684
- token_embeddings=token_embeddings,
685
- attention_mask=inputs["attention_mask"],
686
- )
687
-
688
- batch_chunk_embeddings = [
689
- torch.stack([chunk for chunk in doc_chunks], dim=0)
690
- for doc_chunks in batch_chunk_embeddings
691
- ]
692
-
693
- batch_chunk_embeddings = [
694
- self._flexible_quantizer(
695
- {"sentence_embedding": emb}, quantization=quantization
696
- )["sentence_embedding"]
697
- for emb in batch_chunk_embeddings
698
- ]
699
-
700
- if normalize_embeddings:
701
- batch_chunk_embeddings = [
702
- torch.nn.functional.normalize(emb, p=2, dim=-1)
703
- for emb in batch_chunk_embeddings
704
- ]
705
-
706
- batch_chunk_embeddings = [emb.cpu() for emb in batch_chunk_embeddings]
707
-
708
- all_embeddings.extend(batch_chunk_embeddings)
709
-
710
- if convert_to_numpy:
711
- all_embeddings = [emb.numpy() for emb in all_embeddings]
712
-
713
- return all_embeddings
714
-
715
- def _extract_chunks_from_concatenated(
716
- self,
717
- input_ids: torch.Tensor,
718
- token_embeddings: torch.Tensor,
719
- attention_mask: torch.Tensor,
720
- ) -> list[list[torch.Tensor]]:
721
- """
722
- Extract individual chunk embeddings from concatenated sequence using late chunking.
723
-
724
- This method splits concatenated sequences like "[chunk1][SEP][chunk2][SEP]..."
725
- back into individual chunk embeddings by finding SEP token positions.
726
-
727
- Args:
728
- input_ids: Token IDs (batch_size, seq_len)
729
- token_embeddings: Token embeddings (batch_size, seq_len, hidden_dim)
730
- attention_mask: Attention mask (batch_size, seq_len)
731
-
732
- Returns:
733
- list[list[torch.Tensor]]: List of documents, each containing list of chunk embeddings
734
-
735
- Note:
736
- The sep_token_id is retrieved from self.tokenizer.sep_token_id.
737
- Common values: Qwen2=151643, BERT=102, varies by tokenizer.
738
- """
739
- sep_token_id = self.tokenizer.sep_token_id
740
- batch_size = input_ids.shape[0]
741
-
742
- all_doc_chunks = []
743
-
744
- for batch_idx in range(batch_size):
745
- # non-pad sep tokens
746
- valid_positions = attention_mask[batch_idx].bool()
747
- sep_positions = (
748
- (input_ids[batch_idx] == sep_token_id) & valid_positions
749
- ).nonzero(as_tuple=True)[0]
750
-
751
- chunk_embeddings = []
752
- start_pos = 0
753
-
754
- for sep_pos in sep_positions:
755
- chunk_tokens = token_embeddings[batch_idx, start_pos:sep_pos]
756
- chunk_mask = attention_mask[batch_idx, start_pos:sep_pos]
757
-
758
- chunk_emb = self.mean_pooling(
759
- chunk_tokens.unsqueeze(0), chunk_mask.unsqueeze(0)
760
- ).squeeze(0)
761
-
762
- chunk_embeddings.append(chunk_emb)
763
-
764
- start_pos = sep_pos + 1
765
-
766
- # Handle the last chunk (after the last SEP token)
767
- last_valid_pos = attention_mask[batch_idx].sum().item()
768
-
769
- chunk_tokens = token_embeddings[batch_idx, start_pos:last_valid_pos]
770
- chunk_mask = attention_mask[batch_idx, start_pos:last_valid_pos]
771
-
772
- if chunk_mask.sum() > 0:
773
- chunk_emb = self.mean_pooling(
774
- chunk_tokens.unsqueeze(0), chunk_mask.unsqueeze(0)
775
- ).squeeze(0)
776
- else:
777
- # Empty chunk - create zero embedding
778
- chunk_emb = torch.zeros(
779
- token_embeddings.shape[-1],
780
- device=token_embeddings.device,
781
- dtype=token_embeddings.dtype,
782
- )
783
-
784
- chunk_embeddings.append(chunk_emb)
785
-
786
- all_doc_chunks.append(chunk_embeddings)
787
-
788
- return all_doc_chunks
789
-
790
-
791
- # Register for AutoModel
792
- PPLXQwen3Model.register_for_auto_class("AutoModel")
793
- PPLXQwen3ContextualModel.register_for_auto_class("AutoModel")
794
-
795
- __all__ = [
796
- "PPLXQwen3Config",
797
- "PPLXQwen3Model",
798
- "PPLXQwen3PreTrainedModel",
799
- "PPLXQwen3ContextualModel",
800
- "PPLXQwen3RMSNorm",
801
- "PPLXQwen3MLP",
802
- "PPLXQwen3RotaryEmbedding",
803
- "PPLXQwen3Attention",
804
- "PPLXQwen3DecoderLayer",
805
- ]
 
+ from typing import Callable
  import torch
+ from transformers import Qwen3Model
+ from transformers.cache_utils import Cache
+ from transformers.masking_utils import create_causal_mask
+ from transformers.modeling_outputs import BaseModelOutputWithPooling
+ from transformers.processing_utils import Unpack
+ from transformers.utils import TransformersKwargs
  from .configuration import PPLXQwen3Config


+ # From modeling_t5gemma.py
+ def bidirectional_mask_function(attention_mask: torch.Tensor | None) -> Callable:
      """
+     This creates a bidirectional attention mask.
      """

+     def inner_mask(batch_idx: int, head_idx: int, q_idx: int, kv_idx: int) -> bool:
+         if attention_mask is None:
+             return torch.ones((), dtype=torch.bool)
+         return attention_mask[batch_idx, kv_idx].to(torch.bool)

+     return inner_mask


+ class PPLXQwen3Model(Qwen3Model):
+     _supports_flash_attn = True
+     _supports_sdpa = True

      config_class = PPLXQwen3Config

      def __init__(self, config):
          super().__init__(config)
          self.post_init()

+     def post_init(self):
+         super().post_init()
+         # Override to set all layers to non-causal attention. This works with attn_implementation="flash_attention_2" or "sdpa".
+         for layer in self.layers:
+             layer.self_attn.is_causal = False
+
      def forward(
          self,
+         input_ids: torch.LongTensor | None = None,
+         attention_mask: torch.Tensor | None = None,
+         position_ids: torch.LongTensor | None = None,
+         past_key_values: Cache | None = None,
+         inputs_embeds: torch.FloatTensor | None = None,
+         use_cache: bool | None = None,
+         cache_position: torch.LongTensor | None = None,
+         **kwargs: Unpack[TransformersKwargs],
+     ) -> BaseModelOutputWithPooling:
          if inputs_embeds is None:
              inputs_embeds = self.embed_tokens(input_ids)
+             input_ids = None

+         # We construct a dummy tensor imitating initial positions
+         dummy_cache_position = torch.arange(
+             inputs_embeds.shape[1], device=inputs_embeds.device, dtype=torch.long
+         )
+         attention_mask = {
+             "full_attention": create_causal_mask(
+                 config=self.config,
+                 input_embeds=inputs_embeds,
                  attention_mask=attention_mask,
+                 cache_position=dummy_cache_position,
+                 past_key_values=None,
+                 position_ids=position_ids,
+                 or_mask_function=bidirectional_mask_function(attention_mask),
+             )
+         }

+         outputs = super().forward(
              input_ids=input_ids,
              attention_mask=attention_mask,
              position_ids=position_ids,
+             past_key_values=past_key_values,
              inputs_embeds=inputs_embeds,
+             use_cache=use_cache,
+             cache_position=cache_position,
              **kwargs,
          )
+         return outputs
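For reference, a small sketch of how the or-mask behaves (function name reused from the diff above; the 1x4 padding mask is made up): inner_mask ignores q_idx entirely and only checks whether kv_idx is a real, non-padding key, so passing it as or_mask_function to create_causal_mask lifts the causal constraint and yields full bidirectional attention over non-padded tokens.

import torch

# Hypothetical 1x4 padding mask: the last position is padding.
attention_mask = torch.tensor([[1, 1, 1, 0]])
inner = bidirectional_mask_function(attention_mask)

allowed = [[bool(inner(0, 0, q, kv)) for kv in range(4)] for q in range(4)]
# Every query position may attend to kv 0..2; kv 3 (padding) is never allowed,
# regardless of whether q comes before or after it.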
st_quantize.py CHANGED
@@ -1,5 +1,6 @@
  import torch
  from typing import Literal
+ from sentence_transformers.models import Module


  class Quantizer(torch.nn.Module):
@@ -65,7 +66,7 @@ class BinaryTanhQuantizer(Quantizer):
          return torch.where(x >= 0, 1.0, -1.0)


- class FlexibleQuantizer(torch.nn.Module):
+ class FlexibleQuantizer(Module):
      def __init__(self):
          super().__init__()
          self._int8_quantizer = Int8TanhQuantizer()
@@ -75,6 +76,7 @@ class FlexibleQuantizer(torch.nn.Module):
          self,
          features: dict[str, torch.Tensor],
          quantization: Literal["binary", "int8"] = "int8",
+         **kwargs
      ) -> dict[str, torch.Tensor]:
          if quantization == "int8":
              features["sentence_embedding"] = self._int8_quantizer(
@@ -91,5 +93,17 @@ class FlexibleQuantizer(torch.nn.Module):
          return features

      @classmethod
-     def load(cls, input_path: str):
+     def load(
+         cls,
+         model_name_or_path: str,
+         subfolder: str = "",
+         token: bool | str | None = None,
+         cache_folder: str | None = None,
+         revision: str | None = None,
+         local_files_only: bool = False,
+         **kwargs,
+     ):
          return cls()
+
+     def save(self, output_path: str, *args, **kwargs) -> None:
+         return
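Since FlexibleQuantizer holds no learned state, load() can ignore everything except the class and save() is a no-op. A rough usage sketch (the import path and the 1024-dim random embeddings are assumptions, not part of the diff):

import torch
from st_quantize import FlexibleQuantizer  # assumes the file is importable as-is

quantizer = FlexibleQuantizer.load("any/model/path")  # stateless: just returns cls()
features = {"sentence_embedding": torch.randn(2, 1024)}

int8_out = quantizer(features.copy(), quantization="int8")["sentence_embedding"]
bin_out = quantizer(features.copy(), quantization="binary")["sentence_embedding"]
# int8_out holds int8 values in [-128, 127]; bin_out holds floats of -1.0 or 1.0,
# matching the Int8Tanh/BinaryTanh quantizers defined earlier in this file.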