mmcarpi committed
Commit 4e1b142 · verified · 1 parent: f73cf40

Upload custom model with source code and tokenizer

common.py ADDED
@@ -0,0 +1,172 @@
+ from typing import Optional
+
+ import torch
+ import torch.nn as nn
+
+
+ class CastedLinear(nn.Linear):
+     def forward(self, x: torch.FloatTensor):
+         if self.weight.device.type == "meta":
+             return nn.functional.linear(x, self.weight)
+         return nn.functional.linear(x, self.weight.type_as(x))  # cast the weight to the activation dtype
+
+
+ class FeedForward(nn.Module):
+     def __init__(
+         self,
+         embedding_dim: int,
+         hidden_dim: int,
+         device: torch.device,
+         dtype: torch.dtype | None = None,
+     ):
+         factory_kwargs = dict(device=device, dtype=dtype)
+         super().__init__()
+
+         self.fc1 = CastedLinear(embedding_dim, hidden_dim, bias=False, **factory_kwargs)
+         self.fc2 = CastedLinear(embedding_dim, hidden_dim, bias=False, **factory_kwargs)
+         self.fc3 = CastedLinear(hidden_dim, embedding_dim, bias=False, **factory_kwargs)
+
+     def forward(self, x: torch.FloatTensor) -> torch.FloatTensor:
+         x_fc1 = self.fc1(x)
+         x_fc2 = self.fc2(x)
+
+         x = nn.functional.silu(x_fc1) * x_fc2  # SwiGLU gating
+         x = self.fc3(x)
+         return x
+
+
+ class MoEFeedForward(nn.Module):
+     def __init__(
+         self,
+         embedding_dim: int,
+         hidden_dim: int,
+         num_experts_per_token: int,
+         num_experts: int,
+         device: torch.device,
+         dtype: torch.dtype | None = None,
+     ):
+         assert num_experts > 0, "num_experts should be greater than zero"
+         assert num_experts >= num_experts_per_token > 0, (
+             "num_experts_per_token should be greater than zero and less than or equal to num_experts"
+         )
+         super().__init__()
+         self.num_experts_per_token = num_experts_per_token
+         self.num_experts = num_experts
+         meta_device = torch.device("meta")
+
+         self.gate = CastedLinear(
+             embedding_dim, num_experts, bias=False, device=device, dtype=dtype
+         )
+         self.ff = nn.ModuleList(
+             [
+                 FeedForward(
+                     embedding_dim,
+                     hidden_dim,
+                     device=meta_device,
+                     dtype=dtype,
+                 )
+                 for _ in range(num_experts)
+             ]
+         )
+
+     def forward(self, x: torch.FloatTensor) -> torch.FloatTensor:
+         scores = self.gate(x)
+         topk_scores, topk_indices = torch.topk(
+             scores, self.num_experts_per_token, dim=-1
+         )
+         topk_probs = torch.softmax(topk_scores, dim=-1)
+
+         expert_outputs = []
+         for i in range(self.num_experts):
+             out = self.ff[i](x)
+             expert_outputs.append(out.unsqueeze(-2))
+         expert_outputs = torch.cat(expert_outputs, dim=-2)
+
+         gating_probs = torch.zeros_like(scores)
+         for i in range(self.num_experts_per_token):
+             indices = topk_indices[..., i : i + 1]
+             prob = topk_probs[..., i : i + 1]
+             gating_probs.scatter_(dim=-1, index=indices, src=prob)
+         gating_probs = gating_probs.unsqueeze(-1)
+         y = (gating_probs * expert_outputs).sum(dim=-2)
+         return y
+
+
+ class RMSNorm(nn.Module):
+     def __init__(
+         self,
+         embedding_dim: int,
+         eps: float = 1e-6,
+         bias: bool = False,
+         device: torch.device | None = None,
+         dtype: torch.dtype | None = None,
+     ):
+         factory_kwargs = dict(device=device, dtype=dtype)
+         super().__init__()
+         self.scale = nn.Parameter(torch.ones(embedding_dim, **factory_kwargs))
+         self.eps = eps
+         self.shift = (
+             nn.Parameter(torch.zeros(embedding_dim, **factory_kwargs)) if bias else None
+         )
+         self.dtype = dtype
+
+     def forward(self, x: torch.FloatTensor) -> torch.FloatTensor:
+         input_dtype = x.dtype
+
+         variance = x.to(self.dtype).pow(2).mean(dim=-1, keepdim=True)
+         norm_x = x * torch.rsqrt(variance + self.eps)
+         norm_x = norm_x * self.scale
+
+         if self.shift is not None:
+             norm_x = norm_x + self.shift
+
+         return norm_x.to(input_dtype)
+
+
+ def compute_rope_params(
+     head_dim: int,
+     theta_base: int = 10_000,
+     context_length: int = 4096,
+     dtype: Optional[torch.dtype] = torch.float32,
+     device: Optional[torch.device] = None,
+ ) -> tuple[torch.FloatTensor, torch.FloatTensor]:
+     assert head_dim % 2 == 0, "Embedding dim (head_dim) must be even"
+
+     inv_freq = 1.0 / (
+         theta_base
+         ** (
+             torch.arange(0, head_dim, 2, dtype=dtype, device=device)[
+                 : head_dim // 2
+             ].float()
+             / head_dim
+         )
+     )
+
+     positions = torch.arange(context_length, dtype=dtype, device=device)
+     angles = positions[:, None] * inv_freq[None, :]
+     angles = torch.cat([angles, angles], dim=1)
+
+     cos = torch.cos(angles)
+     sin = torch.sin(angles)
+     return cos, sin
+
+
+ def apply_rope(
+     x: torch.FloatTensor,
+     cos: torch.FloatTensor,
+     sin: torch.FloatTensor,
+     offset: int = 0,
+ ) -> torch.FloatTensor:
+     assert x.dim() == 4, "expected tensor of dimension 4 (B, NH, S, H)"
+     _, _, seq_len, head_dim = x.shape
+     assert head_dim % 2 == 0, "head_dim must be even"
+
+     x1 = x[..., : head_dim // 2]  # first half of the head dimension
+     x2 = x[..., head_dim // 2 :]  # second half of the head dimension
+     cos = cos[offset : offset + seq_len, :].unsqueeze(0).unsqueeze(0)
+     sin = sin[offset : offset + seq_len, :].unsqueeze(0).unsqueeze(0)
+     rotated = torch.cat((-x2, x1), dim=-1)
+     x_rotated = (x * cos) + (rotated * sin)
+     x_rotated = x_rotated.type_as(x)
+
+     return x_rotated
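
A minimal sketch (not part of the commit) of how the two RoPE helpers above compose, assuming common.py is importable from the working directory; the shapes follow the (B, NH, S, H) layout that apply_rope asserts:

import torch
from common import compute_rope_params, apply_rope

# precompute the cos/sin tables once for the full context length
cos, sin = compute_rope_params(head_dim=128, theta_base=10_000, context_length=4096)

# dummy query tensor: batch=2, num_heads=8, seq_len=16, head_dim=128
q = torch.randn(2, 8, 16, 128)

# offset=0 for a fresh prompt; during cached decoding it would be the number
# of tokens already held in the KV cache
q_rot = apply_rope(q, cos, sin, offset=0)
print(q_rot.shape)  # torch.Size([2, 8, 16, 128])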
config.json ADDED
@@ -0,0 +1,43 @@
+ {
+   "architectures": [
+     "FlexQwenForCausalLM"
+   ],
+   "attention_bias": false,
+   "attention_dropout": 0.0,
+   "auto_map": {
+     "AutoModel": "qwen.FlexQwen",
+     "AutoModelForCausalLM": "qwen.FlexQwenForCausalLM",
+     "AutoModelForSequenceClassification": "qwen.FlexQwenForSequenceClassification"
+   },
+   "cls_token_id": 1,
+   "context_length": 4096,
+   "embedding_dim": 1024,
+   "head_dim": 128,
+   "hidden_act": "silu",
+   "hidden_dim": 2048,
+   "hidden_size": 4096,
+   "initializer_range": 0.02,
+   "intermediate_size": 22016,
+   "max_position_embeddings": 32768,
+   "max_window_layers": 28,
+   "model_type": "qwen3",
+   "moe_hidden_dim": 512,
+   "moe_num_experts": 0,
+   "moe_num_experts_per_token": -1,
+   "num_attention_heads": 8,
+   "num_hidden_layers": 32,
+   "num_key_value_heads": 32,
+   "num_kv_groups": 8,
+   "pad_token_id": 3,
+   "qk_norm": true,
+   "rms_norm_eps": 1e-06,
+   "rope_scaling": null,
+   "rope_theta": 10000,
+   "sliding_window": 4096,
+   "tie_word_embeddings": false,
+   "torch_dtype": "float32",
+   "transformers_version": "4.51.3",
+   "use_cache": true,
+   "use_sliding_window": false,
+   "vocab_size": 64000
+ }
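
The auto_map block above points the Auto classes at qwen.py, so the repository is meant to be loaded with remote code enabled. A hedged sketch; the repo id below is a placeholder, since the actual id is not stated in this commit:

from transformers import AutoModelForCausalLM, AutoTokenizer

repo_id = "your-username/your-repo"  # placeholder repo id, not from this commit

tokenizer = AutoTokenizer.from_pretrained(repo_id)
model = AutoModelForCausalLM.from_pretrained(
    repo_id,
    trust_remote_code=True,  # lets auto_map resolve qwen.FlexQwenForCausalLM
)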
generation_config.json ADDED
@@ -0,0 +1,5 @@
+ {
+   "_from_model_config": true,
+   "pad_token_id": 3,
+   "transformers_version": "4.51.3"
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e340dba542c92d7d93cbfd27702a8e3d188af47e21cfe39873ea91228061e223
+ size 1866802096
qwen.py ADDED
@@ -0,0 +1,600 @@
+ from dataclasses import dataclass
+ from typing import Optional
+ import torch
+ import torch.nn as nn
+
+ from transformers import PreTrainedModel, Qwen3Config, GenerationMixin
+ from transformers.utils import ModelOutput
+ from transformers.modeling_outputs import (
+     SequenceClassifierOutput,
+     CausalLMOutputWithPast,
+ )
+
+ from .common import (
+     FeedForward,
+     MoEFeedForward,
+     RMSNorm,
+     compute_rope_params,
+     apply_rope,
+     CastedLinear,
+ )
+
+
+ class FlexQwenConfig(Qwen3Config):
+     def __init__(
+         self,
+         vocab_size: int = 64000,
+         embedding_dim: int = 1024,
+         hidden_dim: int = 2048,
+         num_attention_heads: int = 8,
+         num_kv_groups: int = 8,
+         head_dim: int = 128,
+         qk_norm: bool = True,
+         moe_num_experts: int = 0,
+         moe_num_experts_per_token: int = -1,
+         moe_hidden_dim: int = 512,
+         num_hidden_layers: int = 32,
+         context_length: int = 1024,
+         rms_norm_eps: float = 1e-6,
+         rope_theta: int = 10000,
+         initializer_range: float = 0.02,
+         cls_token_id: int = 1,
+         pad_token_id: int = 3,
+         tie_word_embeddings: bool = False,
+         **kwargs,
+     ):
+         super().__init__(
+             cls_token_id=cls_token_id,
+             pad_token_id=pad_token_id,
+             tie_word_embeddings=tie_word_embeddings,
+             **kwargs,
+         )
+
+         # Vocab & Embeddings
+         self.vocab_size = vocab_size
+         self.embedding_dim = embedding_dim
+         self.hidden_dim = hidden_dim
+
+         # Attention Mechanism
+         self.num_attention_heads = num_attention_heads
+         self.num_kv_groups = num_kv_groups
+         self.head_dim = head_dim
+         self.qk_norm = qk_norm
+
+         # Feed-Forward & MoE
+         self.moe_num_experts = moe_num_experts
+         self.moe_num_experts_per_token = moe_num_experts_per_token
+         self.moe_hidden_dim = moe_hidden_dim
+
+         # General Architecture
+         self.num_hidden_layers = num_hidden_layers
+         self.context_length = context_length
+         self.rms_norm_eps = rms_norm_eps
+         self.rope_theta = rope_theta
+
+         # Initialization
+         self.initializer_range = initializer_range
+
+         # Standard HF Config params
+         self.tie_word_embeddings = tie_word_embeddings
+
+
+ class FlexQwenPreTrainedModel(PreTrainedModel):
+     config_class = FlexQwenConfig
+     _supports_cache_class = True
+
+     def _init_weights(self, module):
+         if isinstance(module, nn.Embedding):
+             module.weight.data.uniform_(
+                 -self.config.initializer_range, self.config.initializer_range
+             )
+         # elif isinstance(module, CastedLinear):
+         #     module.weight.data.uniform_()
+
+
+ class GroupedQueryAttention(nn.Module):
+     def __init__(
+         self,
+         in_features: int,
+         num_heads: int,
+         num_kv_groups: int,
+         head_dim: int | None = None,
+         qk_norm: bool = False,
+         rms_norm_eps: float = 1e-6,
+         device: torch.device | None = None,
+         dtype: torch.dtype | None = None,
+     ):
+         assert num_heads % num_kv_groups == 0, (
+             "num_heads must be divisible by num_kv_groups"
+         )
+         factory_kwargs = dict(device=device, dtype=dtype)
+         super().__init__()
+
+         self.num_heads = num_heads
+         self.num_kv_groups = num_kv_groups
+         self.group_size = num_heads // num_kv_groups
+
+         if head_dim is None:
+             assert in_features % num_heads == 0, (
+                 "input_dim must be divisible by num_heads"
+             )
+             head_dim = in_features // num_heads
+
+         self.head_dim = head_dim
+         self.out_features = num_heads * head_dim
+
+         self.wq = CastedLinear(
+             in_features, self.out_features, bias=False, **factory_kwargs
+         )
+         self.wkv = CastedLinear(
+             in_features, 2 * num_kv_groups * head_dim, bias=False, **factory_kwargs
+         )
+
+         self.out_proj = CastedLinear(
+             self.out_features, in_features, bias=False, **factory_kwargs
+         )
+
+         if qk_norm:
+             self.q_norm = RMSNorm(head_dim, eps=rms_norm_eps, **factory_kwargs)
+             self.k_norm = RMSNorm(head_dim, eps=rms_norm_eps, **factory_kwargs)
+         else:
+             self.q_norm = self.k_norm = None
+
+     def forward(
+         self,
+         x: torch.FloatTensor,
+         cos: torch.FloatTensor,
+         sin: torch.FloatTensor,
+         attention_mask: Optional[torch.BoolTensor] = None,
+         past_key_value: Optional[tuple[torch.Tensor, torch.Tensor]] = None,
+         cache_position: Optional[torch.LongTensor] = None,
+     ) -> tuple[torch.FloatTensor, tuple[torch.Tensor, torch.Tensor]]:
+         batch_size, num_tokens, _ = x.shape
+
+         query = self.wq(x)
+         key, value = self.wkv(x).chunk(2, dim=-1)
+
+         query = query.view(
+             batch_size, num_tokens, self.num_heads, self.head_dim
+         ).transpose(1, 2)
+
+         key = key.view(
+             batch_size, num_tokens, self.num_kv_groups, self.head_dim
+         ).transpose(1, 2)
+
+         value = value.view(
+             batch_size, num_tokens, self.num_kv_groups, self.head_dim
+         ).transpose(1, 2)
+
+         if self.q_norm:
+             query = self.q_norm(query)
+         if self.k_norm:
+             key = self.k_norm(key)
+
+         offset = 0
+         if cache_position is None:
+             kv_seq_len = key.shape[-2]
+             if past_key_value is not None:
+                 kv_seq_len += past_key_value[0].shape[2]
+             offset = kv_seq_len - num_tokens  # RoPE offset = tokens already in the cache
+         else:
+             offset = cache_position[0].item()
+
+         query = apply_rope(query, cos, sin, offset=offset)
+         key = apply_rope(key, cos, sin, offset=offset)
+
+         if past_key_value is not None:
+             past_key, past_value = past_key_value
+             key = torch.cat([past_key, key], dim=-2)
+             value = torch.cat([past_value, value], dim=-2)
+
+         present_key_value = (key, value)
+
+         attn_output = nn.functional.scaled_dot_product_attention(
+             query,
+             key,
+             value,
+             attn_mask=attention_mask,
+             dropout_p=0.0,
+             enable_gqa=True,  # broadcast the kv heads across query-head groups
+         )
+         out = self.out_proj(
+             attn_output.transpose(1, 2).reshape(
+                 batch_size, num_tokens, self.out_features
+             )
+         )
+         return out, present_key_value
+
+
+ class Transformer(nn.Module):
+     def __init__(
+         self,
+         embedding_dim: int,
+         hidden_dim: int,
+         num_heads: int,
+         head_dim: int,
+         num_kv_groups: int,
+         qk_norm: bool = False,
+         moe_num_experts_per_token: int = 8,
+         moe_num_experts: int = 0,
+         moe_hidden_dim: int = 128,
+         rms_norm_eps: float = 1e-6,
+         device: torch.device | None = None,
+         dtype: torch.dtype | None = None,
+     ):
+         factory_kwargs = dict(device=device, dtype=dtype)
+         super().__init__()
+         self.attn = GroupedQueryAttention(
+             in_features=embedding_dim,
+             num_heads=num_heads,
+             head_dim=head_dim,
+             num_kv_groups=num_kv_groups,
+             qk_norm=qk_norm,
+             **factory_kwargs,
+         )
+
+         if moe_num_experts > 0:
+             self.ff = MoEFeedForward(
+                 embedding_dim=embedding_dim,
+                 hidden_dim=moe_hidden_dim,
+                 num_experts_per_token=moe_num_experts_per_token,
+                 num_experts=moe_num_experts,
+                 **factory_kwargs,
+             )
+         else:
+             self.ff = FeedForward(
+                 embedding_dim, hidden_dim=hidden_dim, **factory_kwargs
+             )
+         self.norm1 = RMSNorm(embedding_dim, eps=rms_norm_eps, **factory_kwargs)
+         self.norm2 = RMSNorm(embedding_dim, eps=rms_norm_eps, **factory_kwargs)
+
+     def forward(
+         self,
+         x: torch.FloatTensor,
+         cos: torch.FloatTensor,
+         sin: torch.FloatTensor,
+         attention_mask: Optional[torch.BoolTensor] = None,
+         past_key_value: Optional[tuple[torch.Tensor, torch.Tensor]] = None,
+         cache_position: Optional[torch.LongTensor] = None,
+     ) -> tuple[torch.FloatTensor, tuple[torch.FloatTensor, torch.FloatTensor]]:
+         residual = x
+         x = self.norm1(x)
+         x, present_key_value = self.attn(
+             x,
+             cos,
+             sin,
+             attention_mask=attention_mask,
+             past_key_value=past_key_value,
+             cache_position=cache_position,
+         )
+         x += residual
+
+         residual = x
+         x = self.norm2(x)
+         x = self.ff(x)
+         x += residual
+
+         return x, present_key_value
+
+
+ @dataclass
+ class FlexQwenOutputWithPast(ModelOutput):
+     last_hidden_state: torch.FloatTensor
+     past_key_values: Optional[tuple[tuple[torch.Tensor, torch.Tensor], ...]] = None
+
+
+ class FlexQwen(FlexQwenPreTrainedModel):
+     config_class = FlexQwenConfig
+
+     def __init__(
+         self,
+         config: FlexQwenConfig,
+         device: Optional[torch.device] = None,
+         dtype: Optional[torch.dtype] = None,
+     ):
+         super().__init__(config)
+
+         self.embed = nn.Embedding(
+             config.vocab_size,
+             config.embedding_dim,
+             padding_idx=config.pad_token_id,
+             device=device,
+             dtype=dtype,
+         )
+
+         self.transformer_blocks = nn.ModuleList(
+             [
+                 Transformer(
+                     embedding_dim=config.embedding_dim,
+                     hidden_dim=config.hidden_dim,
+                     num_heads=config.num_attention_heads,
+                     head_dim=config.head_dim,
+                     num_kv_groups=config.num_kv_groups,
+                     qk_norm=config.qk_norm,
+                     moe_num_experts_per_token=config.moe_num_experts_per_token,
+                     moe_num_experts=config.moe_num_experts,
+                     moe_hidden_dim=config.moe_hidden_dim,
+                     rms_norm_eps=config.rms_norm_eps,
+                     device=device,
+                     dtype=dtype,
+                 )
+                 for _ in range(config.num_hidden_layers)
+             ]
+         )
+
+         self.final_norm = RMSNorm(
+             config.embedding_dim, eps=config.rms_norm_eps, device=device, dtype=dtype
+         )
+
+         cos, sin = compute_rope_params(
+             head_dim=config.head_dim,
+             theta_base=config.rope_theta,
+             context_length=config.context_length,
+             dtype=dtype,
+             device=device,
+         )
+
+         self.register_buffer("cos", cos, persistent=False)
+         self.register_buffer("sin", sin, persistent=False)
+         self.config = config
+         self.current_pos = 0
+
+     def forward(
+         self,
+         input_ids: Optional[torch.LongTensor] = None,
+         inputs_embeds: Optional[torch.FloatTensor] = None,
+         attention_mask: Optional[torch.BoolTensor] = None,
+         past_key_values: Optional[tuple[torch.FloatTensor, torch.FloatTensor]] = None,
+         cache_position: Optional[torch.LongTensor] = None,
+         use_cache: Optional[bool] = None,
+         is_causal: bool = True,
+         return_dict: bool = True,
+     ) -> FlexQwenOutputWithPast:
+         if input_ids is not None and inputs_embeds is not None:
+             raise ValueError("Received both input_ids and inputs_embeds. Pass only one.")
+         if input_ids is None and inputs_embeds is None:
+             raise ValueError("Exactly one of input_ids, inputs_embeds is required.")
+
+         if input_ids is not None:
+             if input_ids.dim() == 1:
+                 input_ids = input_ids.unsqueeze(0)
+             x = self.embed(input_ids)
+         else:
+             x = inputs_embeds
+
+         seq_length = x.shape[1]
+         base_mask = torch.ones(
+             (seq_length, seq_length), dtype=torch.bool, device=x.device
+         )
+
+         if is_causal:
+             base_mask = torch.tril(base_mask)
+         # non-causal: keep the all-True mask; boolean SDPA masks treat True as "may attend"
+
+         if attention_mask is not None:
+             # padded key positions (attention_mask == 0) must be False in the boolean mask
+             padding_mask = attention_mask.to(torch.bool).unsqueeze(1).unsqueeze(2)
+             attention_mask = base_mask.unsqueeze(0).unsqueeze(1) & padding_mask
+         else:
+             attention_mask = base_mask.unsqueeze(0).unsqueeze(1)
+
+         next_kv_cache = [] if use_cache else None
+         for i, block in enumerate(self.transformer_blocks):
+             past_kv_cache_block = (
+                 past_key_values[i]
+                 if past_key_values is not None and len(past_key_values) > 0
+                 else None
+             )
+             x, block_present_kv_cache = block(
+                 x,
+                 self.cos,
+                 self.sin,
+                 attention_mask=attention_mask,
+                 past_key_value=past_kv_cache_block,
+                 cache_position=cache_position,
+             )
+             if use_cache:
+                 next_kv_cache.append(block_present_kv_cache)
+
+         x = self.final_norm(x)
+         output = FlexQwenOutputWithPast(
+             last_hidden_state=x,
+             past_key_values=tuple(next_kv_cache) if use_cache else None,
+         )
+
+         if not return_dict:
+             return output.to_tuple()
+
+         return output
+
+
+ class FlexQwenForCausalLM(FlexQwenPreTrainedModel, GenerationMixin):
+     config_class = FlexQwenConfig
+
+     def __init__(
+         self,
+         config: FlexQwenConfig,
+         device: Optional[torch.device] = None,
+         dtype: Optional[torch.dtype] = None,
+         **kwargs,
+     ):
+         super().__init__(config)
+         self.model = FlexQwen(config, device=device, dtype=dtype)
+         self.lm_head = CastedLinear(
+             config.embedding_dim,
+             config.vocab_size,
+             bias=False,
+             device=device,
+             dtype=dtype,
+         )
+
+     def forward(
+         self,
+         input_ids: Optional[torch.LongTensor] = None,
+         labels: Optional[torch.LongTensor] = None,
+         return_dict: bool = True,
+         use_cache: Optional[bool] = None,
+         **kwargs,
+     ) -> CausalLMOutputWithPast:
+         outputs: FlexQwenOutputWithPast = self.model(
+             input_ids=input_ids,
+             is_causal=True,
+             use_cache=use_cache,
+             return_dict=True,
+             **kwargs,
+         )
+
+         logits = self.lm_head(outputs.last_hidden_state).to(torch.float32)
+         loss = None
+         if labels is not None:
+             if labels.dim() == 1:
+                 labels = labels.unsqueeze(0)
+             loss = nn.functional.cross_entropy(
+                 logits.view(-1, logits.size(-1)),
+                 labels.view(-1),
+                 ignore_index=-100,
+                 reduction="sum" if self.training else "mean",
+             )
+
+         output = CausalLMOutputWithPast(
+             logits=logits,
+             loss=loss,
+             past_key_values=outputs.past_key_values if use_cache else None,
+         )
+
+         if not return_dict:
+             return output.to_tuple()
+
+         return output
+
+
+ class FlexQwenForSequenceClassification(FlexQwenPreTrainedModel):
+     config_class = FlexQwenConfig
+
+     def __init__(
+         self,
+         config: FlexQwenConfig,
+         device: Optional[torch.device] = None,
+         dtype: Optional[torch.dtype] = None,
+     ):
+         super().__init__(config)
+         self.num_labels = config.num_labels
+         self.model = FlexQwen(config, device=device, dtype=dtype)
+         self.score = CastedLinear(config.embedding_dim, self.num_labels, bias=False)
+
+     def forward(
+         self,
+         input_ids: Optional[torch.LongTensor] = None,
+         attention_mask: Optional[torch.BoolTensor] = None,
+         labels: Optional[torch.LongTensor] = None,
+         return_dict: Optional[bool] = None,
+         **kwargs,
+     ) -> SequenceClassifierOutput:
+         return_dict = (
+             return_dict if return_dict is not None else self.config.use_return_dict
+         )
+
+         outputs: FlexQwenOutputWithPast = self.model(
+             input_ids=input_ids,
+             attention_mask=attention_mask,
+             return_dict=True,
+             **kwargs,
+         )
+
+         sequence_lengths = (
+             torch.eq(attention_mask, 1).int().argmax(-1)
+             if attention_mask is not None
+             else -1
+         )
+
+         hidden_states = outputs.last_hidden_state
+         pooled_states = hidden_states[
+             torch.arange(hidden_states.shape[0], device=hidden_states.device),
+             sequence_lengths,
+         ]
+         logits = self.score(pooled_states)
+
+         loss = None
+         if labels is not None:
+             loss_fct = nn.CrossEntropyLoss(ignore_index=-100)
+             loss = loss_fct(
+                 logits.view(-1, self.num_labels),
+                 labels.view(-1),
+             )
+
+         if not return_dict:
+             output = (logits,) + outputs[1:]
+             return (loss,) + output if loss is not None else output
+
+         return SequenceClassifierOutput(
+             loss=loss,
+             logits=logits,
+         )
+
+
+ # def check_grad(is_causal):
+ #     device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+ #     config = FlexQwenConfig(vocab_size=2**10)
+ #     model = FlexQwenForCausalLM(config=config, device=device)
+ #     x = torch.randn(
+ #         1,
+ #         config.context_length,
+ #         config.embedding_dim,
+ #         requires_grad=True,
+ #         device=device,
+ #     )
+ #     output = model(inputs_embeds=x, attention_mask=None, is_causal=is_causal)
+ #     output = output.logits
+ #     t = config.context_length // 2
+ #     loss = output[:, t, :].sum()
+ #     loss.backward()
+ #     grad_up_to_t = x.grad[:, : t + 1, :]
+ #     has_grad_past = torch.all(grad_up_to_t != 0)
+ #     grad_after_t = x.grad[:, t + 1 :, :]
+ #     has_grad_future = torch.any(grad_after_t != 0)
+
+ #     print(f"{is_causal=} {has_grad_past=} {has_grad_future=}")
+
+
+ # if __name__ == "__main__":
+ #     device = torch.device("cuda:0")
+ #     config = FlexQwenConfig(vocab_size=2**10)
+
+ #     model_lm = FlexQwenForCausalLM(config=config, device=device)
+ #     input_ids = torch.arange(
+ #         start=0,
+ #         end=config.context_length - 1,
+ #         device=device,
+ #     ).unsqueeze(0)
+ #     labels_seq = torch.arange(
+ #         start=1,
+ #         end=config.context_length,
+ #         device=device,
+ #     ).unsqueeze(0)
+
+ #     output_lm: FlexQwenOutputWithPast = model_lm(
+ #         input_ids, labels=labels_seq, is_causal=True
+ #     )
+ #     print(f"LM Logits shape: {output_lm.logits.shape}")
+ #     print(f"LM Loss: {output_lm.loss.item()}")
+
+ #     config.num_labels = 3
+ #     model_seq = FlexQwenForSequenceClassification(config=config, device=device)
+ #     input_ids = torch.randint(0, config.vocab_size, (4, 16), device=device)
+ #     attention_mask = torch.ones_like(input_ids)
+
+ #     attention_mask[2, 10:] = 0
+ #     labels_seq = torch.randint(0, config.num_labels, (4,), device=device)
+ #     output_seq = model_seq(
+ #         input_ids=input_ids, attention_mask=attention_mask, labels=labels_seq
+ #     )
+
+ #     print(f"Seq Logits shape: {output_seq.logits.shape}")
+ #     print(f"Seq Loss: {output_seq.loss.item()}")
+
+ #     peak_memory_allocated = torch.cuda.max_memory_allocated() // 1024 // 1024
+ #     reserved_memory = torch.cuda.max_memory_reserved() // 1024 // 1024
+
+ #     print(f"Peak memory allocated: {peak_memory_allocated} MB")
+ #     print(f"Reserved memory: {reserved_memory} MB")
+ # check_grad(is_causal=True)
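
For a quick local smoke test of the classes above (no checkpoint needed), something along the lines of the commented-out __main__ block works. A trimmed sketch on CPU with a tiny config; it assumes qwen.py and common.py sit in a package named flexqwen (hypothetical name) so the relative import resolves, and a PyTorch recent enough for the enable_gqa flag used in GroupedQueryAttention:

import torch
from flexqwen.qwen import FlexQwenConfig, FlexQwenForCausalLM  # hypothetical package layout

config = FlexQwenConfig(vocab_size=1024, num_hidden_layers=2, context_length=128)
model = FlexQwenForCausalLM(config=config, device=torch.device("cpu"))
model.eval()

# next-token setup mirroring the commented example: predict token t+1 from token t
input_ids = torch.arange(0, 16).unsqueeze(0)
labels = torch.arange(1, 17).unsqueeze(0)

with torch.no_grad():
    out = model(input_ids, labels=labels)

print(out.logits.shape)  # (1, 16, 1024)
print(out.loss.item())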
special_tokens_map.json ADDED
@@ -0,0 +1,37 @@
+ {
+   "cls_token": {
+     "content": "[CLS]",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "mask_token": {
+     "content": "[MASK]",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "pad_token": {
+     "content": "[PAD]",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "sep_token": {
+     "content": "[SEP]",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "unk_token": {
+     "content": "[UNK]",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   }
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,53 @@
+ {
+   "added_tokens_decoder": {
+     "0": {
+       "content": "[UNK]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "1": {
+       "content": "[CLS]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "2": {
+       "content": "[SEP]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "3": {
+       "content": "[PAD]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "4": {
+       "content": "[MASK]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     }
+   },
+   "clean_up_tokenization_spaces": false,
+   "cls_token": "[CLS]",
+   "extra_special_tokens": {},
+   "mask_token": "[MASK]",
+   "model_max_length": 1000000000000000019884624838656,
+   "pad_token": "[PAD]",
+   "sep_token": "[SEP]",
+   "tokenizer_class": "PreTrainedTokenizer",
+   "unk_token": "[UNK]"
+ }
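
The special-token ids above ([UNK]=0, [CLS]=1, [SEP]=2, [PAD]=3, [MASK]=4) line up with cls_token_id=1 and pad_token_id=3 in config.json. A small sketch of loading the tokenizer from a local copy of these files; the directory path is an assumption:

from transformers import AutoTokenizer

# directory holding tokenizer.json, tokenizer_config.json and special_tokens_map.json
tok = AutoTokenizer.from_pretrained("./")
batch = tok(["first example", "a slightly longer second example"], padding=True, return_tensors="pt")
print(batch["input_ids"].shape, batch["attention_mask"].shape)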