travis-xia committed on Jul 26, 2025

Commit

3080ac4

verified ·

1 Parent(s): 2586bbc

Add files using upload-large-folder tool

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

.gitattributes +3 -0
checkpoint-15200/model-00001-of-00004.safetensors +3 -0
checkpoint-15200/model-00002-of-00004.safetensors +3 -0
checkpoint-15200/model-00003-of-00004.safetensors +3 -0
checkpoint-15200/model-00004-of-00004.safetensors +3 -0
checkpoint-15200/modeling_qwen2_mla.py +218 -0
checkpoint-15200/rng_state_0.pth +3 -0
checkpoint-15200/rng_state_1.pth +3 -0
checkpoint-15200/rng_state_10.pth +3 -0
checkpoint-15200/rng_state_11.pth +3 -0
checkpoint-15200/rng_state_12.pth +3 -0
checkpoint-15200/rng_state_13.pth +3 -0
checkpoint-15200/rng_state_14.pth +3 -0
checkpoint-15200/rng_state_15.pth +3 -0
checkpoint-15200/rng_state_2.pth +3 -0
checkpoint-15200/rng_state_3.pth +3 -0
checkpoint-15200/rng_state_4.pth +3 -0
checkpoint-15200/rng_state_5.pth +3 -0
checkpoint-15200/rng_state_6.pth +3 -0
checkpoint-15200/rng_state_7.pth +3 -0
checkpoint-15200/rng_state_8.pth +3 -0
checkpoint-15200/rng_state_9.pth +3 -0
checkpoint-15200/special_tokens_map.json +31 -0
checkpoint-15200/tokenizer.json +3 -0
checkpoint-15200/training_args.bin +3 -0
checkpoint-15386/configuration_qwen2_mla.py +24 -0
checkpoint-15386/merges.txt +0 -0
checkpoint-15386/model-00001-of-00004.safetensors +3 -0
checkpoint-15386/model-00002-of-00004.safetensors +3 -0
checkpoint-15386/model-00003-of-00004.safetensors +3 -0
checkpoint-15386/model-00004-of-00004.safetensors +3 -0
checkpoint-15386/rng_state_0.pth +3 -0
checkpoint-15386/rng_state_1.pth +3 -0
checkpoint-15386/rng_state_10.pth +3 -0
checkpoint-15386/rng_state_11.pth +3 -0
checkpoint-15386/rng_state_12.pth +3 -0
checkpoint-15386/rng_state_13.pth +3 -0
checkpoint-15386/rng_state_14.pth +3 -0
checkpoint-15386/rng_state_15.pth +3 -0
checkpoint-15386/rng_state_2.pth +3 -0
checkpoint-15386/rng_state_3.pth +3 -0
checkpoint-15386/rng_state_4.pth +3 -0
checkpoint-15386/rng_state_5.pth +3 -0
checkpoint-15386/rng_state_6.pth +3 -0
checkpoint-15386/rng_state_7.pth +3 -0
checkpoint-15386/rng_state_8.pth +3 -0
checkpoint-15386/rng_state_9.pth +3 -0
checkpoint-15386/tokenizer.json +3 -0
checkpoint-15386/tokenizer_config.json +215 -0
checkpoint-15386/training_args.bin +3 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+checkpoint-15386/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+tokenizer.json filter=lfs diff=lfs merge=lfs -text
+checkpoint-15200/tokenizer.json filter=lfs diff=lfs merge=lfs -text

checkpoint-15200/model-00001-of-00004.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:949869dc87408865accac4b7f107d22c440eb264fd656fbad1f8668560306e53
+size 4997418264

checkpoint-15200/model-00002-of-00004.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3631ec046f0f1a318346107c6531a77ab442ec9264de1ca39c4858a32b68955a
+size 4999754984

checkpoint-15200/model-00003-of-00004.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d09060fcb08ce9cca2fdf28b8f68ef24b1774b230e5e7155ee7f361c08066236
+size 4998910488

checkpoint-15200/model-00004-of-00004.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:03847a6be440de6f82575fe8df560651c519c50eaf09572846272518009ad29f
+size 1324572984

checkpoint-15200/modeling_qwen2_mla.py ADDED Viewed

	@@ -0,0 +1,218 @@

+from typing import Optional, Tuple, Union
+import torch
+from torch import nn
+import torch.nn.functional as F
+from transformers.cache_utils import Cache
+from transformers.modeling_flash_attention_utils import FlashAttentionKwargs
+from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
+from transformers.processing_utils import Unpack
+from transformers.models.qwen2.modeling_qwen2 import (
+    Qwen2Model,
+    Qwen2DecoderLayer,
+    Qwen2PreTrainedModel,
+    Qwen2ForCausalLM
+)
+from .configuration_qwen2_mla import Qwen2MLAConfig
+from transformers.models.gemma2.modeling_gemma2 import (
+    eager_attention_forward,    # for supporting softcap
+    logger
+)
+from transformers.models.deepseek_v3.modeling_deepseek_v3 import (
+    apply_rotary_pos_emb_interleave,
+    DeepseekV3RMSNorm
+)
+class MLAAttention(nn.Module):
+    """
+    Modified from `transformers.models.deepseek_v3.modeling_deepseek_v3.DeepseekV3Attention`,
+    adding support for attention bias and attention-logit softcapping.
+    """
+    def __init__(self, config, layer_idx: int):
+        super().__init__()
+        self.config = config
+        self.layer_idx = layer_idx
+        self.num_key_value_groups = config.num_attention_heads // config.num_key_value_heads
+        self.attention_dropout = config.attention_dropout
+        self.num_heads = config.num_attention_heads
+        self.rope_theta = config.rope_theta
+        self.q_lora_rank = config.q_lora_rank
+        self.kv_lora_rank = config.kv_lora_rank
+        self.qk_rope_head_dim = config.qk_rope_head_dim
+        self.qk_nope_head_dim = config.qk_nope_head_dim
+        self.v_head_dim = config.v_head_dim
+        self.qk_head_dim = config.qk_head_dim
+        self.qk_latent_layernorm = getattr(config, "qk_latent_layernorm", True)
+        self.is_causal = True
+        if self.q_lora_rank is None:
+            self.q_proj = nn.Linear(config.hidden_size, self.num_heads * self.qk_head_dim, bias=config.attention_bias)
+        else:
+            self.q_a_proj = nn.Linear(config.hidden_size, config.q_lora_rank, bias=False)
+            if self.qk_latent_layernorm:
+                self.q_a_layernorm = DeepseekV3RMSNorm(self.q_lora_rank)
+            self.q_b_proj = nn.Linear(config.q_lora_rank, self.num_heads * self.qk_head_dim, bias=config.attention_bias)
+        self.kv_a_proj_with_mqa = nn.Linear(
+            config.hidden_size,
+            self.kv_lora_rank + self.qk_rope_head_dim,
+            bias=config.attention_bias,
+        )
+        if self.qk_latent_layernorm:
+            self.kv_a_layernorm = DeepseekV3RMSNorm(self.kv_lora_rank)
+        self.kv_b_proj = nn.Linear(
+            self.kv_lora_rank,
+            self.num_heads * (self.qk_nope_head_dim + self.v_head_dim),
+            bias=False,
+        )
+        self.o_proj = nn.Linear(
+            self.num_heads * self.v_head_dim,
+            config.hidden_size,
+            bias=False,
+        )
+        self.scaling = self.qk_head_dim**-0.5
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        position_embeddings: Tuple[torch.Tensor, torch.Tensor],
+        attention_mask: Optional[torch.Tensor],
+        past_key_value: Optional[Cache] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        **kwargs: Unpack[FlashAttentionKwargs],
+    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+        batch_size, seq_length = hidden_states.shape[:-1]
+        query_shape = (batch_size, seq_length, -1, self.qk_head_dim)
+        key_shape = (batch_size, seq_length, -1, self.qk_nope_head_dim + self.v_head_dim)
+        if self.q_lora_rank is None:
+            q_states = self.q_proj(hidden_states)
+        elif self.qk_latent_layernorm:
+            q_states = self.q_b_proj(self.q_a_layernorm(self.q_a_proj(hidden_states)))
+        else:
+            q_states = self.q_b_proj(self.q_a_proj(hidden_states))
+        q_states = q_states.view(query_shape).transpose(1, 2)
+        q_pass, q_rot = torch.split(q_states, [self.qk_nope_head_dim, self.qk_rope_head_dim], dim=-1)
+        compressed_kv = self.kv_a_proj_with_mqa(hidden_states)
+        k_pass, k_rot = torch.split(compressed_kv, [self.kv_lora_rank, self.qk_rope_head_dim], dim=-1)
+        if self.qk_latent_layernorm:
+            k_pass = self.kv_b_proj(self.kv_a_layernorm(k_pass)).view(key_shape).transpose(1, 2)
+        else:
+            k_pass = self.kv_b_proj(k_pass).view(key_shape).transpose(1, 2)
+        k_pass, value_states = torch.split(k_pass, [self.qk_nope_head_dim, self.v_head_dim], dim=-1)
+        k_rot = k_rot.view(batch_size, 1, seq_length, self.qk_rope_head_dim)
+        cos, sin = position_embeddings
+        q_rot, k_rot = apply_rotary_pos_emb_interleave(q_rot, k_rot, cos, sin)
+        k_rot = k_rot.expand(*k_pass.shape[:-1], -1)
+        query_states = torch.cat((q_pass, q_rot), dim=-1)
+        key_states = torch.cat((k_pass, k_rot), dim=-1)
+        if past_key_value is not None:
+            # sin and cos are specific to RoPE models; cache_position needed for the static cache
+            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
+            key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
+        if self.config._attn_implementation == "flash_attention_2" and self.qk_head_dim != self.v_head_dim:
+            value_states = F.pad(value_states, [0, self.qk_head_dim - self.v_head_dim])
+        attention_interface = eager_attention_forward
+        if self.config._attn_implementation != "eager":
+            if self.config._attn_implementation == "sdpa" and kwargs.get("output_attentions", False):
+                logger.warning_once(
+                    "`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to "
+                    'eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
+                )
+            else:
+                attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+        attn_output, attn_weights = attention_interface(
+            self,
+            query_states,
+            key_states,
+            value_states,
+            attention_mask,
+            dropout=0.0 if not self.training else self.attention_dropout,
+            scaling=self.scaling,
+            softcap=getattr(self.config, "attn_logit_softcapping", None),
+            **kwargs,
+        )
+        if self.config._attn_implementation == "flash_attention_2" and self.qk_head_dim != self.v_head_dim:
+            attn_output = attn_output[:, :, :, : self.v_head_dim]
+        attn_output = attn_output.reshape(batch_size, seq_length, -1).contiguous()
+        attn_output = self.o_proj(attn_output)
+        return attn_output, attn_weights
+class Qwen2MLADecoderLayer(Qwen2DecoderLayer):
+    """
+    Qwen2 decoder layer with MLA (Multi-Head Latent Attention) instead of standard attention.
+    This class inherits from Qwen2DecoderLayer and only replaces the self_attn component.
+    """
+    def __init__(self, config: Qwen2MLAConfig, layer_idx: int):
+        super().__init__(config, layer_idx)
+        # Replace the standard Qwen2 attention with MLA attention
+        self.self_attn = MLAAttention(config, layer_idx)
+class Qwen2MLAPreTrainedModel(Qwen2PreTrainedModel):
+    """
+    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
+    models for Qwen2 with MLA attention.
+    """
+    config_class = Qwen2MLAConfig
+    _no_split_modules = ["Qwen2MLADecoderLayer"]
+class Qwen2MLAModel(Qwen2MLAPreTrainedModel, Qwen2Model):
+    """
+    The Qwen2 model with MLA attention layers.
+    This model inherits from both Qwen2MLAPreTrainedModel and Qwen2Model,
+    replacing the standard Qwen2 decoder layers with MLA-enabled ones.
+    """
+    def __init__(self, config: Qwen2MLAConfig):
+        super().__init__(config)
+        # Replace the layers with MLA-enabled decoder layers
+        self.layers = nn.ModuleList(
+            [Qwen2MLADecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
+        )
+class Qwen2MLAForCausalLM(Qwen2MLAPreTrainedModel, Qwen2ForCausalLM):
+    """
+    The Qwen2 model with MLA attention for causal language modeling.
+    This model can be used for text generation tasks, providing the same
+    interface as Qwen2ForCausalLM but with MLA attention mechanism.
+    """
+    def __init__(self, config: Qwen2MLAConfig):
+        super().__init__(config)
+        # Replace the base model with the MLA version
+        self.model = Qwen2MLAModel(config)
+# Export the main classes for external use
+__all__ = [
+    "Qwen2MLAForCausalLM",
+    "Qwen2MLAModel",
+    "Qwen2MLAPreTrainedModel",
+]

checkpoint-15200/rng_state_0.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ad8a35afd8967cbb748405387e44426e43ad127028e826eddc9b67d2ca873c85
+size 15984

checkpoint-15200/rng_state_1.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f338ce80d7c441076bfc8c53b84067a0181f5a14e80c13d5acb8150b659f4d73
+size 15984

checkpoint-15200/rng_state_10.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:4d4c67c1d0ec6b889370f634ff29f584f829acaeaf69196a69304fdba936f9d7
+size 15997

checkpoint-15200/rng_state_11.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:afd0c4e2ac5f61d5f4df5d9d5783018b7937173b4afdbb92816c2668982cb351
+size 15997

checkpoint-15200/rng_state_12.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a9d1c7f561c5512193e8053ecf371e4fef647a8503481f2dad76332ddbe164fd
+size 15997

checkpoint-15200/rng_state_13.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:7604a3cfaf0e3c3f2aa2b3547660615db82205d4158951bb90f69ffc560ed179
+size 15997

checkpoint-15200/rng_state_14.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:4d984186a3bc06445d6c077fcbf64da96d47d3ed704e68f0212045efb4b91dbb
+size 15997

checkpoint-15200/rng_state_15.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:1b3945ba3535361fa13a8460a70ebbd44ece2514a1c9dcb76abfe6f54a775c60
+size 15997

checkpoint-15200/rng_state_2.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c9fbc9fa428939be10b46779f0eb5cd833e0da426b1cbdee77b3a55b6952235b
+size 15984

checkpoint-15200/rng_state_3.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ac55dba0b79d5fa4699d239da2f966d52040d576d31234ac8d4632e6956481bc
+size 15984

checkpoint-15200/rng_state_4.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:af2d0c015100768ffa23faf3b6c2d54ea89eb045603e30e55cd211e06ff34972
+size 15984

checkpoint-15200/rng_state_5.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c60a1b40608e34bc801c8231f97b81c53b5290dfaed1b9cd0ccbeca29574a991
+size 15984

checkpoint-15200/rng_state_6.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3ad6a142a403eb9aafc4a3a9a856bca648fe31fd22d796867baca31fb13656aa
+size 15984

checkpoint-15200/rng_state_7.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:38bc23a138cc800b22881742c0f3f9a71731a9a7111c6058a0077e6274d21773
+size 15984

checkpoint-15200/rng_state_8.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f4b75777b43df70dca87f91deed2e613941188792f244d5eeb06da8ab038bd36
+size 15984

checkpoint-15200/rng_state_9.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a8f9eb93d0ac349ba8b2271c417e5fcc4b70447a424f3f861c3338fbd99a48f4
+size 15984

checkpoint-15200/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,31 @@

+{
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>",
+    "<|object_ref_start|>",
+    "<|object_ref_end|>",
+    "<|box_start|>",
+    "<|box_end|>",
+    "<|quad_start|>",
+    "<|quad_end|>",
+    "<|vision_start|>",
+    "<|vision_end|>",
+    "<|vision_pad|>",
+    "<|image_pad|>",
+    "<|video_pad|>"
+  ],
+  "eos_token": {
+    "content": "<|im_end|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

checkpoint-15200/tokenizer.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:540b7fbf60b80e8293593a86960df91d2263723d69107ffc1afc89a7c08cda12
+size 11422162

checkpoint-15200/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d4555e0e31481452ef30287c4b917c3310e94b6234d89939282be3a2ae6d6bc8
+size 8120

checkpoint-15386/configuration_qwen2_mla.py ADDED Viewed

	@@ -0,0 +1,24 @@

+from transformers.models.qwen2.configuration_qwen2 import Qwen2Config
+class Qwen2MLAConfig(Qwen2Config):
+    def __init__(
+        self,
+        *args,
+        kv_lora_rank=512,
+        q_lora_rank=None,
+        qk_rope_head_dim=64,
+        qk_nope_head_dim=128,
+        v_head_dim=128,
+        qk_latent_layernorm=True,
+        **kwargs
+    ):
+        super().__init__(*args, **kwargs)
+        self.kv_lora_rank = kv_lora_rank
+        self.q_lora_rank = q_lora_rank
+        self.qk_rope_head_dim = qk_rope_head_dim
+        self.qk_nope_head_dim = qk_nope_head_dim
+        self.qk_head_dim = qk_rope_head_dim + qk_nope_head_dim
+        self.v_head_dim = v_head_dim
+        self.qk_latent_layernorm = qk_latent_layernorm

checkpoint-15386/merges.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

checkpoint-15386/model-00001-of-00004.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:0635097b8c9df0afb076290fd5ccc2debb7c4fb641a60cbfc48dc12224c37c0b
+size 4997418264

checkpoint-15386/model-00002-of-00004.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:02cc2dffbf6c47ef6826f4448f681ad603b7ca77b460b92f6d5bed9857090387
+size 4999754984

checkpoint-15386/model-00003-of-00004.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:354d58132fab1e43d5928978ba187b5b7ecb336b8042cfde765fb37aaaea46b3
+size 4998910488

checkpoint-15386/model-00004-of-00004.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:294b8199bc7db26e25ff052f32279d51fa4efb14b5d08e13d2f2d6a843c98ac4
+size 1324572984

checkpoint-15386/rng_state_0.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ad8a35afd8967cbb748405387e44426e43ad127028e826eddc9b67d2ca873c85
+size 15984

checkpoint-15386/rng_state_1.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f338ce80d7c441076bfc8c53b84067a0181f5a14e80c13d5acb8150b659f4d73
+size 15984

checkpoint-15386/rng_state_10.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:4d4c67c1d0ec6b889370f634ff29f584f829acaeaf69196a69304fdba936f9d7
+size 15997

checkpoint-15386/rng_state_11.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:afd0c4e2ac5f61d5f4df5d9d5783018b7937173b4afdbb92816c2668982cb351
+size 15997

checkpoint-15386/rng_state_12.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a9d1c7f561c5512193e8053ecf371e4fef647a8503481f2dad76332ddbe164fd
+size 15997

checkpoint-15386/rng_state_13.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:7604a3cfaf0e3c3f2aa2b3547660615db82205d4158951bb90f69ffc560ed179
+size 15997

checkpoint-15386/rng_state_14.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:4d984186a3bc06445d6c077fcbf64da96d47d3ed704e68f0212045efb4b91dbb
+size 15997

checkpoint-15386/rng_state_15.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:1b3945ba3535361fa13a8460a70ebbd44ece2514a1c9dcb76abfe6f54a775c60
+size 15997

checkpoint-15386/rng_state_2.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c9fbc9fa428939be10b46779f0eb5cd833e0da426b1cbdee77b3a55b6952235b
+size 15984

checkpoint-15386/rng_state_3.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ac55dba0b79d5fa4699d239da2f966d52040d576d31234ac8d4632e6956481bc
+size 15984

checkpoint-15386/rng_state_4.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:af2d0c015100768ffa23faf3b6c2d54ea89eb045603e30e55cd211e06ff34972
+size 15984

checkpoint-15386/rng_state_5.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c60a1b40608e34bc801c8231f97b81c53b5290dfaed1b9cd0ccbeca29574a991
+size 15984

checkpoint-15386/rng_state_6.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3ad6a142a403eb9aafc4a3a9a856bca648fe31fd22d796867baca31fb13656aa
+size 15984

checkpoint-15386/rng_state_7.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:38bc23a138cc800b22881742c0f3f9a71731a9a7111c6058a0077e6274d21773
+size 15984

checkpoint-15386/rng_state_8.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f4b75777b43df70dca87f91deed2e613941188792f244d5eeb06da8ab038bd36
+size 15984

checkpoint-15386/rng_state_9.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a8f9eb93d0ac349ba8b2271c417e5fcc4b70447a424f3f861c3338fbd99a48f4
+size 15984

checkpoint-15386/tokenizer.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:540b7fbf60b80e8293593a86960df91d2263723d69107ffc1afc89a7c08cda12
+size 11422162

checkpoint-15386/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,215 @@

+{
+  "add_bos_token": false,
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "151643": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151644": {
+      "content": "<|im_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151645": {
+      "content": "<|im_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151646": {
+      "content": "<|object_ref_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151647": {
+      "content": "<|object_ref_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151648": {
+      "content": "<|box_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151649": {
+      "content": "<|box_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151650": {
+      "content": "<|quad_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151651": {
+      "content": "<|quad_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151652": {
+      "content": "<|vision_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151653": {
+      "content": "<|vision_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151654": {
+      "content": "<|vision_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151655": {
+      "content": "<|image_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151656": {
+      "content": "<|video_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151657": {
+      "content": "<tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151658": {
+      "content": "</tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151659": {
+      "content": "<|fim_prefix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151660": {
+      "content": "<|fim_middle|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151661": {
+      "content": "<|fim_suffix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151662": {
+      "content": "<|fim_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151663": {
+      "content": "<|repo_name|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151664": {
+      "content": "<|file_sep|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    }
+  },
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>",
+    "<|object_ref_start|>",
+    "<|object_ref_end|>",
+    "<|box_start|>",
+    "<|box_end|>",
+    "<|quad_start|>",
+    "<|quad_end|>",
+    "<|vision_start|>",
+    "<|vision_end|>",
+    "<|vision_pad|>",
+    "<|image_pad|>",
+    "<|video_pad|>"
+  ],
+  "bos_token": null,
+  "chat_template": "{%- if tools %}\n    {{- '<|im_start|>system\\n' }}\n    {%- if messages[0]['role'] == 'system' %}\n        {{- messages[0]['content'] }}\n    {%- else %}\n        {{- 'You are a helpful assistant.' }}\n    {%- endif %}\n    {{- \"\\n\\n# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>\" }}\n    {%- for tool in tools %}\n        {{- \"\\n\" }}\n        {{- tool | tojson }}\n    {%- endfor %}\n    {{- \"\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{\\\"name\\\": <function-name>, \\\"arguments\\\": <args-json-object>}\\n</tool_call><|im_end|>\\n\" }}\n{%- else %}\n    {%- if messages[0]['role'] == 'system' %}\n        {{- '<|im_start|>system\\n' + messages[0]['content'] + '<|im_end|>\\n' }}\n    {%- else %}\n        {{- '<|im_start|>system\\nYou are a helpful assistant.<|im_end|>\\n' }}\n    {%- endif %}\n{%- endif %}\n{%- for message in messages %}\n    {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) or (message.role == \"assistant\" and not message.tool_calls) %}\n        {{- '<|im_start|>' + message.role + '\\n' + message.content + '<|im_end|>' + '\\n' }}\n    {%- elif message.role == \"assistant\" %}\n        {{- '<|im_start|>' + message.role }}\n        {%- if message.content %}\n            {{- '\\n' + message.content }}\n        {%- endif %}\n        {%- for tool_call in message.tool_calls %}\n            {%- if tool_call.function is defined %}\n                {%- set tool_call = tool_call.function %}\n            {%- endif %}\n            {{- '\\n<tool_call>\\n{\"name\": \"' }}\n            {{- tool_call.name }}\n            {{- '\", \"arguments\": ' }}\n            {{- tool_call.arguments | tojson }}\n            {{- '}\\n</tool_call>' }}\n        {%- endfor %}\n        {{- '<|im_end|>\\n' }}\n    {%- elif message.role == \"tool\" %}\n        {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != \"tool\") %}\n            {{- '<|im_start|>user' }}\n        {%- endif %}\n        {{- '\\n<tool_response>\\n' }}\n        {{- message.content }}\n        {{- '\\n</tool_response>' }}\n        {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n            {{- '<|im_end|>\\n' }}\n        {%- endif %}\n    {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n    {{- '<|im_start|>assistant\\n' }}\n{%- endif %}\n",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|im_end|>",
+  "errors": "replace",
+  "extra_special_tokens": {},
+  "max_length": 256,
+  "model_max_length": 32768,
+  "pad_to_multiple_of": null,
+  "pad_token": "<|endoftext|>",
+  "pad_token_type_id": 0,
+  "padding_side": "right",
+  "split_special_tokens": false,
+  "stride": 0,
+  "tokenizer_class": "Qwen2Tokenizer",
+  "truncation_side": "right",
+  "truncation_strategy": "longest_first",
+  "unk_token": null
+}

checkpoint-15386/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d4555e0e31481452ef30287c4b917c3310e94b6234d89939282be3a2ae6d6bc8
+size 8120