OpenMOSE committed on Oct 9, 2025

Commit

e40ebbf

verified ·

1 Parent(s): ebb5416

Upload folder using huggingface_hub

Browse files

Files changed (25) hide show

.ipynb_checkpoints/config-checkpoint.json +55 -0
.ipynb_checkpoints/config_old-checkpoint.json +50 -0
.ipynb_checkpoints/model.safetensors.index-checkpoint.json +687 -0
config.json +55 -0
config_old.json +50 -0
configuration_rwkv07aqwen3.py +238 -0
generation_config.json +6 -0
model-00001-of-00011.safetensors +3 -0
model-00002-of-00011.safetensors +3 -0
model-00003-of-00011.safetensors +3 -0
model-00004-of-00011.safetensors +3 -0
model-00005-of-00011.safetensors +3 -0
model-00006-of-00011.safetensors +3 -0
model-00007-of-00011.safetensors +3 -0
model-00008-of-00011.safetensors +3 -0
model-00009-of-00011.safetensors +3 -0
model-00010-of-00011.safetensors +3 -0
model-00011-of-00011.safetensors +3 -0
model.safetensors.index.json +687 -0
modeling_rwkv07aqwen3.py +1045 -0
special_tokens_map.json +46 -0
tokenization_rwkv07aqwen3.py +4 -0
tokenization_rwkv07aqwen3_fast.py +4 -0
tokenizer.json +0 -0
tokenizer_config.json +204 -0

.ipynb_checkpoints/config-checkpoint.json ADDED Viewed

	@@ -0,0 +1,55 @@

+{
+  "architectures": [
+    "RWKV07AQwen3ForCausalLM"
+  ],
+  "auto_map": {
+      "AutoConfig": "configuration_rwkv07aqwen3.RWKV07AQwen3Config",
+      "AutoModelForCausalLM": "modeling_rwkv07aqwen3.RWKV07AQwen3ForCausalLM"
+  },
+  "description": "Hybrid-RWKV Strategically Interleaved RWKV-Attention",
+  "base_model": "ByteDance-Seed/Seed-OSS-36B-Instruct",
+  "model_revision": "alpha",
+  "transformer_layers":[3,8,14,20,25,30,35,39,43],
+  "rwkv_layers": [0,1,2,4,5,6,7,9,10,11,12,13,15,16,17,18,19,21,22,23,24,26,27,28,29,31,32,33,34,36,37,38,40,41,42],
+  "rwkv_architecture": "hxa07a",
+  "enable_qk_norm": false,
+  "nope_in_transformer": true,
+  "nope_in_rwkv": false,
+  "lora_rank_decay": 320,
+  "lora_rank_iclr":96,
+  "lora_rank_gate":320,
+  "use_rope":true,
+  "attention_bias": false,
+  "attention_out_bias": false,
+  "attention_dropout": 0.0,
+  "bos_token_id": 100257,
+  "classifier_dropout": 0.0,
+  "eos_token_id": 100257,
+  "head_dim": 96,
+  "hidden_act": "silu",
+  "hidden_size": 6144,
+  "id2label": {
+    "0": "LABEL_0"
+  },
+  "initializer_range": 0.006,
+  "intermediate_size": 19648,
+  "label2id": {
+    "LABEL_0": 0
+  },
+  "max_position_embeddings": 98304,
+  "mlp_bias": false,
+  "model_type": "rwkv07aqwen3",
+  "num_attention_heads": 64,
+  "num_hidden_layers": 44,
+  "num_key_value_heads": 8,
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": null,
+  "rope_theta": 8000000,
+  "tie_word_embeddings": false,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.50.3",
+  "use_cache": true,
+  "vocab_size": 100352
+}

.ipynb_checkpoints/config_old-checkpoint.json ADDED Viewed

	@@ -0,0 +1,50 @@

+{
+  "architectures": [
+    "RWKV07AQwen3ForCausalLM"
+  ],
+  "auto_map": {
+      "AutoConfig": "configuration_rwkv07aqwen3.RWKV07AQwen3Config",
+      "AutoModelForCausalLM": "modeling_rwkv07aqwen3.RWKV07AQwen3ForCausalLM"
+  },
+  "description": "Hybrid-RWKV Strategically Interleaved RWKV-Attention",
+  "base_model": "ByteDance-Seed/Seed-OSS-36B-Instruct",
+  "model_revision": "alpha",
+  "transformer_layers":[3,7,11,15,19,23,27,31,35],
+  "rwkv_layers": [0,1,2,4,5,6,8,9,10,12,13,14,16,17,18,20,21,22,24,25,26,28,29,30,32,33,34],
+  "rwkv_architecture": "hxa07a",
+  "enable_qk_norm": true,
+  "nope_in_transformer": true,
+  "nope_in_rwkv": false,
+  "lora_rank_decay": 256,
+  "lora_rank_iclr":96,
+  "lora_rank_gate":256,
+  "use_rope":true,
+  "attention_bias": false,
+  "attention_out_bias": false,
+  "attention_dropout": 0.0,
+  "bos_token_id": 151643,
+  "eos_token_id": 151645,
+  "head_dim": 128,
+  "hidden_act": "silu",
+  "hidden_size": 4096,
+  "initializer_range": 0.02,
+  "intermediate_size": 12288,
+  "max_position_embeddings": 131072,
+  "max_window_layers": 36,
+  "model_type": "rwkv07aqwen3",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 36,
+  "num_key_value_heads": 8,
+  "rms_norm_eps": 1e-06,
+  "rope_scaling": null,
+  "rope_theta": 5000000,
+  "sliding_window": null,
+  "tie_word_embeddings": false,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.51.0",
+  "use_cache": true,
+  "use_sliding_window": false,
+  "vocab_size": 151936
+}

.ipynb_checkpoints/model.safetensors.index-checkpoint.json ADDED Viewed

	@@ -0,0 +1,687 @@

+{
+  "metadata": {
+    "total_size": 42444902400,
+    "format": "safetensors"
+  },
+  "weight_map": {
+    "lm_head.weight": "model-00001-of-00011.safetensors",
+    "model.embed_tokens.weight": "model-00001-of-00011.safetensors",
+    "model.layers.0.input_layernorm.weight": "model-00001-of-00011.safetensors",
+    "model.layers.0.mlp.down_proj.weight": "model-00001-of-00011.safetensors",
+    "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00011.safetensors",
+    "model.layers.0.mlp.up_proj.weight": "model-00001-of-00011.safetensors",
+    "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00011.safetensors",
+    "model.layers.0.self_attn.a0": "model-00001-of-00011.safetensors",
+    "model.layers.0.self_attn.a1": "model-00001-of-00011.safetensors",
+    "model.layers.0.self_attn.a2": "model-00001-of-00011.safetensors",
+    "model.layers.0.self_attn.g1": "model-00001-of-00011.safetensors",
+    "model.layers.0.self_attn.g2": "model-00001-of-00011.safetensors",
+    "model.layers.0.self_attn.key.weight": "model-00001-of-00011.safetensors",
+    "model.layers.0.self_attn.output.weight": "model-00001-of-00011.safetensors",
+    "model.layers.0.self_attn.receptance.weight": "model-00001-of-00011.safetensors",
+    "model.layers.0.self_attn.value.weight": "model-00001-of-00011.safetensors",
+    "model.layers.0.self_attn.w0": "model-00001-of-00011.safetensors",
+    "model.layers.0.self_attn.w1": "model-00001-of-00011.safetensors",
+    "model.layers.0.self_attn.w2": "model-00001-of-00011.safetensors",
+    "model.layers.1.input_layernorm.weight": "model-00001-of-00011.safetensors",
+    "model.layers.1.mlp.down_proj.weight": "model-00001-of-00011.safetensors",
+    "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00011.safetensors",
+    "model.layers.1.mlp.up_proj.weight": "model-00001-of-00011.safetensors",
+    "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00011.safetensors",
+    "model.layers.1.self_attn.a0": "model-00001-of-00011.safetensors",
+    "model.layers.1.self_attn.a1": "model-00001-of-00011.safetensors",
+    "model.layers.1.self_attn.a2": "model-00001-of-00011.safetensors",
+    "model.layers.1.self_attn.g1": "model-00001-of-00011.safetensors",
+    "model.layers.1.self_attn.g2": "model-00001-of-00011.safetensors",
+    "model.layers.1.self_attn.key.weight": "model-00001-of-00011.safetensors",
+    "model.layers.1.self_attn.output.weight": "model-00001-of-00011.safetensors",
+    "model.layers.1.self_attn.receptance.weight": "model-00001-of-00011.safetensors",
+    "model.layers.1.self_attn.value.weight": "model-00001-of-00011.safetensors",
+    "model.layers.1.self_attn.w0": "model-00001-of-00011.safetensors",
+    "model.layers.1.self_attn.w1": "model-00001-of-00011.safetensors",
+    "model.layers.1.self_attn.w2": "model-00001-of-00011.safetensors",
+    "model.layers.2.input_layernorm.weight": "model-00001-of-00011.safetensors",
+    "model.layers.2.mlp.down_proj.weight": "model-00002-of-00011.safetensors",
+    "model.layers.2.mlp.gate_proj.weight": "model-00002-of-00011.safetensors",
+    "model.layers.2.mlp.up_proj.weight": "model-00002-of-00011.safetensors",
+    "model.layers.2.post_attention_layernorm.weight": "model-00002-of-00011.safetensors",
+    "model.layers.2.self_attn.a0": "model-00002-of-00011.safetensors",
+    "model.layers.2.self_attn.a1": "model-00002-of-00011.safetensors",
+    "model.layers.2.self_attn.a2": "model-00002-of-00011.safetensors",
+    "model.layers.2.self_attn.g1": "model-00002-of-00011.safetensors",
+    "model.layers.2.self_attn.g2": "model-00002-of-00011.safetensors",
+    "model.layers.2.self_attn.key.weight": "model-00002-of-00011.safetensors",
+    "model.layers.2.self_attn.output.weight": "model-00002-of-00011.safetensors",
+    "model.layers.2.self_attn.receptance.weight": "model-00002-of-00011.safetensors",
+    "model.layers.2.self_attn.value.weight": "model-00002-of-00011.safetensors",
+    "model.layers.2.self_attn.w0": "model-00002-of-00011.safetensors",
+    "model.layers.2.self_attn.w1": "model-00002-of-00011.safetensors",
+    "model.layers.2.self_attn.w2": "model-00002-of-00011.safetensors",
+    "model.layers.3.input_layernorm.weight": "model-00002-of-00011.safetensors",
+    "model.layers.3.mlp.down_proj.weight": "model-00002-of-00011.safetensors",
+    "model.layers.3.mlp.gate_proj.weight": "model-00002-of-00011.safetensors",
+    "model.layers.3.mlp.up_proj.weight": "model-00002-of-00011.safetensors",
+    "model.layers.3.post_attention_layernorm.weight": "model-00002-of-00011.safetensors",
+    "model.layers.3.self_attn.k_proj.weight": "model-00002-of-00011.safetensors",
+    "model.layers.3.self_attn.o_proj.weight": "model-00002-of-00011.safetensors",
+    "model.layers.3.self_attn.q_proj.weight": "model-00002-of-00011.safetensors",
+    "model.layers.3.self_attn.v_proj.weight": "model-00002-of-00011.safetensors",
+    "model.layers.4.input_layernorm.weight": "model-00002-of-00011.safetensors",
+    "model.layers.4.mlp.down_proj.weight": "model-00002-of-00011.safetensors",
+    "model.layers.4.mlp.gate_proj.weight": "model-00002-of-00011.safetensors",
+    "model.layers.4.mlp.up_proj.weight": "model-00002-of-00011.safetensors",
+    "model.layers.4.post_attention_layernorm.weight": "model-00002-of-00011.safetensors",
+    "model.layers.4.self_attn.a0": "model-00002-of-00011.safetensors",
+    "model.layers.4.self_attn.a1": "model-00002-of-00011.safetensors",
+    "model.layers.4.self_attn.a2": "model-00002-of-00011.safetensors",
+    "model.layers.4.self_attn.g1": "model-00002-of-00011.safetensors",
+    "model.layers.4.self_attn.g2": "model-00002-of-00011.safetensors",
+    "model.layers.4.self_attn.key.weight": "model-00002-of-00011.safetensors",
+    "model.layers.4.self_attn.output.weight": "model-00002-of-00011.safetensors",
+    "model.layers.4.self_attn.receptance.weight": "model-00002-of-00011.safetensors",
+    "model.layers.4.self_attn.value.weight": "model-00002-of-00011.safetensors",
+    "model.layers.4.self_attn.w0": "model-00002-of-00011.safetensors",
+    "model.layers.4.self_attn.w1": "model-00002-of-00011.safetensors",
+    "model.layers.4.self_attn.w2": "model-00002-of-00011.safetensors",
+    "model.layers.5.input_layernorm.weight": "model-00002-of-00011.safetensors",
+    "model.layers.5.mlp.down_proj.weight": "model-00002-of-00011.safetensors",
+    "model.layers.5.mlp.gate_proj.weight": "model-00002-of-00011.safetensors",
+    "model.layers.5.mlp.up_proj.weight": "model-00002-of-00011.safetensors",
+    "model.layers.5.post_attention_layernorm.weight": "model-00002-of-00011.safetensors",
+    "model.layers.5.self_attn.a0": "model-00002-of-00011.safetensors",
+    "model.layers.5.self_attn.a1": "model-00002-of-00011.safetensors",
+    "model.layers.5.self_attn.a2": "model-00002-of-00011.safetensors",
+    "model.layers.5.self_attn.g1": "model-00002-of-00011.safetensors",
+    "model.layers.5.self_attn.g2": "model-00002-of-00011.safetensors",
+    "model.layers.5.self_attn.key.weight": "model-00002-of-00011.safetensors",
+    "model.layers.5.self_attn.output.weight": "model-00002-of-00011.safetensors",
+    "model.layers.5.self_attn.receptance.weight": "model-00002-of-00011.safetensors",
+    "model.layers.5.self_attn.value.weight": "model-00002-of-00011.safetensors",
+    "model.layers.5.self_attn.w0": "model-00002-of-00011.safetensors",
+    "model.layers.5.self_attn.w1": "model-00002-of-00011.safetensors",
+    "model.layers.5.self_attn.w2": "model-00002-of-00011.safetensors",
+    "model.layers.6.input_layernorm.weight": "model-00002-of-00011.safetensors",
+    "model.layers.6.mlp.down_proj.weight": "model-00002-of-00011.safetensors",
+    "model.layers.6.mlp.gate_proj.weight": "model-00002-of-00011.safetensors",
+    "model.layers.6.mlp.up_proj.weight": "model-00003-of-00011.safetensors",
+    "model.layers.6.post_attention_layernorm.weight": "model-00003-of-00011.safetensors",
+    "model.layers.6.self_attn.a0": "model-00003-of-00011.safetensors",
+    "model.layers.6.self_attn.a1": "model-00003-of-00011.safetensors",
+    "model.layers.6.self_attn.a2": "model-00003-of-00011.safetensors",
+    "model.layers.6.self_attn.g1": "model-00003-of-00011.safetensors",
+    "model.layers.6.self_attn.g2": "model-00003-of-00011.safetensors",
+    "model.layers.6.self_attn.key.weight": "model-00003-of-00011.safetensors",
+    "model.layers.6.self_attn.output.weight": "model-00003-of-00011.safetensors",
+    "model.layers.6.self_attn.receptance.weight": "model-00003-of-00011.safetensors",
+    "model.layers.6.self_attn.value.weight": "model-00003-of-00011.safetensors",
+    "model.layers.6.self_attn.w0": "model-00003-of-00011.safetensors",
+    "model.layers.6.self_attn.w1": "model-00003-of-00011.safetensors",
+    "model.layers.6.self_attn.w2": "model-00003-of-00011.safetensors",
+    "model.layers.7.input_layernorm.weight": "model-00003-of-00011.safetensors",
+    "model.layers.7.mlp.down_proj.weight": "model-00003-of-00011.safetensors",
+    "model.layers.7.mlp.gate_proj.weight": "model-00003-of-00011.safetensors",
+    "model.layers.7.mlp.up_proj.weight": "model-00003-of-00011.safetensors",
+    "model.layers.7.post_attention_layernorm.weight": "model-00003-of-00011.safetensors",
+    "model.layers.7.self_attn.a0": "model-00003-of-00011.safetensors",
+    "model.layers.7.self_attn.a1": "model-00003-of-00011.safetensors",
+    "model.layers.7.self_attn.a2": "model-00003-of-00011.safetensors",
+    "model.layers.7.self_attn.g1": "model-00003-of-00011.safetensors",
+    "model.layers.7.self_attn.g2": "model-00003-of-00011.safetensors",
+    "model.layers.7.self_attn.key.weight": "model-00003-of-00011.safetensors",
+    "model.layers.7.self_attn.output.weight": "model-00003-of-00011.safetensors",
+    "model.layers.7.self_attn.receptance.weight": "model-00003-of-00011.safetensors",
+    "model.layers.7.self_attn.value.weight": "model-00003-of-00011.safetensors",
+    "model.layers.7.self_attn.w0": "model-00003-of-00011.safetensors",
+    "model.layers.7.self_attn.w1": "model-00003-of-00011.safetensors",
+    "model.layers.7.self_attn.w2": "model-00003-of-00011.safetensors",
+    "model.layers.8.input_layernorm.weight": "model-00003-of-00011.safetensors",
+    "model.layers.8.mlp.down_proj.weight": "model-00003-of-00011.safetensors",
+    "model.layers.8.mlp.gate_proj.weight": "model-00003-of-00011.safetensors",
+    "model.layers.8.mlp.up_proj.weight": "model-00003-of-00011.safetensors",
+    "model.layers.8.post_attention_layernorm.weight": "model-00003-of-00011.safetensors",
+    "model.layers.8.self_attn.k_proj.weight": "model-00003-of-00011.safetensors",
+    "model.layers.8.self_attn.o_proj.weight": "model-00003-of-00011.safetensors",
+    "model.layers.8.self_attn.q_proj.weight": "model-00003-of-00011.safetensors",
+    "model.layers.8.self_attn.v_proj.weight": "model-00003-of-00011.safetensors",
+    "model.layers.9.input_layernorm.weight": "model-00003-of-00011.safetensors",
+    "model.layers.9.mlp.down_proj.weight": "model-00003-of-00011.safetensors",
+    "model.layers.9.mlp.gate_proj.weight": "model-00003-of-00011.safetensors",
+    "model.layers.9.mlp.up_proj.weight": "model-00003-of-00011.safetensors",
+    "model.layers.9.post_attention_layernorm.weight": "model-00003-of-00011.safetensors",
+    "model.layers.9.self_attn.a0": "model-00003-of-00011.safetensors",
+    "model.layers.9.self_attn.a1": "model-00003-of-00011.safetensors",
+    "model.layers.9.self_attn.a2": "model-00003-of-00011.safetensors",
+    "model.layers.9.self_attn.g1": "model-00003-of-00011.safetensors",
+    "model.layers.9.self_attn.g2": "model-00003-of-00011.safetensors",
+    "model.layers.9.self_attn.key.weight": "model-00003-of-00011.safetensors",
+    "model.layers.9.self_attn.output.weight": "model-00003-of-00011.safetensors",
+    "model.layers.9.self_attn.receptance.weight": "model-00003-of-00011.safetensors",
+    "model.layers.9.self_attn.value.weight": "model-00003-of-00011.safetensors",
+    "model.layers.9.self_attn.w0": "model-00003-of-00011.safetensors",
+    "model.layers.9.self_attn.w1": "model-00003-of-00011.safetensors",
+    "model.layers.9.self_attn.w2": "model-00003-of-00011.safetensors",
+    "model.layers.10.input_layernorm.weight": "model-00003-of-00011.safetensors",
+    "model.layers.10.mlp.down_proj.weight": "model-00003-of-00011.safetensors",
+    "model.layers.10.mlp.gate_proj.weight": "model-00003-of-00011.safetensors",
+    "model.layers.10.mlp.up_proj.weight": "model-00003-of-00011.safetensors",
+    "model.layers.10.post_attention_layernorm.weight": "model-00003-of-00011.safetensors",
+    "model.layers.10.self_attn.a0": "model-00003-of-00011.safetensors",
+    "model.layers.10.self_attn.a1": "model-00003-of-00011.safetensors",
+    "model.layers.10.self_attn.a2": "model-00003-of-00011.safetensors",
+    "model.layers.10.self_attn.g1": "model-00003-of-00011.safetensors",
+    "model.layers.10.self_attn.g2": "model-00003-of-00011.safetensors",
+    "model.layers.10.self_attn.key.weight": "model-00003-of-00011.safetensors",
+    "model.layers.10.self_attn.output.weight": "model-00003-of-00011.safetensors",
+    "model.layers.10.self_attn.receptance.weight": "model-00003-of-00011.safetensors",
+    "model.layers.10.self_attn.value.weight": "model-00003-of-00011.safetensors",
+    "model.layers.10.self_attn.w0": "model-00003-of-00011.safetensors",
+    "model.layers.10.self_attn.w1": "model-00003-of-00011.safetensors",
+    "model.layers.10.self_attn.w2": "model-00003-of-00011.safetensors",
+    "model.layers.11.input_layernorm.weight": "model-00003-of-00011.safetensors",
+    "model.layers.11.mlp.down_proj.weight": "model-00004-of-00011.safetensors",
+    "model.layers.11.mlp.gate_proj.weight": "model-00004-of-00011.safetensors",
+    "model.layers.11.mlp.up_proj.weight": "model-00004-of-00011.safetensors",
+    "model.layers.11.post_attention_layernorm.weight": "model-00004-of-00011.safetensors",
+    "model.layers.11.self_attn.a0": "model-00004-of-00011.safetensors",
+    "model.layers.11.self_attn.a1": "model-00004-of-00011.safetensors",
+    "model.layers.11.self_attn.a2": "model-00004-of-00011.safetensors",
+    "model.layers.11.self_attn.g1": "model-00004-of-00011.safetensors",
+    "model.layers.11.self_attn.g2": "model-00004-of-00011.safetensors",
+    "model.layers.11.self_attn.key.weight": "model-00004-of-00011.safetensors",
+    "model.layers.11.self_attn.output.weight": "model-00004-of-00011.safetensors",
+    "model.layers.11.self_attn.receptance.weight": "model-00004-of-00011.safetensors",
+    "model.layers.11.self_attn.value.weight": "model-00004-of-00011.safetensors",
+    "model.layers.11.self_attn.w0": "model-00004-of-00011.safetensors",
+    "model.layers.11.self_attn.w1": "model-00004-of-00011.safetensors",
+    "model.layers.11.self_attn.w2": "model-00004-of-00011.safetensors",
+    "model.layers.12.input_layernorm.weight": "model-00004-of-00011.safetensors",
+    "model.layers.12.mlp.down_proj.weight": "model-00004-of-00011.safetensors",
+    "model.layers.12.mlp.gate_proj.weight": "model-00004-of-00011.safetensors",
+    "model.layers.12.mlp.up_proj.weight": "model-00004-of-00011.safetensors",
+    "model.layers.12.post_attention_layernorm.weight": "model-00004-of-00011.safetensors",
+    "model.layers.12.self_attn.a0": "model-00004-of-00011.safetensors",
+    "model.layers.12.self_attn.a1": "model-00004-of-00011.safetensors",
+    "model.layers.12.self_attn.a2": "model-00004-of-00011.safetensors",
+    "model.layers.12.self_attn.g1": "model-00004-of-00011.safetensors",
+    "model.layers.12.self_attn.g2": "model-00004-of-00011.safetensors",
+    "model.layers.12.self_attn.key.weight": "model-00004-of-00011.safetensors",
+    "model.layers.12.self_attn.output.weight": "model-00004-of-00011.safetensors",
+    "model.layers.12.self_attn.receptance.weight": "model-00004-of-00011.safetensors",
+    "model.layers.12.self_attn.value.weight": "model-00004-of-00011.safetensors",
+    "model.layers.12.self_attn.w0": "model-00004-of-00011.safetensors",
+    "model.layers.12.self_attn.w1": "model-00004-of-00011.safetensors",
+    "model.layers.12.self_attn.w2": "model-00004-of-00011.safetensors",
+    "model.layers.13.input_layernorm.weight": "model-00004-of-00011.safetensors",
+    "model.layers.13.mlp.down_proj.weight": "model-00004-of-00011.safetensors",
+    "model.layers.13.mlp.gate_proj.weight": "model-00004-of-00011.safetensors",
+    "model.layers.13.mlp.up_proj.weight": "model-00004-of-00011.safetensors",
+    "model.layers.13.post_attention_layernorm.weight": "model-00004-of-00011.safetensors",
+    "model.layers.13.self_attn.a0": "model-00004-of-00011.safetensors",
+    "model.layers.13.self_attn.a1": "model-00004-of-00011.safetensors",
+    "model.layers.13.self_attn.a2": "model-00004-of-00011.safetensors",
+    "model.layers.13.self_attn.g1": "model-00004-of-00011.safetensors",
+    "model.layers.13.self_attn.g2": "model-00004-of-00011.safetensors",
+    "model.layers.13.self_attn.key.weight": "model-00004-of-00011.safetensors",
+    "model.layers.13.self_attn.output.weight": "model-00004-of-00011.safetensors",
+    "model.layers.13.self_attn.receptance.weight": "model-00004-of-00011.safetensors",
+    "model.layers.13.self_attn.value.weight": "model-00004-of-00011.safetensors",
+    "model.layers.13.self_attn.w0": "model-00004-of-00011.safetensors",
+    "model.layers.13.self_attn.w1": "model-00004-of-00011.safetensors",
+    "model.layers.13.self_attn.w2": "model-00004-of-00011.safetensors",
+    "model.layers.14.input_layernorm.weight": "model-00004-of-00011.safetensors",
+    "model.layers.14.mlp.down_proj.weight": "model-00004-of-00011.safetensors",
+    "model.layers.14.mlp.gate_proj.weight": "model-00004-of-00011.safetensors",
+    "model.layers.14.mlp.up_proj.weight": "model-00004-of-00011.safetensors",
+    "model.layers.14.post_attention_layernorm.weight": "model-00004-of-00011.safetensors",
+    "model.layers.14.self_attn.k_proj.weight": "model-00004-of-00011.safetensors",
+    "model.layers.14.self_attn.o_proj.weight": "model-00004-of-00011.safetensors",
+    "model.layers.14.self_attn.q_proj.weight": "model-00004-of-00011.safetensors",
+    "model.layers.14.self_attn.v_proj.weight": "model-00004-of-00011.safetensors",
+    "model.layers.15.input_layernorm.weight": "model-00004-of-00011.safetensors",
+    "model.layers.15.mlp.down_proj.weight": "model-00004-of-00011.safetensors",
+    "model.layers.15.mlp.gate_proj.weight": "model-00004-of-00011.safetensors",
+    "model.layers.15.mlp.up_proj.weight": "model-00005-of-00011.safetensors",
+    "model.layers.15.post_attention_layernorm.weight": "model-00005-of-00011.safetensors",
+    "model.layers.15.self_attn.a0": "model-00005-of-00011.safetensors",
+    "model.layers.15.self_attn.a1": "model-00005-of-00011.safetensors",
+    "model.layers.15.self_attn.a2": "model-00005-of-00011.safetensors",
+    "model.layers.15.self_attn.g1": "model-00005-of-00011.safetensors",
+    "model.layers.15.self_attn.g2": "model-00005-of-00011.safetensors",
+    "model.layers.15.self_attn.key.weight": "model-00005-of-00011.safetensors",
+    "model.layers.15.self_attn.output.weight": "model-00005-of-00011.safetensors",
+    "model.layers.15.self_attn.receptance.weight": "model-00005-of-00011.safetensors",
+    "model.layers.15.self_attn.value.weight": "model-00005-of-00011.safetensors",
+    "model.layers.15.self_attn.w0": "model-00005-of-00011.safetensors",
+    "model.layers.15.self_attn.w1": "model-00005-of-00011.safetensors",
+    "model.layers.15.self_attn.w2": "model-00005-of-00011.safetensors",
+    "model.layers.16.input_layernorm.weight": "model-00005-of-00011.safetensors",
+    "model.layers.16.mlp.down_proj.weight": "model-00005-of-00011.safetensors",
+    "model.layers.16.mlp.gate_proj.weight": "model-00005-of-00011.safetensors",
+    "model.layers.16.mlp.up_proj.weight": "model-00005-of-00011.safetensors",
+    "model.layers.16.post_attention_layernorm.weight": "model-00005-of-00011.safetensors",
+    "model.layers.16.self_attn.a0": "model-00005-of-00011.safetensors",
+    "model.layers.16.self_attn.a1": "model-00005-of-00011.safetensors",
+    "model.layers.16.self_attn.a2": "model-00005-of-00011.safetensors",
+    "model.layers.16.self_attn.g1": "model-00005-of-00011.safetensors",
+    "model.layers.16.self_attn.g2": "model-00005-of-00011.safetensors",
+    "model.layers.16.self_attn.key.weight": "model-00005-of-00011.safetensors",
+    "model.layers.16.self_attn.output.weight": "model-00005-of-00011.safetensors",
+    "model.layers.16.self_attn.receptance.weight": "model-00005-of-00011.safetensors",
+    "model.layers.16.self_attn.value.weight": "model-00005-of-00011.safetensors",
+    "model.layers.16.self_attn.w0": "model-00005-of-00011.safetensors",
+    "model.layers.16.self_attn.w1": "model-00005-of-00011.safetensors",
+    "model.layers.16.self_attn.w2": "model-00005-of-00011.safetensors",
+    "model.layers.17.input_layernorm.weight": "model-00005-of-00011.safetensors",
+    "model.layers.17.mlp.down_proj.weight": "model-00005-of-00011.safetensors",
+    "model.layers.17.mlp.gate_proj.weight": "model-00005-of-00011.safetensors",
+    "model.layers.17.mlp.up_proj.weight": "model-00005-of-00011.safetensors",
+    "model.layers.17.post_attention_layernorm.weight": "model-00005-of-00011.safetensors",
+    "model.layers.17.self_attn.a0": "model-00005-of-00011.safetensors",
+    "model.layers.17.self_attn.a1": "model-00005-of-00011.safetensors",
+    "model.layers.17.self_attn.a2": "model-00005-of-00011.safetensors",
+    "model.layers.17.self_attn.g1": "model-00005-of-00011.safetensors",
+    "model.layers.17.self_attn.g2": "model-00005-of-00011.safetensors",
+    "model.layers.17.self_attn.key.weight": "model-00005-of-00011.safetensors",
+    "model.layers.17.self_attn.output.weight": "model-00005-of-00011.safetensors",
+    "model.layers.17.self_attn.receptance.weight": "model-00005-of-00011.safetensors",
+    "model.layers.17.self_attn.value.weight": "model-00005-of-00011.safetensors",
+    "model.layers.17.self_attn.w0": "model-00005-of-00011.safetensors",
+    "model.layers.17.self_attn.w1": "model-00005-of-00011.safetensors",
+    "model.layers.17.self_attn.w2": "model-00005-of-00011.safetensors",
+    "model.layers.18.input_layernorm.weight": "model-00005-of-00011.safetensors",
+    "model.layers.18.mlp.down_proj.weight": "model-00005-of-00011.safetensors",
+    "model.layers.18.mlp.gate_proj.weight": "model-00005-of-00011.safetensors",
+    "model.layers.18.mlp.up_proj.weight": "model-00005-of-00011.safetensors",
+    "model.layers.18.post_attention_layernorm.weight": "model-00005-of-00011.safetensors",
+    "model.layers.18.self_attn.a0": "model-00005-of-00011.safetensors",
+    "model.layers.18.self_attn.a1": "model-00005-of-00011.safetensors",
+    "model.layers.18.self_attn.a2": "model-00005-of-00011.safetensors",
+    "model.layers.18.self_attn.g1": "model-00005-of-00011.safetensors",
+    "model.layers.18.self_attn.g2": "model-00005-of-00011.safetensors",
+    "model.layers.18.self_attn.key.weight": "model-00005-of-00011.safetensors",
+    "model.layers.18.self_attn.output.weight": "model-00005-of-00011.safetensors",
+    "model.layers.18.self_attn.receptance.weight": "model-00005-of-00011.safetensors",
+    "model.layers.18.self_attn.value.weight": "model-00005-of-00011.safetensors",
+    "model.layers.18.self_attn.w0": "model-00005-of-00011.safetensors",
+    "model.layers.18.self_attn.w1": "model-00005-of-00011.safetensors",
+    "model.layers.18.self_attn.w2": "model-00005-of-00011.safetensors",
+    "model.layers.19.input_layernorm.weight": "model-00005-of-00011.safetensors",
+    "model.layers.19.mlp.down_proj.weight": "model-00005-of-00011.safetensors",
+    "model.layers.19.mlp.gate_proj.weight": "model-00005-of-00011.safetensors",
+    "model.layers.19.mlp.up_proj.weight": "model-00005-of-00011.safetensors",
+    "model.layers.19.post_attention_layernorm.weight": "model-00005-of-00011.safetensors",
+    "model.layers.19.self_attn.a0": "model-00005-of-00011.safetensors",
+    "model.layers.19.self_attn.a1": "model-00005-of-00011.safetensors",
+    "model.layers.19.self_attn.a2": "model-00005-of-00011.safetensors",
+    "model.layers.19.self_attn.g1": "model-00005-of-00011.safetensors",
+    "model.layers.19.self_attn.g2": "model-00005-of-00011.safetensors",
+    "model.layers.19.self_attn.key.weight": "model-00005-of-00011.safetensors",
+    "model.layers.19.self_attn.output.weight": "model-00005-of-00011.safetensors",
+    "model.layers.19.self_attn.receptance.weight": "model-00005-of-00011.safetensors",
+    "model.layers.19.self_attn.value.weight": "model-00005-of-00011.safetensors",
+    "model.layers.19.self_attn.w0": "model-00005-of-00011.safetensors",
+    "model.layers.19.self_attn.w1": "model-00005-of-00011.safetensors",
+    "model.layers.19.self_attn.w2": "model-00005-of-00011.safetensors",
+    "model.layers.20.input_layernorm.weight": "model-00005-of-00011.safetensors",
+    "model.layers.20.mlp.down_proj.weight": "model-00006-of-00011.safetensors",
+    "model.layers.20.mlp.gate_proj.weight": "model-00006-of-00011.safetensors",
+    "model.layers.20.mlp.up_proj.weight": "model-00006-of-00011.safetensors",
+    "model.layers.20.post_attention_layernorm.weight": "model-00006-of-00011.safetensors",
+    "model.layers.20.self_attn.k_proj.weight": "model-00006-of-00011.safetensors",
+    "model.layers.20.self_attn.o_proj.weight": "model-00006-of-00011.safetensors",
+    "model.layers.20.self_attn.q_proj.weight": "model-00006-of-00011.safetensors",
+    "model.layers.20.self_attn.v_proj.weight": "model-00006-of-00011.safetensors",
+    "model.layers.21.input_layernorm.weight": "model-00006-of-00011.safetensors",
+    "model.layers.21.mlp.down_proj.weight": "model-00006-of-00011.safetensors",
+    "model.layers.21.mlp.gate_proj.weight": "model-00006-of-00011.safetensors",
+    "model.layers.21.mlp.up_proj.weight": "model-00006-of-00011.safetensors",
+    "model.layers.21.post_attention_layernorm.weight": "model-00006-of-00011.safetensors",
+    "model.layers.21.self_attn.a0": "model-00006-of-00011.safetensors",
+    "model.layers.21.self_attn.a1": "model-00006-of-00011.safetensors",
+    "model.layers.21.self_attn.a2": "model-00006-of-00011.safetensors",
+    "model.layers.21.self_attn.g1": "model-00006-of-00011.safetensors",
+    "model.layers.21.self_attn.g2": "model-00006-of-00011.safetensors",
+    "model.layers.21.self_attn.key.weight": "model-00006-of-00011.safetensors",
+    "model.layers.21.self_attn.output.weight": "model-00006-of-00011.safetensors",
+    "model.layers.21.self_attn.receptance.weight": "model-00006-of-00011.safetensors",
+    "model.layers.21.self_attn.value.weight": "model-00006-of-00011.safetensors",
+    "model.layers.21.self_attn.w0": "model-00006-of-00011.safetensors",
+    "model.layers.21.self_attn.w1": "model-00006-of-00011.safetensors",
+    "model.layers.21.self_attn.w2": "model-00006-of-00011.safetensors",
+    "model.layers.22.input_layernorm.weight": "model-00006-of-00011.safetensors",
+    "model.layers.22.mlp.down_proj.weight": "model-00006-of-00011.safetensors",
+    "model.layers.22.mlp.gate_proj.weight": "model-00006-of-00011.safetensors",
+    "model.layers.22.mlp.up_proj.weight": "model-00006-of-00011.safetensors",
+    "model.layers.22.post_attention_layernorm.weight": "model-00006-of-00011.safetensors",
+    "model.layers.22.self_attn.a0": "model-00006-of-00011.safetensors",
+    "model.layers.22.self_attn.a1": "model-00006-of-00011.safetensors",
+    "model.layers.22.self_attn.a2": "model-00006-of-00011.safetensors",
+    "model.layers.22.self_attn.g1": "model-00006-of-00011.safetensors",
+    "model.layers.22.self_attn.g2": "model-00006-of-00011.safetensors",
+    "model.layers.22.self_attn.key.weight": "model-00006-of-00011.safetensors",
+    "model.layers.22.self_attn.output.weight": "model-00006-of-00011.safetensors",
+    "model.layers.22.self_attn.receptance.weight": "model-00006-of-00011.safetensors",
+    "model.layers.22.self_attn.value.weight": "model-00006-of-00011.safetensors",
+    "model.layers.22.self_attn.w0": "model-00006-of-00011.safetensors",
+    "model.layers.22.self_attn.w1": "model-00006-of-00011.safetensors",
+    "model.layers.22.self_attn.w2": "model-00006-of-00011.safetensors",
+    "model.layers.23.input_layernorm.weight": "model-00006-of-00011.safetensors",
+    "model.layers.23.mlp.down_proj.weight": "model-00006-of-00011.safetensors",
+    "model.layers.23.mlp.gate_proj.weight": "model-00006-of-00011.safetensors",
+    "model.layers.23.mlp.up_proj.weight": "model-00006-of-00011.safetensors",
+    "model.layers.23.post_attention_layernorm.weight": "model-00006-of-00011.safetensors",
+    "model.layers.23.self_attn.a0": "model-00006-of-00011.safetensors",
+    "model.layers.23.self_attn.a1": "model-00006-of-00011.safetensors",
+    "model.layers.23.self_attn.a2": "model-00006-of-00011.safetensors",
+    "model.layers.23.self_attn.g1": "model-00006-of-00011.safetensors",
+    "model.layers.23.self_attn.g2": "model-00006-of-00011.safetensors",
+    "model.layers.23.self_attn.key.weight": "model-00006-of-00011.safetensors",
+    "model.layers.23.self_attn.output.weight": "model-00006-of-00011.safetensors",
+    "model.layers.23.self_attn.receptance.weight": "model-00006-of-00011.safetensors",
+    "model.layers.23.self_attn.value.weight": "model-00006-of-00011.safetensors",
+    "model.layers.23.self_attn.w0": "model-00006-of-00011.safetensors",
+    "model.layers.23.self_attn.w1": "model-00006-of-00011.safetensors",
+    "model.layers.23.self_attn.w2": "model-00006-of-00011.safetensors",
+    "model.layers.24.input_layernorm.weight": "model-00006-of-00011.safetensors",
+    "model.layers.24.mlp.down_proj.weight": "model-00006-of-00011.safetensors",
+    "model.layers.24.mlp.gate_proj.weight": "model-00006-of-00011.safetensors",
+    "model.layers.24.mlp.up_proj.weight": "model-00007-of-00011.safetensors",
+    "model.layers.24.post_attention_layernorm.weight": "model-00007-of-00011.safetensors",
+    "model.layers.24.self_attn.a0": "model-00007-of-00011.safetensors",
+    "model.layers.24.self_attn.a1": "model-00007-of-00011.safetensors",
+    "model.layers.24.self_attn.a2": "model-00007-of-00011.safetensors",
+    "model.layers.24.self_attn.g1": "model-00007-of-00011.safetensors",
+    "model.layers.24.self_attn.g2": "model-00007-of-00011.safetensors",
+    "model.layers.24.self_attn.key.weight": "model-00007-of-00011.safetensors",
+    "model.layers.24.self_attn.output.weight": "model-00007-of-00011.safetensors",
+    "model.layers.24.self_attn.receptance.weight": "model-00007-of-00011.safetensors",
+    "model.layers.24.self_attn.value.weight": "model-00007-of-00011.safetensors",
+    "model.layers.24.self_attn.w0": "model-00007-of-00011.safetensors",
+    "model.layers.24.self_attn.w1": "model-00007-of-00011.safetensors",
+    "model.layers.24.self_attn.w2": "model-00007-of-00011.safetensors",
+    "model.layers.25.input_layernorm.weight": "model-00007-of-00011.safetensors",
+    "model.layers.25.mlp.down_proj.weight": "model-00007-of-00011.safetensors",
+    "model.layers.25.mlp.gate_proj.weight": "model-00007-of-00011.safetensors",
+    "model.layers.25.mlp.up_proj.weight": "model-00007-of-00011.safetensors",
+    "model.layers.25.post_attention_layernorm.weight": "model-00007-of-00011.safetensors",
+    "model.layers.25.self_attn.k_proj.weight": "model-00007-of-00011.safetensors",
+    "model.layers.25.self_attn.o_proj.weight": "model-00007-of-00011.safetensors",
+    "model.layers.25.self_attn.q_proj.weight": "model-00007-of-00011.safetensors",
+    "model.layers.25.self_attn.v_proj.weight": "model-00007-of-00011.safetensors",
+    "model.layers.26.input_layernorm.weight": "model-00007-of-00011.safetensors",
+    "model.layers.26.mlp.down_proj.weight": "model-00007-of-00011.safetensors",
+    "model.layers.26.mlp.gate_proj.weight": "model-00007-of-00011.safetensors",
+    "model.layers.26.mlp.up_proj.weight": "model-00007-of-00011.safetensors",
+    "model.layers.26.post_attention_layernorm.weight": "model-00007-of-00011.safetensors",
+    "model.layers.26.self_attn.a0": "model-00007-of-00011.safetensors",
+    "model.layers.26.self_attn.a1": "model-00007-of-00011.safetensors",
+    "model.layers.26.self_attn.a2": "model-00007-of-00011.safetensors",
+    "model.layers.26.self_attn.g1": "model-00007-of-00011.safetensors",
+    "model.layers.26.self_attn.g2": "model-00007-of-00011.safetensors",
+    "model.layers.26.self_attn.key.weight": "model-00007-of-00011.safetensors",
+    "model.layers.26.self_attn.output.weight": "model-00007-of-00011.safetensors",
+    "model.layers.26.self_attn.receptance.weight": "model-00007-of-00011.safetensors",
+    "model.layers.26.self_attn.value.weight": "model-00007-of-00011.safetensors",
+    "model.layers.26.self_attn.w0": "model-00007-of-00011.safetensors",
+    "model.layers.26.self_attn.w1": "model-00007-of-00011.safetensors",
+    "model.layers.26.self_attn.w2": "model-00007-of-00011.safetensors",
+    "model.layers.27.input_layernorm.weight": "model-00007-of-00011.safetensors",
+    "model.layers.27.mlp.down_proj.weight": "model-00007-of-00011.safetensors",
+    "model.layers.27.mlp.gate_proj.weight": "model-00007-of-00011.safetensors",
+    "model.layers.27.mlp.up_proj.weight": "model-00007-of-00011.safetensors",
+    "model.layers.27.post_attention_layernorm.weight": "model-00007-of-00011.safetensors",
+    "model.layers.27.self_attn.a0": "model-00007-of-00011.safetensors",
+    "model.layers.27.self_attn.a1": "model-00007-of-00011.safetensors",
+    "model.layers.27.self_attn.a2": "model-00007-of-00011.safetensors",
+    "model.layers.27.self_attn.g1": "model-00007-of-00011.safetensors",
+    "model.layers.27.self_attn.g2": "model-00007-of-00011.safetensors",
+    "model.layers.27.self_attn.key.weight": "model-00007-of-00011.safetensors",
+    "model.layers.27.self_attn.output.weight": "model-00007-of-00011.safetensors",
+    "model.layers.27.self_attn.receptance.weight": "model-00007-of-00011.safetensors",
+    "model.layers.27.self_attn.value.weight": "model-00007-of-00011.safetensors",
+    "model.layers.27.self_attn.w0": "model-00007-of-00011.safetensors",
+    "model.layers.27.self_attn.w1": "model-00007-of-00011.safetensors",
+    "model.layers.27.self_attn.w2": "model-00007-of-00011.safetensors",
+    "model.layers.28.input_layernorm.weight": "model-00007-of-00011.safetensors",
+    "model.layers.28.mlp.down_proj.weight": "model-00007-of-00011.safetensors",
+    "model.layers.28.mlp.gate_proj.weight": "model-00007-of-00011.safetensors",
+    "model.layers.28.mlp.up_proj.weight": "model-00007-of-00011.safetensors",
+    "model.layers.28.post_attention_layernorm.weight": "model-00007-of-00011.safetensors",
+    "model.layers.28.self_attn.a0": "model-00007-of-00011.safetensors",
+    "model.layers.28.self_attn.a1": "model-00007-of-00011.safetensors",
+    "model.layers.28.self_attn.a2": "model-00007-of-00011.safetensors",
+    "model.layers.28.self_attn.g1": "model-00007-of-00011.safetensors",
+    "model.layers.28.self_attn.g2": "model-00007-of-00011.safetensors",
+    "model.layers.28.self_attn.key.weight": "model-00007-of-00011.safetensors",
+    "model.layers.28.self_attn.output.weight": "model-00007-of-00011.safetensors",
+    "model.layers.28.self_attn.receptance.weight": "model-00007-of-00011.safetensors",
+    "model.layers.28.self_attn.value.weight": "model-00007-of-00011.safetensors",
+    "model.layers.28.self_attn.w0": "model-00007-of-00011.safetensors",
+    "model.layers.28.self_attn.w1": "model-00007-of-00011.safetensors",
+    "model.layers.28.self_attn.w2": "model-00007-of-00011.safetensors",
+    "model.layers.29.input_layernorm.weight": "model-00007-of-00011.safetensors",
+    "model.layers.29.mlp.down_proj.weight": "model-00008-of-00011.safetensors",
+    "model.layers.29.mlp.gate_proj.weight": "model-00008-of-00011.safetensors",
+    "model.layers.29.mlp.up_proj.weight": "model-00008-of-00011.safetensors",
+    "model.layers.29.post_attention_layernorm.weight": "model-00008-of-00011.safetensors",
+    "model.layers.29.self_attn.a0": "model-00008-of-00011.safetensors",
+    "model.layers.29.self_attn.a1": "model-00008-of-00011.safetensors",
+    "model.layers.29.self_attn.a2": "model-00008-of-00011.safetensors",
+    "model.layers.29.self_attn.g1": "model-00008-of-00011.safetensors",
+    "model.layers.29.self_attn.g2": "model-00008-of-00011.safetensors",
+    "model.layers.29.self_attn.key.weight": "model-00008-of-00011.safetensors",
+    "model.layers.29.self_attn.output.weight": "model-00008-of-00011.safetensors",
+    "model.layers.29.self_attn.receptance.weight": "model-00008-of-00011.safetensors",
+    "model.layers.29.self_attn.value.weight": "model-00008-of-00011.safetensors",
+    "model.layers.29.self_attn.w0": "model-00008-of-00011.safetensors",
+    "model.layers.29.self_attn.w1": "model-00008-of-00011.safetensors",
+    "model.layers.29.self_attn.w2": "model-00008-of-00011.safetensors",
+    "model.layers.30.input_layernorm.weight": "model-00008-of-00011.safetensors",
+    "model.layers.30.mlp.down_proj.weight": "model-00008-of-00011.safetensors",
+    "model.layers.30.mlp.gate_proj.weight": "model-00008-of-00011.safetensors",
+    "model.layers.30.mlp.up_proj.weight": "model-00008-of-00011.safetensors",
+    "model.layers.30.post_attention_layernorm.weight": "model-00008-of-00011.safetensors",
+    "model.layers.30.self_attn.k_proj.weight": "model-00008-of-00011.safetensors",
+    "model.layers.30.self_attn.o_proj.weight": "model-00008-of-00011.safetensors",
+    "model.layers.30.self_attn.q_proj.weight": "model-00008-of-00011.safetensors",
+    "model.layers.30.self_attn.v_proj.weight": "model-00008-of-00011.safetensors",
+    "model.layers.31.input_layernorm.weight": "model-00008-of-00011.safetensors",
+    "model.layers.31.mlp.down_proj.weight": "model-00008-of-00011.safetensors",
+    "model.layers.31.mlp.gate_proj.weight": "model-00008-of-00011.safetensors",
+    "model.layers.31.mlp.up_proj.weight": "model-00008-of-00011.safetensors",
+    "model.layers.31.post_attention_layernorm.weight": "model-00008-of-00011.safetensors",
+    "model.layers.31.self_attn.a0": "model-00008-of-00011.safetensors",
+    "model.layers.31.self_attn.a1": "model-00008-of-00011.safetensors",
+    "model.layers.31.self_attn.a2": "model-00008-of-00011.safetensors",
+    "model.layers.31.self_attn.g1": "model-00008-of-00011.safetensors",
+    "model.layers.31.self_attn.g2": "model-00008-of-00011.safetensors",
+    "model.layers.31.self_attn.key.weight": "model-00008-of-00011.safetensors",
+    "model.layers.31.self_attn.output.weight": "model-00008-of-00011.safetensors",
+    "model.layers.31.self_attn.receptance.weight": "model-00008-of-00011.safetensors",
+    "model.layers.31.self_attn.value.weight": "model-00008-of-00011.safetensors",
+    "model.layers.31.self_attn.w0": "model-00008-of-00011.safetensors",
+    "model.layers.31.self_attn.w1": "model-00008-of-00011.safetensors",
+    "model.layers.31.self_attn.w2": "model-00008-of-00011.safetensors",
+    "model.layers.32.input_layernorm.weight": "model-00008-of-00011.safetensors",
+    "model.layers.32.mlp.down_proj.weight": "model-00008-of-00011.safetensors",
+    "model.layers.32.mlp.gate_proj.weight": "model-00008-of-00011.safetensors",
+    "model.layers.32.mlp.up_proj.weight": "model-00008-of-00011.safetensors",
+    "model.layers.32.post_attention_layernorm.weight": "model-00008-of-00011.safetensors",
+    "model.layers.32.self_attn.a0": "model-00008-of-00011.safetensors",
+    "model.layers.32.self_attn.a1": "model-00008-of-00011.safetensors",
+    "model.layers.32.self_attn.a2": "model-00008-of-00011.safetensors",
+    "model.layers.32.self_attn.g1": "model-00008-of-00011.safetensors",
+    "model.layers.32.self_attn.g2": "model-00008-of-00011.safetensors",
+    "model.layers.32.self_attn.key.weight": "model-00008-of-00011.safetensors",
+    "model.layers.32.self_attn.output.weight": "model-00008-of-00011.safetensors",
+    "model.layers.32.self_attn.receptance.weight": "model-00008-of-00011.safetensors",
+    "model.layers.32.self_attn.value.weight": "model-00008-of-00011.safetensors",
+    "model.layers.32.self_attn.w0": "model-00008-of-00011.safetensors",
+    "model.layers.32.self_attn.w1": "model-00008-of-00011.safetensors",
+    "model.layers.32.self_attn.w2": "model-00008-of-00011.safetensors",
+    "model.layers.33.input_layernorm.weight": "model-00008-of-00011.safetensors",
+    "model.layers.33.mlp.down_proj.weight": "model-00008-of-00011.safetensors",
+    "model.layers.33.mlp.gate_proj.weight": "model-00008-of-00011.safetensors",
+    "model.layers.33.mlp.up_proj.weight": "model-00009-of-00011.safetensors",
+    "model.layers.33.post_attention_layernorm.weight": "model-00009-of-00011.safetensors",
+    "model.layers.33.self_attn.a0": "model-00009-of-00011.safetensors",
+    "model.layers.33.self_attn.a1": "model-00009-of-00011.safetensors",
+    "model.layers.33.self_attn.a2": "model-00009-of-00011.safetensors",
+    "model.layers.33.self_attn.g1": "model-00009-of-00011.safetensors",
+    "model.layers.33.self_attn.g2": "model-00009-of-00011.safetensors",
+    "model.layers.33.self_attn.key.weight": "model-00009-of-00011.safetensors",
+    "model.layers.33.self_attn.output.weight": "model-00009-of-00011.safetensors",
+    "model.layers.33.self_attn.receptance.weight": "model-00009-of-00011.safetensors",
+    "model.layers.33.self_attn.value.weight": "model-00009-of-00011.safetensors",
+    "model.layers.33.self_attn.w0": "model-00009-of-00011.safetensors",
+    "model.layers.33.self_attn.w1": "model-00009-of-00011.safetensors",
+    "model.layers.33.self_attn.w2": "model-00009-of-00011.safetensors",
+    "model.layers.34.input_layernorm.weight": "model-00009-of-00011.safetensors",
+    "model.layers.34.mlp.down_proj.weight": "model-00009-of-00011.safetensors",
+    "model.layers.34.mlp.gate_proj.weight": "model-00009-of-00011.safetensors",
+    "model.layers.34.mlp.up_proj.weight": "model-00009-of-00011.safetensors",
+    "model.layers.34.post_attention_layernorm.weight": "model-00009-of-00011.safetensors",
+    "model.layers.34.self_attn.a0": "model-00009-of-00011.safetensors",
+    "model.layers.34.self_attn.a1": "model-00009-of-00011.safetensors",
+    "model.layers.34.self_attn.a2": "model-00009-of-00011.safetensors",
+    "model.layers.34.self_attn.g1": "model-00009-of-00011.safetensors",
+    "model.layers.34.self_attn.g2": "model-00009-of-00011.safetensors",
+    "model.layers.34.self_attn.key.weight": "model-00009-of-00011.safetensors",
+    "model.layers.34.self_attn.output.weight": "model-00009-of-00011.safetensors",
+    "model.layers.34.self_attn.receptance.weight": "model-00009-of-00011.safetensors",
+    "model.layers.34.self_attn.value.weight": "model-00009-of-00011.safetensors",
+    "model.layers.34.self_attn.w0": "model-00009-of-00011.safetensors",
+    "model.layers.34.self_attn.w1": "model-00009-of-00011.safetensors",
+    "model.layers.34.self_attn.w2": "model-00009-of-00011.safetensors",
+    "model.layers.35.input_layernorm.weight": "model-00009-of-00011.safetensors",
+    "model.layers.35.mlp.down_proj.weight": "model-00009-of-00011.safetensors",
+    "model.layers.35.mlp.gate_proj.weight": "model-00009-of-00011.safetensors",
+    "model.layers.35.mlp.up_proj.weight": "model-00009-of-00011.safetensors",
+    "model.layers.35.post_attention_layernorm.weight": "model-00009-of-00011.safetensors",
+    "model.layers.35.self_attn.k_proj.weight": "model-00009-of-00011.safetensors",
+    "model.layers.35.self_attn.o_proj.weight": "model-00009-of-00011.safetensors",
+    "model.layers.35.self_attn.q_proj.weight": "model-00009-of-00011.safetensors",
+    "model.layers.35.self_attn.v_proj.weight": "model-00009-of-00011.safetensors",
+    "model.layers.36.input_layernorm.weight": "model-00009-of-00011.safetensors",
+    "model.layers.36.mlp.down_proj.weight": "model-00009-of-00011.safetensors",
+    "model.layers.36.mlp.gate_proj.weight": "model-00009-of-00011.safetensors",
+    "model.layers.36.mlp.up_proj.weight": "model-00009-of-00011.safetensors",
+    "model.layers.36.post_attention_layernorm.weight": "model-00009-of-00011.safetensors",
+    "model.layers.36.self_attn.a0": "model-00009-of-00011.safetensors",
+    "model.layers.36.self_attn.a1": "model-00009-of-00011.safetensors",
+    "model.layers.36.self_attn.a2": "model-00009-of-00011.safetensors",
+    "model.layers.36.self_attn.g1": "model-00009-of-00011.safetensors",
+    "model.layers.36.self_attn.g2": "model-00009-of-00011.safetensors",
+    "model.layers.36.self_attn.key.weight": "model-00009-of-00011.safetensors",
+    "model.layers.36.self_attn.output.weight": "model-00009-of-00011.safetensors",
+    "model.layers.36.self_attn.receptance.weight": "model-00009-of-00011.safetensors",
+    "model.layers.36.self_attn.value.weight": "model-00009-of-00011.safetensors",
+    "model.layers.36.self_attn.w0": "model-00009-of-00011.safetensors",
+    "model.layers.36.self_attn.w1": "model-00009-of-00011.safetensors",
+    "model.layers.36.self_attn.w2": "model-00009-of-00011.safetensors",
+    "model.layers.37.input_layernorm.weight": "model-00009-of-00011.safetensors",
+    "model.layers.37.mlp.down_proj.weight": "model-00009-of-00011.safetensors",
+    "model.layers.37.mlp.gate_proj.weight": "model-00009-of-00011.safetensors",
+    "model.layers.37.mlp.up_proj.weight": "model-00009-of-00011.safetensors",
+    "model.layers.37.post_attention_layernorm.weight": "model-00009-of-00011.safetensors",
+    "model.layers.37.self_attn.a0": "model-00009-of-00011.safetensors",
+    "model.layers.37.self_attn.a1": "model-00009-of-00011.safetensors",
+    "model.layers.37.self_attn.a2": "model-00009-of-00011.safetensors",
+    "model.layers.37.self_attn.g1": "model-00009-of-00011.safetensors",
+    "model.layers.37.self_attn.g2": "model-00009-of-00011.safetensors",
+    "model.layers.37.self_attn.key.weight": "model-00009-of-00011.safetensors",
+    "model.layers.37.self_attn.output.weight": "model-00009-of-00011.safetensors",
+    "model.layers.37.self_attn.receptance.weight": "model-00009-of-00011.safetensors",
+    "model.layers.37.self_attn.value.weight": "model-00009-of-00011.safetensors",
+    "model.layers.37.self_attn.w0": "model-00009-of-00011.safetensors",
+    "model.layers.37.self_attn.w1": "model-00009-of-00011.safetensors",
+    "model.layers.37.self_attn.w2": "model-00009-of-00011.safetensors",
+    "model.layers.38.input_layernorm.weight": "model-00009-of-00011.safetensors",
+    "model.layers.38.mlp.down_proj.weight": "model-00010-of-00011.safetensors",
+    "model.layers.38.mlp.gate_proj.weight": "model-00010-of-00011.safetensors",
+    "model.layers.38.mlp.up_proj.weight": "model-00010-of-00011.safetensors",
+    "model.layers.38.post_attention_layernorm.weight": "model-00010-of-00011.safetensors",
+    "model.layers.38.self_attn.a0": "model-00010-of-00011.safetensors",
+    "model.layers.38.self_attn.a1": "model-00010-of-00011.safetensors",
+    "model.layers.38.self_attn.a2": "model-00010-of-00011.safetensors",
+    "model.layers.38.self_attn.g1": "model-00010-of-00011.safetensors",
+    "model.layers.38.self_attn.g2": "model-00010-of-00011.safetensors",
+    "model.layers.38.self_attn.key.weight": "model-00010-of-00011.safetensors",
+    "model.layers.38.self_attn.output.weight": "model-00010-of-00011.safetensors",
+    "model.layers.38.self_attn.receptance.weight": "model-00010-of-00011.safetensors",
+    "model.layers.38.self_attn.value.weight": "model-00010-of-00011.safetensors",
+    "model.layers.38.self_attn.w0": "model-00010-of-00011.safetensors",
+    "model.layers.38.self_attn.w1": "model-00010-of-00011.safetensors",
+    "model.layers.38.self_attn.w2": "model-00010-of-00011.safetensors",
+    "model.layers.39.input_layernorm.weight": "model-00010-of-00011.safetensors",
+    "model.layers.39.mlp.down_proj.weight": "model-00010-of-00011.safetensors",
+    "model.layers.39.mlp.gate_proj.weight": "model-00010-of-00011.safetensors",
+    "model.layers.39.mlp.up_proj.weight": "model-00010-of-00011.safetensors",
+    "model.layers.39.post_attention_layernorm.weight": "model-00010-of-00011.safetensors",
+    "model.layers.39.self_attn.k_proj.weight": "model-00010-of-00011.safetensors",
+    "model.layers.39.self_attn.o_proj.weight": "model-00010-of-00011.safetensors",
+    "model.layers.39.self_attn.q_proj.weight": "model-00010-of-00011.safetensors",
+    "model.layers.39.self_attn.v_proj.weight": "model-00010-of-00011.safetensors",
+    "model.layers.40.input_layernorm.weight": "model-00010-of-00011.safetensors",
+    "model.layers.40.mlp.down_proj.weight": "model-00010-of-00011.safetensors",
+    "model.layers.40.mlp.gate_proj.weight": "model-00010-of-00011.safetensors",
+    "model.layers.40.mlp.up_proj.weight": "model-00010-of-00011.safetensors",
+    "model.layers.40.post_attention_layernorm.weight": "model-00010-of-00011.safetensors",
+    "model.layers.40.self_attn.a0": "model-00010-of-00011.safetensors",
+    "model.layers.40.self_attn.a1": "model-00010-of-00011.safetensors",
+    "model.layers.40.self_attn.a2": "model-00010-of-00011.safetensors",
+    "model.layers.40.self_attn.g1": "model-00010-of-00011.safetensors",
+    "model.layers.40.self_attn.g2": "model-00010-of-00011.safetensors",
+    "model.layers.40.self_attn.key.weight": "model-00010-of-00011.safetensors",
+    "model.layers.40.self_attn.output.weight": "model-00010-of-00011.safetensors",
+    "model.layers.40.self_attn.receptance.weight": "model-00010-of-00011.safetensors",
+    "model.layers.40.self_attn.value.weight": "model-00010-of-00011.safetensors",
+    "model.layers.40.self_attn.w0": "model-00010-of-00011.safetensors",
+    "model.layers.40.self_attn.w1": "model-00010-of-00011.safetensors",
+    "model.layers.40.self_attn.w2": "model-00010-of-00011.safetensors",
+    "model.layers.41.input_layernorm.weight": "model-00010-of-00011.safetensors",
+    "model.layers.41.mlp.down_proj.weight": "model-00010-of-00011.safetensors",
+    "model.layers.41.mlp.gate_proj.weight": "model-00010-of-00011.safetensors",
+    "model.layers.41.mlp.up_proj.weight": "model-00010-of-00011.safetensors",
+    "model.layers.41.post_attention_layernorm.weight": "model-00010-of-00011.safetensors",
+    "model.layers.41.self_attn.a0": "model-00010-of-00011.safetensors",
+    "model.layers.41.self_attn.a1": "model-00010-of-00011.safetensors",
+    "model.layers.41.self_attn.a2": "model-00010-of-00011.safetensors",
+    "model.layers.41.self_attn.g1": "model-00010-of-00011.safetensors",
+    "model.layers.41.self_attn.g2": "model-00010-of-00011.safetensors",
+    "model.layers.41.self_attn.key.weight": "model-00010-of-00011.safetensors",
+    "model.layers.41.self_attn.output.weight": "model-00010-of-00011.safetensors",
+    "model.layers.41.self_attn.receptance.weight": "model-00010-of-00011.safetensors",
+    "model.layers.41.self_attn.value.weight": "model-00010-of-00011.safetensors",
+    "model.layers.41.self_attn.w0": "model-00010-of-00011.safetensors",
+    "model.layers.41.self_attn.w1": "model-00010-of-00011.safetensors",
+    "model.layers.41.self_attn.w2": "model-00010-of-00011.safetensors",
+    "model.layers.42.input_layernorm.weight": "model-00010-of-00011.safetensors",
+    "model.layers.42.mlp.down_proj.weight": "model-00010-of-00011.safetensors",
+    "model.layers.42.mlp.gate_proj.weight": "model-00010-of-00011.safetensors",
+    "model.layers.42.mlp.up_proj.weight": "model-00011-of-00011.safetensors",
+    "model.layers.42.post_attention_layernorm.weight": "model-00011-of-00011.safetensors",
+    "model.layers.42.self_attn.a0": "model-00011-of-00011.safetensors",
+    "model.layers.42.self_attn.a1": "model-00011-of-00011.safetensors",
+    "model.layers.42.self_attn.a2": "model-00011-of-00011.safetensors",
+    "model.layers.42.self_attn.g1": "model-00011-of-00011.safetensors",
+    "model.layers.42.self_attn.g2": "model-00011-of-00011.safetensors",
+    "model.layers.42.self_attn.key.weight": "model-00011-of-00011.safetensors",
+    "model.layers.42.self_attn.output.weight": "model-00011-of-00011.safetensors",
+    "model.layers.42.self_attn.receptance.weight": "model-00011-of-00011.safetensors",
+    "model.layers.42.self_attn.value.weight": "model-00011-of-00011.safetensors",
+    "model.layers.42.self_attn.w0": "model-00011-of-00011.safetensors",
+    "model.layers.42.self_attn.w1": "model-00011-of-00011.safetensors",
+    "model.layers.42.self_attn.w2": "model-00011-of-00011.safetensors",
+    "model.layers.43.input_layernorm.weight": "model-00011-of-00011.safetensors",
+    "model.layers.43.mlp.down_proj.weight": "model-00011-of-00011.safetensors",
+    "model.layers.43.mlp.gate_proj.weight": "model-00011-of-00011.safetensors",
+    "model.layers.43.mlp.up_proj.weight": "model-00011-of-00011.safetensors",
+    "model.layers.43.post_attention_layernorm.weight": "model-00011-of-00011.safetensors",
+    "model.layers.43.self_attn.k_proj.weight": "model-00011-of-00011.safetensors",
+    "model.layers.43.self_attn.o_proj.weight": "model-00011-of-00011.safetensors",
+    "model.layers.43.self_attn.q_proj.weight": "model-00011-of-00011.safetensors",
+    "model.layers.43.self_attn.v_proj.weight": "model-00011-of-00011.safetensors",
+    "model.norm.weight": "model-00011-of-00011.safetensors"
+  }
+}

config.json ADDED Viewed

	@@ -0,0 +1,55 @@

+{
+  "architectures": [
+    "RWKV07AQwen3ForCausalLM"
+  ],
+  "auto_map": {
+      "AutoConfig": "configuration_rwkv07aqwen3.RWKV07AQwen3Config",
+      "AutoModelForCausalLM": "modeling_rwkv07aqwen3.RWKV07AQwen3ForCausalLM"
+  },
+  "description": "Hybrid-RWKV Strategically Interleaved RWKV-Attention",
+  "base_model": "ByteDance-Seed/Seed-OSS-36B-Instruct",
+  "model_revision": "alpha",
+  "transformer_layers":[3,8,14,20,25,30,35,39,43],
+  "rwkv_layers": [0,1,2,4,5,6,7,9,10,11,12,13,15,16,17,18,19,21,22,23,24,26,27,28,29,31,32,33,34,36,37,38,40,41,42],
+  "rwkv_architecture": "hxa07a",
+  "enable_qk_norm": false,
+  "nope_in_transformer": true,
+  "nope_in_rwkv": false,
+  "lora_rank_decay": 320,
+  "lora_rank_iclr":96,
+  "lora_rank_gate":320,
+  "use_rope":true,
+  "attention_bias": false,
+  "attention_out_bias": false,
+  "attention_dropout": 0.0,
+  "bos_token_id": 100257,
+  "classifier_dropout": 0.0,
+  "eos_token_id": 100257,
+  "head_dim": 96,
+  "hidden_act": "silu",
+  "hidden_size": 6144,
+  "id2label": {
+    "0": "LABEL_0"
+  },
+  "initializer_range": 0.006,
+  "intermediate_size": 19648,
+  "label2id": {
+    "LABEL_0": 0
+  },
+  "max_position_embeddings": 98304,
+  "mlp_bias": false,
+  "model_type": "rwkv07aqwen3",
+  "num_attention_heads": 64,
+  "num_hidden_layers": 44,
+  "num_key_value_heads": 8,
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": null,
+  "rope_theta": 8000000,
+  "tie_word_embeddings": false,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.50.3",
+  "use_cache": true,
+  "vocab_size": 100352
+}

config_old.json ADDED Viewed

	@@ -0,0 +1,50 @@

+{
+  "architectures": [
+    "RWKV07AQwen3ForCausalLM"
+  ],
+  "auto_map": {
+      "AutoConfig": "configuration_rwkv07aqwen3.RWKV07AQwen3Config",
+      "AutoModelForCausalLM": "modeling_rwkv07aqwen3.RWKV07AQwen3ForCausalLM"
+  },
+  "description": "Hybrid-RWKV Strategically Interleaved RWKV-Attention",
+  "base_model": "ByteDance-Seed/Seed-OSS-36B-Instruct",
+  "model_revision": "alpha",
+  "transformer_layers":[3,7,11,15,19,23,27,31,35],
+  "rwkv_layers": [0,1,2,4,5,6,8,9,10,12,13,14,16,17,18,20,21,22,24,25,26,28,29,30,32,33,34],
+  "rwkv_architecture": "hxa07a",
+  "enable_qk_norm": true,
+  "nope_in_transformer": true,
+  "nope_in_rwkv": false,
+  "lora_rank_decay": 256,
+  "lora_rank_iclr":96,
+  "lora_rank_gate":256,
+  "use_rope":true,
+  "attention_bias": false,
+  "attention_out_bias": false,
+  "attention_dropout": 0.0,
+  "bos_token_id": 151643,
+  "eos_token_id": 151645,
+  "head_dim": 128,
+  "hidden_act": "silu",
+  "hidden_size": 4096,
+  "initializer_range": 0.02,
+  "intermediate_size": 12288,
+  "max_position_embeddings": 131072,
+  "max_window_layers": 36,
+  "model_type": "rwkv07aqwen3",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 36,
+  "num_key_value_heads": 8,
+  "rms_norm_eps": 1e-06,
+  "rope_scaling": null,
+  "rope_theta": 5000000,
+  "sliding_window": null,
+  "tie_word_embeddings": false,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.51.0",
+  "use_cache": true,
+  "use_sliding_window": false,
+  "vocab_size": 151936
+}

configuration_rwkv07aqwen3.py ADDED Viewed

	@@ -0,0 +1,238 @@

+# coding=utf-8
+# Copyright 2024 The Qwen team, Alibaba Group and the HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""RWKV07AQwen3 model configuration"""
+from transformers.configuration_utils import PretrainedConfig, layer_type_validation
+from transformers.modeling_rope_utils import rope_config_validation
+from transformers.utils import logging
+logger = logging.get_logger(__name__)
+class RWKV07AQwen3Config(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`RWKV07AQwen3Model`]. It is used to instantiate a
+    RWKV07AQwen3 model according to the specified arguments, defining the model architecture. Instantiating a configuration
+    with the defaults will yield a similar configuration to that of
+    Qwen3-7B-beta [Qwen/Qwen3-7B-beta](https://huggingface.co/Qwen/Qwen3-7B-beta).
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+    Args:
+        vocab_size (`int`, *optional*, defaults to 151936):
+            Vocabulary size of the RWKV07AQwen3 model. Defines the number of different tokens that can be represented by the
+            `input_ids` passed when calling [`RWKV07AQwen3Model`]
+        hidden_size (`int`, *optional*, defaults to 4096):
+            Dimension of the hidden representations.
+        intermediate_size (`int`, *optional*, defaults to 22016):
+            Dimension of the MLP representations.
+        num_hidden_layers (`int`, *optional*, defaults to 32):
+            Number of hidden layers in the Transformer encoder.
+        num_attention_heads (`int`, *optional*, defaults to 32):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        num_key_value_heads (`int`, *optional*, defaults to 32):
+            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
+            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
+            `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
+            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
+            by meanpooling all the original heads within that group. For more details checkout [this
+            paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `32`.
+        lora_rank_decay (`int`, *optional*):
+            The rank of the lora used to generate decay.
+        lora_rank_iclr (`int`, *optional*):
+            The rank of the lora used to generate the in-context learning rate.
+        lora_rank_value_residual_mix (`int`, *optional*):
+            The rank of the lora used to generate the value residual mix amount.
+        lora_rank_value_gate (`int`, *optional*):
+            The rank of the lora used to generate the gate.
+        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
+            The non-linear activation function (function or string) in the decoder.
+        max_position_embeddings (`int`, *optional*, defaults to 32768):
+            The maximum sequence length that this model might ever be used with.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        rms_norm_eps (`float`, *optional*, defaults to 1e-06):
+            The epsilon used by the rms normalization layers.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/values attentions (not used by all models). Only
+            relevant if `config.is_decoder=True`.
+        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
+            Whether the model's input and output word embeddings should be tied.
+        rope_theta (`float`, *optional*, defaults to 10000.0):
+            The base period of the RoPE embeddings.
+        rope_scaling (`Dict`, *optional*):
+            Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type
+            and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value
+            accordingly.
+            Expected contents:
+                `rope_type` (`str`):
+                    The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope',
+                    'llama3'], with 'default' being the original RoPE implementation.
+                `factor` (`float`, *optional*):
+                    Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In
+                    most scaling types, a `factor` of x will enable the model to handle sequences of length x *
+                    original maximum pre-trained length.
+                `original_max_position_embeddings` (`int`, *optional*):
+                    Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during
+                    pretraining.
+                `attention_factor` (`float`, *optional*):
+                    Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention
+                    computation. If unspecified, it defaults to value recommended by the implementation, using the
+                    `factor` field to infer the suggested value.
+                `beta_fast` (`float`, *optional*):
+                    Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear
+                    ramp function. If unspecified, it defaults to 32.
+                `beta_slow` (`float`, *optional*):
+                    Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear
+                    ramp function. If unspecified, it defaults to 1.
+                `short_factor` (`List[float]`, *optional*):
+                    Only used with 'longrope'. The scaling factor to be applied to short contexts (<
+                    `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
+                    size divided by the number of attention heads divided by 2
+                `long_factor` (`List[float]`, *optional*):
+                    Only used with 'longrope'. The scaling factor to be applied to long contexts (<
+                    `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
+                    size divided by the number of attention heads divided by 2
+                `low_freq_factor` (`float`, *optional*):
+                    Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE
+                `high_freq_factor` (`float`, *optional*):
+                    Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE
+        use_sliding_window (`bool`, *optional*, defaults to `False`):
+            Whether to use sliding window attention.
+        sliding_window (`int`, *optional*, defaults to 4096):
+            Sliding window attention (SWA) window size. If not specified, will default to `4096`.
+        max_window_layers (`int`, *optional*, defaults to 28):
+            The number of layers that use SWA (Sliding Window Attention). The bottom layers use SWA while the top use full attention.
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+    ```python
+    >>> from transformers import RWKV07AQwen3Model, RWKV07AQwen3Config
+    >>> # Initializing a RWKV07AQwen3 style configuration
+    >>> configuration = RWKV07AQwen3Config()
+    >>> # Initializing a model from the RWKV07AQwen3-7B style configuration
+    >>> model = RWKV07AQwen3Model(configuration)
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+    model_type = "rwkv07aqwen3"
+    keys_to_ignore_at_inference = ["past_key_values"]
+    def __init__(
+        self,
+        vocab_size=151936,
+        hidden_size=4096,
+        intermediate_size=22016,
+        num_hidden_layers=32,
+        num_attention_heads=32,
+        num_key_value_heads=32,
+        lora_rank_tokenshift=None,
+        lora_rank_decay=None,
+        lora_rank_iclr=None,
+        lora_rank_value_residual_mix=None,
+        lora_rank_value_key_mix=None,
+        lora_rank_gate=None,
+        hidden_act="silu",
+        max_position_embeddings=32768,
+        initializer_range=0.02,
+        rms_norm_eps=1e-6,
+        use_cache=True,
+        tie_word_embeddings=False,
+        use_rope=True,
+        rope_theta=10000.0,
+        rope_scaling=None,
+        use_sliding_window=False,
+        sliding_window=4096,
+        max_window_layers=28,
+        first_attention_layer=9999,
+        first_post_attention_layer=9999,
+        attention_striping=1,
+        last_striping_layer=99999,
+        layer_types=None,
+        attention_dropout=0.0,
+        attention_bias=True,
+        attention_output_bias=False,
+        gate_rank_type=2,
+        balance_state=True,
+        groupnorm_att=False,
+        use_tokenshift=False,
+        **kwargs,
+    ):
+        self.vocab_size = vocab_size
+        self.max_position_embeddings = max_position_embeddings
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.use_sliding_window = use_sliding_window
+        self.sliding_window = sliding_window if use_sliding_window else None
+        self.max_window_layers = max_window_layers
+        self.first_attention_layer = first_attention_layer
+        self.first_post_attention_layer = first_post_attention_layer
+        self.attention_striping = attention_striping
+        self.last_striping_layer = last_striping_layer
+        # for backward compatibility
+        if num_key_value_heads is None:
+            num_key_value_heads = num_attention_heads
+        self.num_key_value_heads = num_key_value_heads
+        self.lora_rank_tokenshift = lora_rank_tokenshift
+        self.lora_rank_decay = lora_rank_decay
+        self.lora_rank_iclr = lora_rank_iclr
+        self.lora_rank_value_residual_mix = lora_rank_value_residual_mix
+        self.lora_rank_gate = lora_rank_gate
+        self.hidden_act = hidden_act
+        self.initializer_range = initializer_range
+        self.rms_norm_eps = rms_norm_eps
+        self.use_cache = use_cache
+        self.use_rope = use_rope
+        self.rope_theta = rope_theta
+        self.rope_scaling = rope_scaling
+        self.attention_dropout = attention_dropout
+        # Validate the correctness of rotary position embeddings parameters
+        # BC: if there is a 'type' field, move it to 'rope_type'.
+        if self.rope_scaling is not None and "type" in self.rope_scaling:
+            self.rope_scaling["rope_type"] = self.rope_scaling["type"]
+        rope_config_validation(self)
+        self.layer_types = layer_types
+        if self.layer_types is None:
+            self.layer_types = [
+                "sliding_attention"
+                if self.sliding_window is not None and i >= self.max_window_layers
+                else "full_attention"
+                for i in range(self.num_hidden_layers)
+            ]
+        layer_type_validation(self.layer_types)
+        self.attention_bias = attention_bias
+        self.attention_output_bias = attention_output_bias
+        self.gate_rank_type = gate_rank_type
+        self.balance_state = balance_state
+        self.groupnorm_att = groupnorm_att
+        self.use_tokenshift = use_tokenshift
+        super().__init__(
+            tie_word_embeddings=tie_word_embeddings,
+            **kwargs,
+        )

generation_config.json ADDED Viewed

	@@ -0,0 +1,6 @@

+{
+  "_from_model_config": true,
+  "bos_token_id": 100257,
+  "eos_token_id": 100257,
+  "transformers_version": "4.50.3"
+}

model-00001-of-00011.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9403f763937b03e608bf2487a7bd8ce111ba050d6ee3dc0397769a05f418afeb
+size 4290887656

model-00002-of-00011.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:02afbc9df512156a9f5f8431028ded0f9c969fd0635e28b71eb3c7d147fc96a9
+size 4114004696

model-00003-of-00011.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:81055b6cfee64e74086c33d86c6b7bf8eacfaf18a50d581ba7978fff9c527772
+size 4060577896

model-00004-of-00011.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:1e5bb6f78d213dc4e1d3d1f6f24024ead5965dbe390f82103aaf1673d4c8eb1c
+size 4114004760

model-00005-of-00011.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:fbab38afe043b79b2317b3dad52ef775d8db9118ec7ddc853c90bc493dbdb08f
+size 4078691328

model-00006-of-00011.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:7f0a72ea744731942728ec1fd2f65baa81d04fb874f58508af39148b091b5d13
+size 4114004776

model-00007-of-00011.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:302f8cdbf7297b85123c8019763d7e71eaa22a39f565f05dc2307a1a388e60ca
+size 4060577968

model-00008-of-00011.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:96b29a0435673a9d8bcf78ddbe59b56c362b53fc398ec27606cef32824750edc
+size 4114004760

model-00009-of-00011.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:6dbd044aa80d61a90bc1504ec9de8ed7e57679d4f91b480bfb820104296dc14a
+size 4060577968

model-00010-of-00011.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:79e63b341b18e2aa064ad1c665c12c69038c1de6566f07a8991fc500192533c2
+size 4114004760

model-00011-of-00011.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3c1fce15a053187fc757cc4f4db442ff0435340452d106c1afee2c0068c50f31
+size 1323641448

model.safetensors.index.json ADDED Viewed

	@@ -0,0 +1,687 @@

+{
+  "metadata": {
+    "total_size": 42444902400,
+    "format": "safetensors"
+  },
+  "weight_map": {
+    "lm_head.weight": "model-00001-of-00011.safetensors",
+    "model.embed_tokens.weight": "model-00001-of-00011.safetensors",
+    "model.layers.0.input_layernorm.weight": "model-00001-of-00011.safetensors",
+    "model.layers.0.mlp.down_proj.weight": "model-00001-of-00011.safetensors",
+    "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00011.safetensors",
+    "model.layers.0.mlp.up_proj.weight": "model-00001-of-00011.safetensors",
+    "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00011.safetensors",
+    "model.layers.0.self_attn.a0": "model-00001-of-00011.safetensors",
+    "model.layers.0.self_attn.a1": "model-00001-of-00011.safetensors",
+    "model.layers.0.self_attn.a2": "model-00001-of-00011.safetensors",
+    "model.layers.0.self_attn.g1": "model-00001-of-00011.safetensors",
+    "model.layers.0.self_attn.g2": "model-00001-of-00011.safetensors",
+    "model.layers.0.self_attn.key.weight": "model-00001-of-00011.safetensors",
+    "model.layers.0.self_attn.output.weight": "model-00001-of-00011.safetensors",
+    "model.layers.0.self_attn.receptance.weight": "model-00001-of-00011.safetensors",
+    "model.layers.0.self_attn.value.weight": "model-00001-of-00011.safetensors",
+    "model.layers.0.self_attn.w0": "model-00001-of-00011.safetensors",
+    "model.layers.0.self_attn.w1": "model-00001-of-00011.safetensors",
+    "model.layers.0.self_attn.w2": "model-00001-of-00011.safetensors",
+    "model.layers.1.input_layernorm.weight": "model-00001-of-00011.safetensors",
+    "model.layers.1.mlp.down_proj.weight": "model-00001-of-00011.safetensors",
+    "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00011.safetensors",
+    "model.layers.1.mlp.up_proj.weight": "model-00001-of-00011.safetensors",
+    "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00011.safetensors",
+    "model.layers.1.self_attn.a0": "model-00001-of-00011.safetensors",
+    "model.layers.1.self_attn.a1": "model-00001-of-00011.safetensors",
+    "model.layers.1.self_attn.a2": "model-00001-of-00011.safetensors",
+    "model.layers.1.self_attn.g1": "model-00001-of-00011.safetensors",
+    "model.layers.1.self_attn.g2": "model-00001-of-00011.safetensors",
+    "model.layers.1.self_attn.key.weight": "model-00001-of-00011.safetensors",
+    "model.layers.1.self_attn.output.weight": "model-00001-of-00011.safetensors",
+    "model.layers.1.self_attn.receptance.weight": "model-00001-of-00011.safetensors",
+    "model.layers.1.self_attn.value.weight": "model-00001-of-00011.safetensors",
+    "model.layers.1.self_attn.w0": "model-00001-of-00011.safetensors",
+    "model.layers.1.self_attn.w1": "model-00001-of-00011.safetensors",
+    "model.layers.1.self_attn.w2": "model-00001-of-00011.safetensors",
+    "model.layers.2.input_layernorm.weight": "model-00001-of-00011.safetensors",
+    "model.layers.2.mlp.down_proj.weight": "model-00002-of-00011.safetensors",
+    "model.layers.2.mlp.gate_proj.weight": "model-00002-of-00011.safetensors",
+    "model.layers.2.mlp.up_proj.weight": "model-00002-of-00011.safetensors",
+    "model.layers.2.post_attention_layernorm.weight": "model-00002-of-00011.safetensors",
+    "model.layers.2.self_attn.a0": "model-00002-of-00011.safetensors",
+    "model.layers.2.self_attn.a1": "model-00002-of-00011.safetensors",
+    "model.layers.2.self_attn.a2": "model-00002-of-00011.safetensors",
+    "model.layers.2.self_attn.g1": "model-00002-of-00011.safetensors",
+    "model.layers.2.self_attn.g2": "model-00002-of-00011.safetensors",
+    "model.layers.2.self_attn.key.weight": "model-00002-of-00011.safetensors",
+    "model.layers.2.self_attn.output.weight": "model-00002-of-00011.safetensors",
+    "model.layers.2.self_attn.receptance.weight": "model-00002-of-00011.safetensors",
+    "model.layers.2.self_attn.value.weight": "model-00002-of-00011.safetensors",
+    "model.layers.2.self_attn.w0": "model-00002-of-00011.safetensors",
+    "model.layers.2.self_attn.w1": "model-00002-of-00011.safetensors",
+    "model.layers.2.self_attn.w2": "model-00002-of-00011.safetensors",
+    "model.layers.3.input_layernorm.weight": "model-00002-of-00011.safetensors",
+    "model.layers.3.mlp.down_proj.weight": "model-00002-of-00011.safetensors",
+    "model.layers.3.mlp.gate_proj.weight": "model-00002-of-00011.safetensors",
+    "model.layers.3.mlp.up_proj.weight": "model-00002-of-00011.safetensors",
+    "model.layers.3.post_attention_layernorm.weight": "model-00002-of-00011.safetensors",
+    "model.layers.3.self_attn.k_proj.weight": "model-00002-of-00011.safetensors",
+    "model.layers.3.self_attn.o_proj.weight": "model-00002-of-00011.safetensors",
+    "model.layers.3.self_attn.q_proj.weight": "model-00002-of-00011.safetensors",
+    "model.layers.3.self_attn.v_proj.weight": "model-00002-of-00011.safetensors",
+    "model.layers.4.input_layernorm.weight": "model-00002-of-00011.safetensors",
+    "model.layers.4.mlp.down_proj.weight": "model-00002-of-00011.safetensors",
+    "model.layers.4.mlp.gate_proj.weight": "model-00002-of-00011.safetensors",
+    "model.layers.4.mlp.up_proj.weight": "model-00002-of-00011.safetensors",
+    "model.layers.4.post_attention_layernorm.weight": "model-00002-of-00011.safetensors",
+    "model.layers.4.self_attn.a0": "model-00002-of-00011.safetensors",
+    "model.layers.4.self_attn.a1": "model-00002-of-00011.safetensors",
+    "model.layers.4.self_attn.a2": "model-00002-of-00011.safetensors",
+    "model.layers.4.self_attn.g1": "model-00002-of-00011.safetensors",
+    "model.layers.4.self_attn.g2": "model-00002-of-00011.safetensors",
+    "model.layers.4.self_attn.key.weight": "model-00002-of-00011.safetensors",
+    "model.layers.4.self_attn.output.weight": "model-00002-of-00011.safetensors",
+    "model.layers.4.self_attn.receptance.weight": "model-00002-of-00011.safetensors",
+    "model.layers.4.self_attn.value.weight": "model-00002-of-00011.safetensors",
+    "model.layers.4.self_attn.w0": "model-00002-of-00011.safetensors",
+    "model.layers.4.self_attn.w1": "model-00002-of-00011.safetensors",
+    "model.layers.4.self_attn.w2": "model-00002-of-00011.safetensors",
+    "model.layers.5.input_layernorm.weight": "model-00002-of-00011.safetensors",
+    "model.layers.5.mlp.down_proj.weight": "model-00002-of-00011.safetensors",
+    "model.layers.5.mlp.gate_proj.weight": "model-00002-of-00011.safetensors",
+    "model.layers.5.mlp.up_proj.weight": "model-00002-of-00011.safetensors",
+    "model.layers.5.post_attention_layernorm.weight": "model-00002-of-00011.safetensors",
+    "model.layers.5.self_attn.a0": "model-00002-of-00011.safetensors",
+    "model.layers.5.self_attn.a1": "model-00002-of-00011.safetensors",
+    "model.layers.5.self_attn.a2": "model-00002-of-00011.safetensors",
+    "model.layers.5.self_attn.g1": "model-00002-of-00011.safetensors",
+    "model.layers.5.self_attn.g2": "model-00002-of-00011.safetensors",
+    "model.layers.5.self_attn.key.weight": "model-00002-of-00011.safetensors",
+    "model.layers.5.self_attn.output.weight": "model-00002-of-00011.safetensors",
+    "model.layers.5.self_attn.receptance.weight": "model-00002-of-00011.safetensors",
+    "model.layers.5.self_attn.value.weight": "model-00002-of-00011.safetensors",
+    "model.layers.5.self_attn.w0": "model-00002-of-00011.safetensors",
+    "model.layers.5.self_attn.w1": "model-00002-of-00011.safetensors",
+    "model.layers.5.self_attn.w2": "model-00002-of-00011.safetensors",
+    "model.layers.6.input_layernorm.weight": "model-00002-of-00011.safetensors",
+    "model.layers.6.mlp.down_proj.weight": "model-00002-of-00011.safetensors",
+    "model.layers.6.mlp.gate_proj.weight": "model-00002-of-00011.safetensors",
+    "model.layers.6.mlp.up_proj.weight": "model-00003-of-00011.safetensors",
+    "model.layers.6.post_attention_layernorm.weight": "model-00003-of-00011.safetensors",
+    "model.layers.6.self_attn.a0": "model-00003-of-00011.safetensors",
+    "model.layers.6.self_attn.a1": "model-00003-of-00011.safetensors",
+    "model.layers.6.self_attn.a2": "model-00003-of-00011.safetensors",
+    "model.layers.6.self_attn.g1": "model-00003-of-00011.safetensors",
+    "model.layers.6.self_attn.g2": "model-00003-of-00011.safetensors",
+    "model.layers.6.self_attn.key.weight": "model-00003-of-00011.safetensors",
+    "model.layers.6.self_attn.output.weight": "model-00003-of-00011.safetensors",
+    "model.layers.6.self_attn.receptance.weight": "model-00003-of-00011.safetensors",
+    "model.layers.6.self_attn.value.weight": "model-00003-of-00011.safetensors",
+    "model.layers.6.self_attn.w0": "model-00003-of-00011.safetensors",
+    "model.layers.6.self_attn.w1": "model-00003-of-00011.safetensors",
+    "model.layers.6.self_attn.w2": "model-00003-of-00011.safetensors",
+    "model.layers.7.input_layernorm.weight": "model-00003-of-00011.safetensors",
+    "model.layers.7.mlp.down_proj.weight": "model-00003-of-00011.safetensors",
+    "model.layers.7.mlp.gate_proj.weight": "model-00003-of-00011.safetensors",
+    "model.layers.7.mlp.up_proj.weight": "model-00003-of-00011.safetensors",
+    "model.layers.7.post_attention_layernorm.weight": "model-00003-of-00011.safetensors",
+    "model.layers.7.self_attn.a0": "model-00003-of-00011.safetensors",
+    "model.layers.7.self_attn.a1": "model-00003-of-00011.safetensors",
+    "model.layers.7.self_attn.a2": "model-00003-of-00011.safetensors",
+    "model.layers.7.self_attn.g1": "model-00003-of-00011.safetensors",
+    "model.layers.7.self_attn.g2": "model-00003-of-00011.safetensors",
+    "model.layers.7.self_attn.key.weight": "model-00003-of-00011.safetensors",
+    "model.layers.7.self_attn.output.weight": "model-00003-of-00011.safetensors",
+    "model.layers.7.self_attn.receptance.weight": "model-00003-of-00011.safetensors",
+    "model.layers.7.self_attn.value.weight": "model-00003-of-00011.safetensors",
+    "model.layers.7.self_attn.w0": "model-00003-of-00011.safetensors",
+    "model.layers.7.self_attn.w1": "model-00003-of-00011.safetensors",
+    "model.layers.7.self_attn.w2": "model-00003-of-00011.safetensors",
+    "model.layers.8.input_layernorm.weight": "model-00003-of-00011.safetensors",
+    "model.layers.8.mlp.down_proj.weight": "model-00003-of-00011.safetensors",
+    "model.layers.8.mlp.gate_proj.weight": "model-00003-of-00011.safetensors",
+    "model.layers.8.mlp.up_proj.weight": "model-00003-of-00011.safetensors",
+    "model.layers.8.post_attention_layernorm.weight": "model-00003-of-00011.safetensors",
+    "model.layers.8.self_attn.k_proj.weight": "model-00003-of-00011.safetensors",
+    "model.layers.8.self_attn.o_proj.weight": "model-00003-of-00011.safetensors",
+    "model.layers.8.self_attn.q_proj.weight": "model-00003-of-00011.safetensors",
+    "model.layers.8.self_attn.v_proj.weight": "model-00003-of-00011.safetensors",
+    "model.layers.9.input_layernorm.weight": "model-00003-of-00011.safetensors",
+    "model.layers.9.mlp.down_proj.weight": "model-00003-of-00011.safetensors",
+    "model.layers.9.mlp.gate_proj.weight": "model-00003-of-00011.safetensors",
+    "model.layers.9.mlp.up_proj.weight": "model-00003-of-00011.safetensors",
+    "model.layers.9.post_attention_layernorm.weight": "model-00003-of-00011.safetensors",
+    "model.layers.9.self_attn.a0": "model-00003-of-00011.safetensors",
+    "model.layers.9.self_attn.a1": "model-00003-of-00011.safetensors",
+    "model.layers.9.self_attn.a2": "model-00003-of-00011.safetensors",
+    "model.layers.9.self_attn.g1": "model-00003-of-00011.safetensors",
+    "model.layers.9.self_attn.g2": "model-00003-of-00011.safetensors",
+    "model.layers.9.self_attn.key.weight": "model-00003-of-00011.safetensors",
+    "model.layers.9.self_attn.output.weight": "model-00003-of-00011.safetensors",
+    "model.layers.9.self_attn.receptance.weight": "model-00003-of-00011.safetensors",
+    "model.layers.9.self_attn.value.weight": "model-00003-of-00011.safetensors",
+    "model.layers.9.self_attn.w0": "model-00003-of-00011.safetensors",
+    "model.layers.9.self_attn.w1": "model-00003-of-00011.safetensors",
+    "model.layers.9.self_attn.w2": "model-00003-of-00011.safetensors",
+    "model.layers.10.input_layernorm.weight": "model-00003-of-00011.safetensors",
+    "model.layers.10.mlp.down_proj.weight": "model-00003-of-00011.safetensors",
+    "model.layers.10.mlp.gate_proj.weight": "model-00003-of-00011.safetensors",
+    "model.layers.10.mlp.up_proj.weight": "model-00003-of-00011.safetensors",
+    "model.layers.10.post_attention_layernorm.weight": "model-00003-of-00011.safetensors",
+    "model.layers.10.self_attn.a0": "model-00003-of-00011.safetensors",
+    "model.layers.10.self_attn.a1": "model-00003-of-00011.safetensors",
+    "model.layers.10.self_attn.a2": "model-00003-of-00011.safetensors",
+    "model.layers.10.self_attn.g1": "model-00003-of-00011.safetensors",
+    "model.layers.10.self_attn.g2": "model-00003-of-00011.safetensors",
+    "model.layers.10.self_attn.key.weight": "model-00003-of-00011.safetensors",
+    "model.layers.10.self_attn.output.weight": "model-00003-of-00011.safetensors",
+    "model.layers.10.self_attn.receptance.weight": "model-00003-of-00011.safetensors",
+    "model.layers.10.self_attn.value.weight": "model-00003-of-00011.safetensors",
+    "model.layers.10.self_attn.w0": "model-00003-of-00011.safetensors",
+    "model.layers.10.self_attn.w1": "model-00003-of-00011.safetensors",
+    "model.layers.10.self_attn.w2": "model-00003-of-00011.safetensors",
+    "model.layers.11.input_layernorm.weight": "model-00003-of-00011.safetensors",
+    "model.layers.11.mlp.down_proj.weight": "model-00004-of-00011.safetensors",
+    "model.layers.11.mlp.gate_proj.weight": "model-00004-of-00011.safetensors",
+    "model.layers.11.mlp.up_proj.weight": "model-00004-of-00011.safetensors",
+    "model.layers.11.post_attention_layernorm.weight": "model-00004-of-00011.safetensors",
+    "model.layers.11.self_attn.a0": "model-00004-of-00011.safetensors",
+    "model.layers.11.self_attn.a1": "model-00004-of-00011.safetensors",
+    "model.layers.11.self_attn.a2": "model-00004-of-00011.safetensors",
+    "model.layers.11.self_attn.g1": "model-00004-of-00011.safetensors",
+    "model.layers.11.self_attn.g2": "model-00004-of-00011.safetensors",
+    "model.layers.11.self_attn.key.weight": "model-00004-of-00011.safetensors",
+    "model.layers.11.self_attn.output.weight": "model-00004-of-00011.safetensors",
+    "model.layers.11.self_attn.receptance.weight": "model-00004-of-00011.safetensors",
+    "model.layers.11.self_attn.value.weight": "model-00004-of-00011.safetensors",
+    "model.layers.11.self_attn.w0": "model-00004-of-00011.safetensors",
+    "model.layers.11.self_attn.w1": "model-00004-of-00011.safetensors",
+    "model.layers.11.self_attn.w2": "model-00004-of-00011.safetensors",
+    "model.layers.12.input_layernorm.weight": "model-00004-of-00011.safetensors",
+    "model.layers.12.mlp.down_proj.weight": "model-00004-of-00011.safetensors",
+    "model.layers.12.mlp.gate_proj.weight": "model-00004-of-00011.safetensors",
+    "model.layers.12.mlp.up_proj.weight": "model-00004-of-00011.safetensors",
+    "model.layers.12.post_attention_layernorm.weight": "model-00004-of-00011.safetensors",
+    "model.layers.12.self_attn.a0": "model-00004-of-00011.safetensors",
+    "model.layers.12.self_attn.a1": "model-00004-of-00011.safetensors",
+    "model.layers.12.self_attn.a2": "model-00004-of-00011.safetensors",
+    "model.layers.12.self_attn.g1": "model-00004-of-00011.safetensors",
+    "model.layers.12.self_attn.g2": "model-00004-of-00011.safetensors",
+    "model.layers.12.self_attn.key.weight": "model-00004-of-00011.safetensors",
+    "model.layers.12.self_attn.output.weight": "model-00004-of-00011.safetensors",
+    "model.layers.12.self_attn.receptance.weight": "model-00004-of-00011.safetensors",
+    "model.layers.12.self_attn.value.weight": "model-00004-of-00011.safetensors",
+    "model.layers.12.self_attn.w0": "model-00004-of-00011.safetensors",
+    "model.layers.12.self_attn.w1": "model-00004-of-00011.safetensors",
+    "model.layers.12.self_attn.w2": "model-00004-of-00011.safetensors",
+    "model.layers.13.input_layernorm.weight": "model-00004-of-00011.safetensors",
+    "model.layers.13.mlp.down_proj.weight": "model-00004-of-00011.safetensors",
+    "model.layers.13.mlp.gate_proj.weight": "model-00004-of-00011.safetensors",
+    "model.layers.13.mlp.up_proj.weight": "model-00004-of-00011.safetensors",
+    "model.layers.13.post_attention_layernorm.weight": "model-00004-of-00011.safetensors",
+    "model.layers.13.self_attn.a0": "model-00004-of-00011.safetensors",
+    "model.layers.13.self_attn.a1": "model-00004-of-00011.safetensors",
+    "model.layers.13.self_attn.a2": "model-00004-of-00011.safetensors",
+    "model.layers.13.self_attn.g1": "model-00004-of-00011.safetensors",
+    "model.layers.13.self_attn.g2": "model-00004-of-00011.safetensors",
+    "model.layers.13.self_attn.key.weight": "model-00004-of-00011.safetensors",
+    "model.layers.13.self_attn.output.weight": "model-00004-of-00011.safetensors",
+    "model.layers.13.self_attn.receptance.weight": "model-00004-of-00011.safetensors",
+    "model.layers.13.self_attn.value.weight": "model-00004-of-00011.safetensors",
+    "model.layers.13.self_attn.w0": "model-00004-of-00011.safetensors",
+    "model.layers.13.self_attn.w1": "model-00004-of-00011.safetensors",
+    "model.layers.13.self_attn.w2": "model-00004-of-00011.safetensors",
+    "model.layers.14.input_layernorm.weight": "model-00004-of-00011.safetensors",
+    "model.layers.14.mlp.down_proj.weight": "model-00004-of-00011.safetensors",
+    "model.layers.14.mlp.gate_proj.weight": "model-00004-of-00011.safetensors",
+    "model.layers.14.mlp.up_proj.weight": "model-00004-of-00011.safetensors",
+    "model.layers.14.post_attention_layernorm.weight": "model-00004-of-00011.safetensors",
+    "model.layers.14.self_attn.k_proj.weight": "model-00004-of-00011.safetensors",
+    "model.layers.14.self_attn.o_proj.weight": "model-00004-of-00011.safetensors",
+    "model.layers.14.self_attn.q_proj.weight": "model-00004-of-00011.safetensors",
+    "model.layers.14.self_attn.v_proj.weight": "model-00004-of-00011.safetensors",
+    "model.layers.15.input_layernorm.weight": "model-00004-of-00011.safetensors",
+    "model.layers.15.mlp.down_proj.weight": "model-00004-of-00011.safetensors",
+    "model.layers.15.mlp.gate_proj.weight": "model-00004-of-00011.safetensors",
+    "model.layers.15.mlp.up_proj.weight": "model-00005-of-00011.safetensors",
+    "model.layers.15.post_attention_layernorm.weight": "model-00005-of-00011.safetensors",
+    "model.layers.15.self_attn.a0": "model-00005-of-00011.safetensors",
+    "model.layers.15.self_attn.a1": "model-00005-of-00011.safetensors",
+    "model.layers.15.self_attn.a2": "model-00005-of-00011.safetensors",
+    "model.layers.15.self_attn.g1": "model-00005-of-00011.safetensors",
+    "model.layers.15.self_attn.g2": "model-00005-of-00011.safetensors",
+    "model.layers.15.self_attn.key.weight": "model-00005-of-00011.safetensors",
+    "model.layers.15.self_attn.output.weight": "model-00005-of-00011.safetensors",
+    "model.layers.15.self_attn.receptance.weight": "model-00005-of-00011.safetensors",
+    "model.layers.15.self_attn.value.weight": "model-00005-of-00011.safetensors",
+    "model.layers.15.self_attn.w0": "model-00005-of-00011.safetensors",
+    "model.layers.15.self_attn.w1": "model-00005-of-00011.safetensors",
+    "model.layers.15.self_attn.w2": "model-00005-of-00011.safetensors",
+    "model.layers.16.input_layernorm.weight": "model-00005-of-00011.safetensors",
+    "model.layers.16.mlp.down_proj.weight": "model-00005-of-00011.safetensors",
+    "model.layers.16.mlp.gate_proj.weight": "model-00005-of-00011.safetensors",
+    "model.layers.16.mlp.up_proj.weight": "model-00005-of-00011.safetensors",
+    "model.layers.16.post_attention_layernorm.weight": "model-00005-of-00011.safetensors",
+    "model.layers.16.self_attn.a0": "model-00005-of-00011.safetensors",
+    "model.layers.16.self_attn.a1": "model-00005-of-00011.safetensors",
+    "model.layers.16.self_attn.a2": "model-00005-of-00011.safetensors",
+    "model.layers.16.self_attn.g1": "model-00005-of-00011.safetensors",
+    "model.layers.16.self_attn.g2": "model-00005-of-00011.safetensors",
+    "model.layers.16.self_attn.key.weight": "model-00005-of-00011.safetensors",
+    "model.layers.16.self_attn.output.weight": "model-00005-of-00011.safetensors",
+    "model.layers.16.self_attn.receptance.weight": "model-00005-of-00011.safetensors",
+    "model.layers.16.self_attn.value.weight": "model-00005-of-00011.safetensors",
+    "model.layers.16.self_attn.w0": "model-00005-of-00011.safetensors",
+    "model.layers.16.self_attn.w1": "model-00005-of-00011.safetensors",
+    "model.layers.16.self_attn.w2": "model-00005-of-00011.safetensors",
+    "model.layers.17.input_layernorm.weight": "model-00005-of-00011.safetensors",
+    "model.layers.17.mlp.down_proj.weight": "model-00005-of-00011.safetensors",
+    "model.layers.17.mlp.gate_proj.weight": "model-00005-of-00011.safetensors",
+    "model.layers.17.mlp.up_proj.weight": "model-00005-of-00011.safetensors",
+    "model.layers.17.post_attention_layernorm.weight": "model-00005-of-00011.safetensors",
+    "model.layers.17.self_attn.a0": "model-00005-of-00011.safetensors",
+    "model.layers.17.self_attn.a1": "model-00005-of-00011.safetensors",
+    "model.layers.17.self_attn.a2": "model-00005-of-00011.safetensors",
+    "model.layers.17.self_attn.g1": "model-00005-of-00011.safetensors",
+    "model.layers.17.self_attn.g2": "model-00005-of-00011.safetensors",
+    "model.layers.17.self_attn.key.weight": "model-00005-of-00011.safetensors",
+    "model.layers.17.self_attn.output.weight": "model-00005-of-00011.safetensors",
+    "model.layers.17.self_attn.receptance.weight": "model-00005-of-00011.safetensors",
+    "model.layers.17.self_attn.value.weight": "model-00005-of-00011.safetensors",
+    "model.layers.17.self_attn.w0": "model-00005-of-00011.safetensors",
+    "model.layers.17.self_attn.w1": "model-00005-of-00011.safetensors",
+    "model.layers.17.self_attn.w2": "model-00005-of-00011.safetensors",
+    "model.layers.18.input_layernorm.weight": "model-00005-of-00011.safetensors",
+    "model.layers.18.mlp.down_proj.weight": "model-00005-of-00011.safetensors",
+    "model.layers.18.mlp.gate_proj.weight": "model-00005-of-00011.safetensors",
+    "model.layers.18.mlp.up_proj.weight": "model-00005-of-00011.safetensors",
+    "model.layers.18.post_attention_layernorm.weight": "model-00005-of-00011.safetensors",
+    "model.layers.18.self_attn.a0": "model-00005-of-00011.safetensors",
+    "model.layers.18.self_attn.a1": "model-00005-of-00011.safetensors",
+    "model.layers.18.self_attn.a2": "model-00005-of-00011.safetensors",
+    "model.layers.18.self_attn.g1": "model-00005-of-00011.safetensors",
+    "model.layers.18.self_attn.g2": "model-00005-of-00011.safetensors",
+    "model.layers.18.self_attn.key.weight": "model-00005-of-00011.safetensors",
+    "model.layers.18.self_attn.output.weight": "model-00005-of-00011.safetensors",
+    "model.layers.18.self_attn.receptance.weight": "model-00005-of-00011.safetensors",
+    "model.layers.18.self_attn.value.weight": "model-00005-of-00011.safetensors",
+    "model.layers.18.self_attn.w0": "model-00005-of-00011.safetensors",
+    "model.layers.18.self_attn.w1": "model-00005-of-00011.safetensors",
+    "model.layers.18.self_attn.w2": "model-00005-of-00011.safetensors",
+    "model.layers.19.input_layernorm.weight": "model-00005-of-00011.safetensors",
+    "model.layers.19.mlp.down_proj.weight": "model-00005-of-00011.safetensors",
+    "model.layers.19.mlp.gate_proj.weight": "model-00005-of-00011.safetensors",
+    "model.layers.19.mlp.up_proj.weight": "model-00005-of-00011.safetensors",
+    "model.layers.19.post_attention_layernorm.weight": "model-00005-of-00011.safetensors",
+    "model.layers.19.self_attn.a0": "model-00005-of-00011.safetensors",
+    "model.layers.19.self_attn.a1": "model-00005-of-00011.safetensors",
+    "model.layers.19.self_attn.a2": "model-00005-of-00011.safetensors",
+    "model.layers.19.self_attn.g1": "model-00005-of-00011.safetensors",
+    "model.layers.19.self_attn.g2": "model-00005-of-00011.safetensors",
+    "model.layers.19.self_attn.key.weight": "model-00005-of-00011.safetensors",
+    "model.layers.19.self_attn.output.weight": "model-00005-of-00011.safetensors",
+    "model.layers.19.self_attn.receptance.weight": "model-00005-of-00011.safetensors",
+    "model.layers.19.self_attn.value.weight": "model-00005-of-00011.safetensors",
+    "model.layers.19.self_attn.w0": "model-00005-of-00011.safetensors",
+    "model.layers.19.self_attn.w1": "model-00005-of-00011.safetensors",
+    "model.layers.19.self_attn.w2": "model-00005-of-00011.safetensors",
+    "model.layers.20.input_layernorm.weight": "model-00005-of-00011.safetensors",
+    "model.layers.20.mlp.down_proj.weight": "model-00006-of-00011.safetensors",
+    "model.layers.20.mlp.gate_proj.weight": "model-00006-of-00011.safetensors",
+    "model.layers.20.mlp.up_proj.weight": "model-00006-of-00011.safetensors",
+    "model.layers.20.post_attention_layernorm.weight": "model-00006-of-00011.safetensors",
+    "model.layers.20.self_attn.k_proj.weight": "model-00006-of-00011.safetensors",
+    "model.layers.20.self_attn.o_proj.weight": "model-00006-of-00011.safetensors",
+    "model.layers.20.self_attn.q_proj.weight": "model-00006-of-00011.safetensors",
+    "model.layers.20.self_attn.v_proj.weight": "model-00006-of-00011.safetensors",
+    "model.layers.21.input_layernorm.weight": "model-00006-of-00011.safetensors",
+    "model.layers.21.mlp.down_proj.weight": "model-00006-of-00011.safetensors",
+    "model.layers.21.mlp.gate_proj.weight": "model-00006-of-00011.safetensors",
+    "model.layers.21.mlp.up_proj.weight": "model-00006-of-00011.safetensors",
+    "model.layers.21.post_attention_layernorm.weight": "model-00006-of-00011.safetensors",
+    "model.layers.21.self_attn.a0": "model-00006-of-00011.safetensors",
+    "model.layers.21.self_attn.a1": "model-00006-of-00011.safetensors",
+    "model.layers.21.self_attn.a2": "model-00006-of-00011.safetensors",
+    "model.layers.21.self_attn.g1": "model-00006-of-00011.safetensors",
+    "model.layers.21.self_attn.g2": "model-00006-of-00011.safetensors",
+    "model.layers.21.self_attn.key.weight": "model-00006-of-00011.safetensors",
+    "model.layers.21.self_attn.output.weight": "model-00006-of-00011.safetensors",
+    "model.layers.21.self_attn.receptance.weight": "model-00006-of-00011.safetensors",
+    "model.layers.21.self_attn.value.weight": "model-00006-of-00011.safetensors",
+    "model.layers.21.self_attn.w0": "model-00006-of-00011.safetensors",
+    "model.layers.21.self_attn.w1": "model-00006-of-00011.safetensors",
+    "model.layers.21.self_attn.w2": "model-00006-of-00011.safetensors",
+    "model.layers.22.input_layernorm.weight": "model-00006-of-00011.safetensors",
+    "model.layers.22.mlp.down_proj.weight": "model-00006-of-00011.safetensors",
+    "model.layers.22.mlp.gate_proj.weight": "model-00006-of-00011.safetensors",
+    "model.layers.22.mlp.up_proj.weight": "model-00006-of-00011.safetensors",
+    "model.layers.22.post_attention_layernorm.weight": "model-00006-of-00011.safetensors",
+    "model.layers.22.self_attn.a0": "model-00006-of-00011.safetensors",
+    "model.layers.22.self_attn.a1": "model-00006-of-00011.safetensors",
+    "model.layers.22.self_attn.a2": "model-00006-of-00011.safetensors",
+    "model.layers.22.self_attn.g1": "model-00006-of-00011.safetensors",
+    "model.layers.22.self_attn.g2": "model-00006-of-00011.safetensors",
+    "model.layers.22.self_attn.key.weight": "model-00006-of-00011.safetensors",
+    "model.layers.22.self_attn.output.weight": "model-00006-of-00011.safetensors",
+    "model.layers.22.self_attn.receptance.weight": "model-00006-of-00011.safetensors",
+    "model.layers.22.self_attn.value.weight": "model-00006-of-00011.safetensors",
+    "model.layers.22.self_attn.w0": "model-00006-of-00011.safetensors",
+    "model.layers.22.self_attn.w1": "model-00006-of-00011.safetensors",
+    "model.layers.22.self_attn.w2": "model-00006-of-00011.safetensors",
+    "model.layers.23.input_layernorm.weight": "model-00006-of-00011.safetensors",
+    "model.layers.23.mlp.down_proj.weight": "model-00006-of-00011.safetensors",
+    "model.layers.23.mlp.gate_proj.weight": "model-00006-of-00011.safetensors",
+    "model.layers.23.mlp.up_proj.weight": "model-00006-of-00011.safetensors",
+    "model.layers.23.post_attention_layernorm.weight": "model-00006-of-00011.safetensors",
+    "model.layers.23.self_attn.a0": "model-00006-of-00011.safetensors",
+    "model.layers.23.self_attn.a1": "model-00006-of-00011.safetensors",
+    "model.layers.23.self_attn.a2": "model-00006-of-00011.safetensors",
+    "model.layers.23.self_attn.g1": "model-00006-of-00011.safetensors",
+    "model.layers.23.self_attn.g2": "model-00006-of-00011.safetensors",
+    "model.layers.23.self_attn.key.weight": "model-00006-of-00011.safetensors",
+    "model.layers.23.self_attn.output.weight": "model-00006-of-00011.safetensors",
+    "model.layers.23.self_attn.receptance.weight": "model-00006-of-00011.safetensors",
+    "model.layers.23.self_attn.value.weight": "model-00006-of-00011.safetensors",
+    "model.layers.23.self_attn.w0": "model-00006-of-00011.safetensors",
+    "model.layers.23.self_attn.w1": "model-00006-of-00011.safetensors",
+    "model.layers.23.self_attn.w2": "model-00006-of-00011.safetensors",
+    "model.layers.24.input_layernorm.weight": "model-00006-of-00011.safetensors",
+    "model.layers.24.mlp.down_proj.weight": "model-00006-of-00011.safetensors",
+    "model.layers.24.mlp.gate_proj.weight": "model-00006-of-00011.safetensors",
+    "model.layers.24.mlp.up_proj.weight": "model-00007-of-00011.safetensors",
+    "model.layers.24.post_attention_layernorm.weight": "model-00007-of-00011.safetensors",
+    "model.layers.24.self_attn.a0": "model-00007-of-00011.safetensors",
+    "model.layers.24.self_attn.a1": "model-00007-of-00011.safetensors",
+    "model.layers.24.self_attn.a2": "model-00007-of-00011.safetensors",
+    "model.layers.24.self_attn.g1": "model-00007-of-00011.safetensors",
+    "model.layers.24.self_attn.g2": "model-00007-of-00011.safetensors",
+    "model.layers.24.self_attn.key.weight": "model-00007-of-00011.safetensors",
+    "model.layers.24.self_attn.output.weight": "model-00007-of-00011.safetensors",
+    "model.layers.24.self_attn.receptance.weight": "model-00007-of-00011.safetensors",
+    "model.layers.24.self_attn.value.weight": "model-00007-of-00011.safetensors",
+    "model.layers.24.self_attn.w0": "model-00007-of-00011.safetensors",
+    "model.layers.24.self_attn.w1": "model-00007-of-00011.safetensors",
+    "model.layers.24.self_attn.w2": "model-00007-of-00011.safetensors",
+    "model.layers.25.input_layernorm.weight": "model-00007-of-00011.safetensors",
+    "model.layers.25.mlp.down_proj.weight": "model-00007-of-00011.safetensors",
+    "model.layers.25.mlp.gate_proj.weight": "model-00007-of-00011.safetensors",
+    "model.layers.25.mlp.up_proj.weight": "model-00007-of-00011.safetensors",
+    "model.layers.25.post_attention_layernorm.weight": "model-00007-of-00011.safetensors",
+    "model.layers.25.self_attn.k_proj.weight": "model-00007-of-00011.safetensors",
+    "model.layers.25.self_attn.o_proj.weight": "model-00007-of-00011.safetensors",
+    "model.layers.25.self_attn.q_proj.weight": "model-00007-of-00011.safetensors",
+    "model.layers.25.self_attn.v_proj.weight": "model-00007-of-00011.safetensors",
+    "model.layers.26.input_layernorm.weight": "model-00007-of-00011.safetensors",
+    "model.layers.26.mlp.down_proj.weight": "model-00007-of-00011.safetensors",
+    "model.layers.26.mlp.gate_proj.weight": "model-00007-of-00011.safetensors",
+    "model.layers.26.mlp.up_proj.weight": "model-00007-of-00011.safetensors",
+    "model.layers.26.post_attention_layernorm.weight": "model-00007-of-00011.safetensors",
+    "model.layers.26.self_attn.a0": "model-00007-of-00011.safetensors",
+    "model.layers.26.self_attn.a1": "model-00007-of-00011.safetensors",
+    "model.layers.26.self_attn.a2": "model-00007-of-00011.safetensors",
+    "model.layers.26.self_attn.g1": "model-00007-of-00011.safetensors",
+    "model.layers.26.self_attn.g2": "model-00007-of-00011.safetensors",
+    "model.layers.26.self_attn.key.weight": "model-00007-of-00011.safetensors",
+    "model.layers.26.self_attn.output.weight": "model-00007-of-00011.safetensors",
+    "model.layers.26.self_attn.receptance.weight": "model-00007-of-00011.safetensors",
+    "model.layers.26.self_attn.value.weight": "model-00007-of-00011.safetensors",
+    "model.layers.26.self_attn.w0": "model-00007-of-00011.safetensors",
+    "model.layers.26.self_attn.w1": "model-00007-of-00011.safetensors",
+    "model.layers.26.self_attn.w2": "model-00007-of-00011.safetensors",
+    "model.layers.27.input_layernorm.weight": "model-00007-of-00011.safetensors",
+    "model.layers.27.mlp.down_proj.weight": "model-00007-of-00011.safetensors",
+    "model.layers.27.mlp.gate_proj.weight": "model-00007-of-00011.safetensors",
+    "model.layers.27.mlp.up_proj.weight": "model-00007-of-00011.safetensors",
+    "model.layers.27.post_attention_layernorm.weight": "model-00007-of-00011.safetensors",
+    "model.layers.27.self_attn.a0": "model-00007-of-00011.safetensors",
+    "model.layers.27.self_attn.a1": "model-00007-of-00011.safetensors",
+    "model.layers.27.self_attn.a2": "model-00007-of-00011.safetensors",
+    "model.layers.27.self_attn.g1": "model-00007-of-00011.safetensors",
+    "model.layers.27.self_attn.g2": "model-00007-of-00011.safetensors",
+    "model.layers.27.self_attn.key.weight": "model-00007-of-00011.safetensors",
+    "model.layers.27.self_attn.output.weight": "model-00007-of-00011.safetensors",
+    "model.layers.27.self_attn.receptance.weight": "model-00007-of-00011.safetensors",
+    "model.layers.27.self_attn.value.weight": "model-00007-of-00011.safetensors",
+    "model.layers.27.self_attn.w0": "model-00007-of-00011.safetensors",
+    "model.layers.27.self_attn.w1": "model-00007-of-00011.safetensors",
+    "model.layers.27.self_attn.w2": "model-00007-of-00011.safetensors",
+    "model.layers.28.input_layernorm.weight": "model-00007-of-00011.safetensors",
+    "model.layers.28.mlp.down_proj.weight": "model-00007-of-00011.safetensors",
+    "model.layers.28.mlp.gate_proj.weight": "model-00007-of-00011.safetensors",
+    "model.layers.28.mlp.up_proj.weight": "model-00007-of-00011.safetensors",
+    "model.layers.28.post_attention_layernorm.weight": "model-00007-of-00011.safetensors",
+    "model.layers.28.self_attn.a0": "model-00007-of-00011.safetensors",
+    "model.layers.28.self_attn.a1": "model-00007-of-00011.safetensors",
+    "model.layers.28.self_attn.a2": "model-00007-of-00011.safetensors",
+    "model.layers.28.self_attn.g1": "model-00007-of-00011.safetensors",
+    "model.layers.28.self_attn.g2": "model-00007-of-00011.safetensors",
+    "model.layers.28.self_attn.key.weight": "model-00007-of-00011.safetensors",
+    "model.layers.28.self_attn.output.weight": "model-00007-of-00011.safetensors",
+    "model.layers.28.self_attn.receptance.weight": "model-00007-of-00011.safetensors",
+    "model.layers.28.self_attn.value.weight": "model-00007-of-00011.safetensors",
+    "model.layers.28.self_attn.w0": "model-00007-of-00011.safetensors",
+    "model.layers.28.self_attn.w1": "model-00007-of-00011.safetensors",
+    "model.layers.28.self_attn.w2": "model-00007-of-00011.safetensors",
+    "model.layers.29.input_layernorm.weight": "model-00007-of-00011.safetensors",
+    "model.layers.29.mlp.down_proj.weight": "model-00008-of-00011.safetensors",
+    "model.layers.29.mlp.gate_proj.weight": "model-00008-of-00011.safetensors",
+    "model.layers.29.mlp.up_proj.weight": "model-00008-of-00011.safetensors",
+    "model.layers.29.post_attention_layernorm.weight": "model-00008-of-00011.safetensors",
+    "model.layers.29.self_attn.a0": "model-00008-of-00011.safetensors",
+    "model.layers.29.self_attn.a1": "model-00008-of-00011.safetensors",
+    "model.layers.29.self_attn.a2": "model-00008-of-00011.safetensors",
+    "model.layers.29.self_attn.g1": "model-00008-of-00011.safetensors",
+    "model.layers.29.self_attn.g2": "model-00008-of-00011.safetensors",
+    "model.layers.29.self_attn.key.weight": "model-00008-of-00011.safetensors",
+    "model.layers.29.self_attn.output.weight": "model-00008-of-00011.safetensors",
+    "model.layers.29.self_attn.receptance.weight": "model-00008-of-00011.safetensors",
+    "model.layers.29.self_attn.value.weight": "model-00008-of-00011.safetensors",
+    "model.layers.29.self_attn.w0": "model-00008-of-00011.safetensors",
+    "model.layers.29.self_attn.w1": "model-00008-of-00011.safetensors",
+    "model.layers.29.self_attn.w2": "model-00008-of-00011.safetensors",
+    "model.layers.30.input_layernorm.weight": "model-00008-of-00011.safetensors",
+    "model.layers.30.mlp.down_proj.weight": "model-00008-of-00011.safetensors",
+    "model.layers.30.mlp.gate_proj.weight": "model-00008-of-00011.safetensors",
+    "model.layers.30.mlp.up_proj.weight": "model-00008-of-00011.safetensors",
+    "model.layers.30.post_attention_layernorm.weight": "model-00008-of-00011.safetensors",
+    "model.layers.30.self_attn.k_proj.weight": "model-00008-of-00011.safetensors",
+    "model.layers.30.self_attn.o_proj.weight": "model-00008-of-00011.safetensors",
+    "model.layers.30.self_attn.q_proj.weight": "model-00008-of-00011.safetensors",
+    "model.layers.30.self_attn.v_proj.weight": "model-00008-of-00011.safetensors",
+    "model.layers.31.input_layernorm.weight": "model-00008-of-00011.safetensors",
+    "model.layers.31.mlp.down_proj.weight": "model-00008-of-00011.safetensors",
+    "model.layers.31.mlp.gate_proj.weight": "model-00008-of-00011.safetensors",
+    "model.layers.31.mlp.up_proj.weight": "model-00008-of-00011.safetensors",
+    "model.layers.31.post_attention_layernorm.weight": "model-00008-of-00011.safetensors",
+    "model.layers.31.self_attn.a0": "model-00008-of-00011.safetensors",
+    "model.layers.31.self_attn.a1": "model-00008-of-00011.safetensors",
+    "model.layers.31.self_attn.a2": "model-00008-of-00011.safetensors",
+    "model.layers.31.self_attn.g1": "model-00008-of-00011.safetensors",
+    "model.layers.31.self_attn.g2": "model-00008-of-00011.safetensors",
+    "model.layers.31.self_attn.key.weight": "model-00008-of-00011.safetensors",
+    "model.layers.31.self_attn.output.weight": "model-00008-of-00011.safetensors",
+    "model.layers.31.self_attn.receptance.weight": "model-00008-of-00011.safetensors",
+    "model.layers.31.self_attn.value.weight": "model-00008-of-00011.safetensors",
+    "model.layers.31.self_attn.w0": "model-00008-of-00011.safetensors",
+    "model.layers.31.self_attn.w1": "model-00008-of-00011.safetensors",
+    "model.layers.31.self_attn.w2": "model-00008-of-00011.safetensors",
+    "model.layers.32.input_layernorm.weight": "model-00008-of-00011.safetensors",
+    "model.layers.32.mlp.down_proj.weight": "model-00008-of-00011.safetensors",
+    "model.layers.32.mlp.gate_proj.weight": "model-00008-of-00011.safetensors",
+    "model.layers.32.mlp.up_proj.weight": "model-00008-of-00011.safetensors",
+    "model.layers.32.post_attention_layernorm.weight": "model-00008-of-00011.safetensors",
+    "model.layers.32.self_attn.a0": "model-00008-of-00011.safetensors",
+    "model.layers.32.self_attn.a1": "model-00008-of-00011.safetensors",
+    "model.layers.32.self_attn.a2": "model-00008-of-00011.safetensors",
+    "model.layers.32.self_attn.g1": "model-00008-of-00011.safetensors",
+    "model.layers.32.self_attn.g2": "model-00008-of-00011.safetensors",
+    "model.layers.32.self_attn.key.weight": "model-00008-of-00011.safetensors",
+    "model.layers.32.self_attn.output.weight": "model-00008-of-00011.safetensors",
+    "model.layers.32.self_attn.receptance.weight": "model-00008-of-00011.safetensors",
+    "model.layers.32.self_attn.value.weight": "model-00008-of-00011.safetensors",
+    "model.layers.32.self_attn.w0": "model-00008-of-00011.safetensors",
+    "model.layers.32.self_attn.w1": "model-00008-of-00011.safetensors",
+    "model.layers.32.self_attn.w2": "model-00008-of-00011.safetensors",
+    "model.layers.33.input_layernorm.weight": "model-00008-of-00011.safetensors",
+    "model.layers.33.mlp.down_proj.weight": "model-00008-of-00011.safetensors",
+    "model.layers.33.mlp.gate_proj.weight": "model-00008-of-00011.safetensors",
+    "model.layers.33.mlp.up_proj.weight": "model-00009-of-00011.safetensors",
+    "model.layers.33.post_attention_layernorm.weight": "model-00009-of-00011.safetensors",
+    "model.layers.33.self_attn.a0": "model-00009-of-00011.safetensors",
+    "model.layers.33.self_attn.a1": "model-00009-of-00011.safetensors",
+    "model.layers.33.self_attn.a2": "model-00009-of-00011.safetensors",
+    "model.layers.33.self_attn.g1": "model-00009-of-00011.safetensors",
+    "model.layers.33.self_attn.g2": "model-00009-of-00011.safetensors",
+    "model.layers.33.self_attn.key.weight": "model-00009-of-00011.safetensors",
+    "model.layers.33.self_attn.output.weight": "model-00009-of-00011.safetensors",
+    "model.layers.33.self_attn.receptance.weight": "model-00009-of-00011.safetensors",
+    "model.layers.33.self_attn.value.weight": "model-00009-of-00011.safetensors",
+    "model.layers.33.self_attn.w0": "model-00009-of-00011.safetensors",
+    "model.layers.33.self_attn.w1": "model-00009-of-00011.safetensors",
+    "model.layers.33.self_attn.w2": "model-00009-of-00011.safetensors",
+    "model.layers.34.input_layernorm.weight": "model-00009-of-00011.safetensors",
+    "model.layers.34.mlp.down_proj.weight": "model-00009-of-00011.safetensors",
+    "model.layers.34.mlp.gate_proj.weight": "model-00009-of-00011.safetensors",
+    "model.layers.34.mlp.up_proj.weight": "model-00009-of-00011.safetensors",
+    "model.layers.34.post_attention_layernorm.weight": "model-00009-of-00011.safetensors",
+    "model.layers.34.self_attn.a0": "model-00009-of-00011.safetensors",
+    "model.layers.34.self_attn.a1": "model-00009-of-00011.safetensors",
+    "model.layers.34.self_attn.a2": "model-00009-of-00011.safetensors",
+    "model.layers.34.self_attn.g1": "model-00009-of-00011.safetensors",
+    "model.layers.34.self_attn.g2": "model-00009-of-00011.safetensors",
+    "model.layers.34.self_attn.key.weight": "model-00009-of-00011.safetensors",
+    "model.layers.34.self_attn.output.weight": "model-00009-of-00011.safetensors",
+    "model.layers.34.self_attn.receptance.weight": "model-00009-of-00011.safetensors",
+    "model.layers.34.self_attn.value.weight": "model-00009-of-00011.safetensors",
+    "model.layers.34.self_attn.w0": "model-00009-of-00011.safetensors",
+    "model.layers.34.self_attn.w1": "model-00009-of-00011.safetensors",
+    "model.layers.34.self_attn.w2": "model-00009-of-00011.safetensors",
+    "model.layers.35.input_layernorm.weight": "model-00009-of-00011.safetensors",
+    "model.layers.35.mlp.down_proj.weight": "model-00009-of-00011.safetensors",
+    "model.layers.35.mlp.gate_proj.weight": "model-00009-of-00011.safetensors",
+    "model.layers.35.mlp.up_proj.weight": "model-00009-of-00011.safetensors",
+    "model.layers.35.post_attention_layernorm.weight": "model-00009-of-00011.safetensors",
+    "model.layers.35.self_attn.k_proj.weight": "model-00009-of-00011.safetensors",
+    "model.layers.35.self_attn.o_proj.weight": "model-00009-of-00011.safetensors",
+    "model.layers.35.self_attn.q_proj.weight": "model-00009-of-00011.safetensors",
+    "model.layers.35.self_attn.v_proj.weight": "model-00009-of-00011.safetensors",
+    "model.layers.36.input_layernorm.weight": "model-00009-of-00011.safetensors",
+    "model.layers.36.mlp.down_proj.weight": "model-00009-of-00011.safetensors",
+    "model.layers.36.mlp.gate_proj.weight": "model-00009-of-00011.safetensors",
+    "model.layers.36.mlp.up_proj.weight": "model-00009-of-00011.safetensors",
+    "model.layers.36.post_attention_layernorm.weight": "model-00009-of-00011.safetensors",
+    "model.layers.36.self_attn.a0": "model-00009-of-00011.safetensors",
+    "model.layers.36.self_attn.a1": "model-00009-of-00011.safetensors",
+    "model.layers.36.self_attn.a2": "model-00009-of-00011.safetensors",
+    "model.layers.36.self_attn.g1": "model-00009-of-00011.safetensors",
+    "model.layers.36.self_attn.g2": "model-00009-of-00011.safetensors",
+    "model.layers.36.self_attn.key.weight": "model-00009-of-00011.safetensors",
+    "model.layers.36.self_attn.output.weight": "model-00009-of-00011.safetensors",
+    "model.layers.36.self_attn.receptance.weight": "model-00009-of-00011.safetensors",
+    "model.layers.36.self_attn.value.weight": "model-00009-of-00011.safetensors",
+    "model.layers.36.self_attn.w0": "model-00009-of-00011.safetensors",
+    "model.layers.36.self_attn.w1": "model-00009-of-00011.safetensors",
+    "model.layers.36.self_attn.w2": "model-00009-of-00011.safetensors",
+    "model.layers.37.input_layernorm.weight": "model-00009-of-00011.safetensors",
+    "model.layers.37.mlp.down_proj.weight": "model-00009-of-00011.safetensors",
+    "model.layers.37.mlp.gate_proj.weight": "model-00009-of-00011.safetensors",
+    "model.layers.37.mlp.up_proj.weight": "model-00009-of-00011.safetensors",
+    "model.layers.37.post_attention_layernorm.weight": "model-00009-of-00011.safetensors",
+    "model.layers.37.self_attn.a0": "model-00009-of-00011.safetensors",
+    "model.layers.37.self_attn.a1": "model-00009-of-00011.safetensors",
+    "model.layers.37.self_attn.a2": "model-00009-of-00011.safetensors",
+    "model.layers.37.self_attn.g1": "model-00009-of-00011.safetensors",
+    "model.layers.37.self_attn.g2": "model-00009-of-00011.safetensors",
+    "model.layers.37.self_attn.key.weight": "model-00009-of-00011.safetensors",
+    "model.layers.37.self_attn.output.weight": "model-00009-of-00011.safetensors",
+    "model.layers.37.self_attn.receptance.weight": "model-00009-of-00011.safetensors",
+    "model.layers.37.self_attn.value.weight": "model-00009-of-00011.safetensors",
+    "model.layers.37.self_attn.w0": "model-00009-of-00011.safetensors",
+    "model.layers.37.self_attn.w1": "model-00009-of-00011.safetensors",
+    "model.layers.37.self_attn.w2": "model-00009-of-00011.safetensors",
+    "model.layers.38.input_layernorm.weight": "model-00009-of-00011.safetensors",
+    "model.layers.38.mlp.down_proj.weight": "model-00010-of-00011.safetensors",
+    "model.layers.38.mlp.gate_proj.weight": "model-00010-of-00011.safetensors",
+    "model.layers.38.mlp.up_proj.weight": "model-00010-of-00011.safetensors",
+    "model.layers.38.post_attention_layernorm.weight": "model-00010-of-00011.safetensors",
+    "model.layers.38.self_attn.a0": "model-00010-of-00011.safetensors",
+    "model.layers.38.self_attn.a1": "model-00010-of-00011.safetensors",
+    "model.layers.38.self_attn.a2": "model-00010-of-00011.safetensors",
+    "model.layers.38.self_attn.g1": "model-00010-of-00011.safetensors",
+    "model.layers.38.self_attn.g2": "model-00010-of-00011.safetensors",
+    "model.layers.38.self_attn.key.weight": "model-00010-of-00011.safetensors",
+    "model.layers.38.self_attn.output.weight": "model-00010-of-00011.safetensors",
+    "model.layers.38.self_attn.receptance.weight": "model-00010-of-00011.safetensors",
+    "model.layers.38.self_attn.value.weight": "model-00010-of-00011.safetensors",
+    "model.layers.38.self_attn.w0": "model-00010-of-00011.safetensors",
+    "model.layers.38.self_attn.w1": "model-00010-of-00011.safetensors",
+    "model.layers.38.self_attn.w2": "model-00010-of-00011.safetensors",
+    "model.layers.39.input_layernorm.weight": "model-00010-of-00011.safetensors",
+    "model.layers.39.mlp.down_proj.weight": "model-00010-of-00011.safetensors",
+    "model.layers.39.mlp.gate_proj.weight": "model-00010-of-00011.safetensors",
+    "model.layers.39.mlp.up_proj.weight": "model-00010-of-00011.safetensors",
+    "model.layers.39.post_attention_layernorm.weight": "model-00010-of-00011.safetensors",
+    "model.layers.39.self_attn.k_proj.weight": "model-00010-of-00011.safetensors",
+    "model.layers.39.self_attn.o_proj.weight": "model-00010-of-00011.safetensors",
+    "model.layers.39.self_attn.q_proj.weight": "model-00010-of-00011.safetensors",
+    "model.layers.39.self_attn.v_proj.weight": "model-00010-of-00011.safetensors",
+    "model.layers.40.input_layernorm.weight": "model-00010-of-00011.safetensors",
+    "model.layers.40.mlp.down_proj.weight": "model-00010-of-00011.safetensors",
+    "model.layers.40.mlp.gate_proj.weight": "model-00010-of-00011.safetensors",
+    "model.layers.40.mlp.up_proj.weight": "model-00010-of-00011.safetensors",
+    "model.layers.40.post_attention_layernorm.weight": "model-00010-of-00011.safetensors",
+    "model.layers.40.self_attn.a0": "model-00010-of-00011.safetensors",
+    "model.layers.40.self_attn.a1": "model-00010-of-00011.safetensors",
+    "model.layers.40.self_attn.a2": "model-00010-of-00011.safetensors",
+    "model.layers.40.self_attn.g1": "model-00010-of-00011.safetensors",
+    "model.layers.40.self_attn.g2": "model-00010-of-00011.safetensors",
+    "model.layers.40.self_attn.key.weight": "model-00010-of-00011.safetensors",
+    "model.layers.40.self_attn.output.weight": "model-00010-of-00011.safetensors",
+    "model.layers.40.self_attn.receptance.weight": "model-00010-of-00011.safetensors",
+    "model.layers.40.self_attn.value.weight": "model-00010-of-00011.safetensors",
+    "model.layers.40.self_attn.w0": "model-00010-of-00011.safetensors",
+    "model.layers.40.self_attn.w1": "model-00010-of-00011.safetensors",
+    "model.layers.40.self_attn.w2": "model-00010-of-00011.safetensors",
+    "model.layers.41.input_layernorm.weight": "model-00010-of-00011.safetensors",
+    "model.layers.41.mlp.down_proj.weight": "model-00010-of-00011.safetensors",
+    "model.layers.41.mlp.gate_proj.weight": "model-00010-of-00011.safetensors",
+    "model.layers.41.mlp.up_proj.weight": "model-00010-of-00011.safetensors",
+    "model.layers.41.post_attention_layernorm.weight": "model-00010-of-00011.safetensors",
+    "model.layers.41.self_attn.a0": "model-00010-of-00011.safetensors",
+    "model.layers.41.self_attn.a1": "model-00010-of-00011.safetensors",
+    "model.layers.41.self_attn.a2": "model-00010-of-00011.safetensors",
+    "model.layers.41.self_attn.g1": "model-00010-of-00011.safetensors",
+    "model.layers.41.self_attn.g2": "model-00010-of-00011.safetensors",
+    "model.layers.41.self_attn.key.weight": "model-00010-of-00011.safetensors",
+    "model.layers.41.self_attn.output.weight": "model-00010-of-00011.safetensors",
+    "model.layers.41.self_attn.receptance.weight": "model-00010-of-00011.safetensors",
+    "model.layers.41.self_attn.value.weight": "model-00010-of-00011.safetensors",
+    "model.layers.41.self_attn.w0": "model-00010-of-00011.safetensors",
+    "model.layers.41.self_attn.w1": "model-00010-of-00011.safetensors",
+    "model.layers.41.self_attn.w2": "model-00010-of-00011.safetensors",
+    "model.layers.42.input_layernorm.weight": "model-00010-of-00011.safetensors",
+    "model.layers.42.mlp.down_proj.weight": "model-00010-of-00011.safetensors",
+    "model.layers.42.mlp.gate_proj.weight": "model-00010-of-00011.safetensors",
+    "model.layers.42.mlp.up_proj.weight": "model-00011-of-00011.safetensors",
+    "model.layers.42.post_attention_layernorm.weight": "model-00011-of-00011.safetensors",
+    "model.layers.42.self_attn.a0": "model-00011-of-00011.safetensors",
+    "model.layers.42.self_attn.a1": "model-00011-of-00011.safetensors",
+    "model.layers.42.self_attn.a2": "model-00011-of-00011.safetensors",
+    "model.layers.42.self_attn.g1": "model-00011-of-00011.safetensors",
+    "model.layers.42.self_attn.g2": "model-00011-of-00011.safetensors",
+    "model.layers.42.self_attn.key.weight": "model-00011-of-00011.safetensors",
+    "model.layers.42.self_attn.output.weight": "model-00011-of-00011.safetensors",
+    "model.layers.42.self_attn.receptance.weight": "model-00011-of-00011.safetensors",
+    "model.layers.42.self_attn.value.weight": "model-00011-of-00011.safetensors",
+    "model.layers.42.self_attn.w0": "model-00011-of-00011.safetensors",
+    "model.layers.42.self_attn.w1": "model-00011-of-00011.safetensors",
+    "model.layers.42.self_attn.w2": "model-00011-of-00011.safetensors",
+    "model.layers.43.input_layernorm.weight": "model-00011-of-00011.safetensors",
+    "model.layers.43.mlp.down_proj.weight": "model-00011-of-00011.safetensors",
+    "model.layers.43.mlp.gate_proj.weight": "model-00011-of-00011.safetensors",
+    "model.layers.43.mlp.up_proj.weight": "model-00011-of-00011.safetensors",
+    "model.layers.43.post_attention_layernorm.weight": "model-00011-of-00011.safetensors",
+    "model.layers.43.self_attn.k_proj.weight": "model-00011-of-00011.safetensors",
+    "model.layers.43.self_attn.o_proj.weight": "model-00011-of-00011.safetensors",
+    "model.layers.43.self_attn.q_proj.weight": "model-00011-of-00011.safetensors",
+    "model.layers.43.self_attn.v_proj.weight": "model-00011-of-00011.safetensors",
+    "model.norm.weight": "model-00011-of-00011.safetensors"
+  }
+}

modeling_rwkv07aqwen3.py ADDED Viewed

	@@ -0,0 +1,1045 @@

+# coding=utf-8
+# Copyright 2024 The Qwen team, Alibaba Group and the HuggingFace Inc. team. All rights reserved.
+#
+# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
+# and OPT implementations in this library. It has been modified from its
+# original forms to accommodate minor architectural differences compared
+# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+PyTorch RWKV07AQwen3 model.
+base code from SmerkyG @ recursal.ai, featherless.ai
+hxa07A implementation RWKV07A + NoPE Hybrid Attention
+"""
+import math
+import inspect
+from typing import List, Optional, Tuple, Union, Dict, Any
+import torch
+import torch.utils.checkpoint
+from torch import nn
+import torch.nn.functional as F
+from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
+from transformers.activations import ACT2FN
+from transformers.cache_utils import Cache, DynamicCache, CacheLayerMixin
+from transformers.generation import GenerationMixin
+from transformers.integrations import use_kernel_forward_from_hub
+from transformers.masking_utils import create_causal_mask, create_sliding_window_causal_mask
+from transformers.modeling_flash_attention_utils import FlashAttentionKwargs
+from transformers.modeling_layers import (
+    GenericForQuestionAnswering,
+    GenericForSequenceClassification,
+    GenericForTokenClassification,
+    GradientCheckpointingLayer,
+)
+from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
+from transformers.modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
+from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
+from transformers.processing_utils import Unpack
+from transformers.utils import TransformersKwargs, auto_docstring, can_return_tuple
+from transformers.utils.generic import check_model_inputs
+from .configuration_rwkv07aqwen3 import RWKV07AQwen3Config
+from transformers.models.qwen3.modeling_qwen3 import Qwen3DecoderLayer, Qwen3MLP, Qwen3RMSNorm, Qwen3Attention
+class RWKV07AState():
+    def __init__(self) -> None:
+        #super().__init__()
+        self._seen_tokens = 0  # Used in `generate` to keep tally of how many tokens the cache has seen
+        self.layer_kv_states: List[torch.Tensor] = []
+        self.layer_shift_states:  List[torch.Tensor] = []
+        self.cumulative_scores: List[torch.Tensor] = []
+        self.sin: List[torch.Tensor] = []
+        self.cos: List[torch.Tensor] = []
+    def __getitem__(self, layer_idx: int) -> Tuple[torch.Tensor, torch.Tensor]:
+        """
+        Support for backwards-compatible `past_key_value` indexing, e.g. `past_key_value[0][0].shape[2]` to get the
+        sequence length.
+        """
+        if layer_idx < len(self):
+            return (self.layer_kv_states[layer_idx], self.layer_shift_states[layer_idx])
+        else:
+            raise KeyError(f"Cache only has {len(self)} layers, attempted to access layer with index {layer_idx}")
+    def __iter__(self):
+        """
+        Support for backwards-compatible `past_key_value` iteration, e.g. `for x in past_key_value:` to iterate over
+        keys and values
+        """
+        for layer_idx in range(len(self)):
+            yield (self.layer_kv_states[layer_idx], self.layer_shift_states[layer_idx])
+    def __len__(self):
+        """
+        Support for backwards-compatible `past_key_value` length, e.g. `len(past_key_value)`. This value corresponds
+        to the number of layers in the model.
+        """
+        return len(self.layer_kv_states)
+    def get_usable_length(self, new_seq_length: int, layer_idx: Optional[int] = 0) -> int:
+        """Given the sequence length of the new inputs, returns the usable length of the cache."""
+        # Linear Attention variants do not have a maximum length
+        return new_seq_length
+    def reorder_cache(self, beam_idx: torch.LongTensor):
+        """Reorders the cache for beam search, given the selected beam indices."""
+        raise NotImplementedError('Cannot reorder Linear Attention state')
+    def get_seq_length(self, layer_idx: int = 0) -> int:
+        """Returns the sequence length of the cached states. A layer index can be optionally passed."""
+        return self._seen_tokens
+    def get_max_cache_shape(self) -> Optional[int]:
+        """Returns the maximum sequence length of the cache object. DynamicCache does not have a maximum length."""
+        return None
+    def get_max_length(self) -> Optional[int]:
+        """
+        Returns the maximum sequence length of the cached states. DynamicCache does not have a maximum length.
+        """
+        return None
+    def crop(self, max_length: int):
+        # can't implement this for linear attention variants
+        return
+    def get_mask_sizes(self, cache_position: torch.Tensor, layer_idx: int) -> tuple[int, int]:
+        """Return the length and offset of the cache, used to generate the mask"""
+        kv_offset = 0
+        query_length = cache_position.shape[0]
+        past_seen_tokens = self.get_seq_length()
+        kv_length = query_length + past_seen_tokens
+        return kv_length, kv_offset
+    @property
+    def is_compileable(self) -> bool:
+        """Return whether the cache is compileable"""
+        return True #all(layer.is_compileable for layer in self.layers)
+    @torch.no_grad
+    def update(
+        self,
+        kv_state: torch.Tensor,
+        shift_state: torch.Tensor,
+        layer_idx: int,
+        token_count: int = 0,
+        is_attention_layer: bool = True,
+        cache_kwargs: Optional[Dict[str, Any]] = None,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        # Update the number of seen tokens
+        if layer_idx == 0:
+            if is_attention_layer:
+                token_count = kv_state.size(-2)
+            self._seen_tokens += token_count
+        #print(f'self._seen_tokens = {self._seen_tokens} layer_idx = {layer_idx} is_attention_layer = {is_attention_layer} kv_state.size(-2) = {kv_state.size(-2)}')
+        # Update the cache
+        if kv_state is not None:
+            # There may be skipped layers, fill them with empty lists
+            if layer_idx >= len(self.layer_kv_states):
+                for _ in range(len(self.layer_kv_states), layer_idx):
+                    if is_attention_layer:
+                        self.layer_kv_states.append(torch.tensor([], dtype=kv_state.dtype, device=kv_state.device)) # acts as key_cache
+                        self.layer_shift_states.append(torch.tensor([], dtype=shift_state.dtype, device=shift_state.device)) # acts as value_cache
+                    else:
+                        self.layer_kv_states.append(torch.zeros_like(kv_state).requires_grad_(False))
+                        self.layer_shift_states.append(torch.zeros_like(shift_state).requires_grad_(False))
+                self.layer_kv_states.append(kv_state) # acts as key_cache
+                self.layer_shift_states.append(shift_state) # acts as value_cache
+            else:
+                if is_attention_layer:
+                    self.layer_kv_states[layer_idx] = torch.cat([self.layer_kv_states[layer_idx], kv_state], dim=-2) # acts as key_cache
+                    self.layer_shift_states[layer_idx] = torch.cat([self.layer_shift_states[layer_idx], shift_state], dim=-2) # acts as value_cache
+                else:
+                    self.layer_kv_states[layer_idx].copy_(kv_state)
+                    self.layer_shift_states[layer_idx].copy_(shift_state)
+        return self.layer_kv_states[layer_idx], self.layer_shift_states[layer_idx]
+try:
+    from fla.ops.rwkv7.chunk import chunk_rwkv7
+    from fla.ops.rwkv7.fused_recurrent import fused_recurrent_rwkv7
+except ImportError:
+    print("Required module is not installed. Please install it using the following commands:")
+    print("pip install --no-use-pep517 flash-linear-attention")
+    print("Additionally, ensure you have at least version 2.2.0 of Triton installed:")
+    print("pip install triton>=2.2.0")
+# def is_layer_attention(config, layer_id):
+#     return layer_id >= config.first_attention_layer and layer_id < config.first_post_attention_layer and  (layer_id > min(config.num_hidden_layers, config.last_striping_layer) or (min(config.num_hidden_layers-1, config.last_striping_layer) - layer_id) % config.attention_striping == 0)
+def is_layer_attention(config, layer_id):
+    return layer_id in config.transformer_layers
+def repeat_kv_rwkv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
+    """
+    Repeat KV heads along the head dimension (GQA).
+    Input:  (B, T, H_kv, D)
+    Output: (B, T, H_kv * n_rep, D)
+    """
+    B, T, H_kv, D = hidden_states.shape
+    if n_rep == 1:
+        return hidden_states
+    # Expand head dim
+    hidden_states = hidden_states[:, :, :, None, :]  # (B, T, H_kv, 1, D)
+    hidden_states = hidden_states.expand(B, T, H_kv, n_rep, D)  # (B, T, H_kv, n_rep, D)
+    return hidden_states.reshape(B, T, H_kv * n_rep, D).contiguous()
+def T5RMSNorm(hidden_states,weight,variance_epsilon:float=1e-6):
+    input_dtype = hidden_states.dtype
+    hidden_states = hidden_states.to(torch.float32)
+    variance = hidden_states.pow(2).mean(-1, keepdim=True)
+    hidden_states = hidden_states * torch.rsqrt(variance + variance_epsilon)
+    return (weight * hidden_states).to(input_dtype)
+def compute_qwen3_rope_cache(seq_len, rotary_dim, device, dtype, rope_theta):
+            half_dim = rotary_dim // 2
+            freq_seq = torch.arange(half_dim, dtype=dtype, device=device)
+            inv_freq = 1.0 / (rope_theta ** (freq_seq / half_dim))
+            positions = torch.arange(seq_len, dtype=dtype, device=device)
+            freqs = torch.einsum("i,j->ij", positions, inv_freq)
+            emb = torch.cat([freqs, freqs], dim=-1)
+            cos = emb.cos()
+            sin = emb.sin()
+            return cos.unsqueeze(0), sin.unsqueeze(0), inv_freq
+# def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
+#     """Applies Rotary Position Embedding to the query and key tensors.
+#     Args:
+#         q (`torch.Tensor`): The query tensor.
+#         k (`torch.Tensor`): The key tensor.
+#         cos (`torch.Tensor`): The cosine part of the rotary embedding.
+#         sin (`torch.Tensor`): The sine part of the rotary embedding.
+#         position_ids (`torch.Tensor`, *optional*):
+#             Deprecated and unused.
+#         unsqueeze_dim (`int`, *optional*, defaults to 1):
+#             The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
+#             sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
+#             that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
+#             k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
+#             cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
+#             the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
+#     Returns:
+#         `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
+#     """
+#     cos = cos.unsqueeze(unsqueeze_dim)
+#     sin = sin.unsqueeze(unsqueeze_dim)
+#     q_embed = (q * cos) + (rotate_half(q) * sin)
+#     k_embed = (k * cos) + (rotate_half(k) * sin)
+#     return q_embed, k_embed
+# Rotary position embedding module. `inv_freq` is produced by the ROPE_INIT_FUNCTIONS entry selected
+# from `config.rope_scaling`; `forward` turns position ids into (cos, sin) tables.
+class Qwen3RotaryEmbedding(nn.Module):
+    def __init__(self, config: RWKV07AQwen3Config, device=None):
+        super().__init__()
+        # BC: "rope_type" was originally "type"
+        if hasattr(config, "rope_scaling") and config.rope_scaling is not None:
+            self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type"))
+        else:
+            self.rope_type = "default"
+        self.max_seq_len_cached = config.max_position_embeddings
+        self.original_max_seq_len = config.max_position_embeddings
+        self.config = config
+        self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]
+        inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device)
+        # Non-persistent buffer: it follows the module across devices but is not written to the state dict.
+        self.register_buffer("inv_freq", inv_freq, persistent=False)
+        # Kept so the dynamic path below can restore the initial frequencies.
+        self.original_inv_freq = self.inv_freq
+    def _dynamic_frequency_update(self, position_ids, device):
+        """
+        dynamic RoPE layers should recompute `inv_freq` in the following situations:
+        1 - growing beyond the cached sequence length (allow scaling)
+        2 - the current sequence length is in the original scale (avoid losing precision with small sequences)
+        """
+        # Largest position id + 1; this is a tensor, so the comparisons below are tensor comparisons.
+        seq_len = torch.max(position_ids) + 1
+        if seq_len > self.max_seq_len_cached:  # growth
+            inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device, seq_len=seq_len)
+            self.register_buffer("inv_freq", inv_freq, persistent=False)  # TODO joao: may break with compilation
+            self.max_seq_len_cached = seq_len
+        if seq_len < self.original_max_seq_len and self.max_seq_len_cached > self.original_max_seq_len:  # reset
+            # This .to() is needed if the model has been moved to a device after being initialized (because
+            # the buffer is automatically moved, but not the original copy)
+            self.original_inv_freq = self.original_inv_freq.to(device)
+            self.register_buffer("inv_freq", self.original_inv_freq, persistent=False)
+            self.max_seq_len_cached = self.original_max_seq_len
+    @torch.no_grad()
+    def forward(self, x, position_ids):
+        # `x` is only read for its device and dtype; the returned cos/sin are cast to x.dtype.
+        if "dynamic" in self.rope_type:
+            self._dynamic_frequency_update(position_ids, device=x.device)
+        # Core RoPE block
+        # inv_freq (n,) -> (batch, n, 1); position_ids (batch, seq) -> (batch, 1, seq)
+        inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1)
+        position_ids_expanded = position_ids[:, None, :].float()
+        # Force float32 (see https://github.com/huggingface/transformers/pull/29285)
+        device_type = x.device.type
+        # "mps" (or a non-string device type) is replaced by "cpu" for the autocast context below.
+        device_type = device_type if isinstance(device_type, str) and device_type != "mps" else "cpu"
+        with torch.autocast(device_type=device_type, enabled=False):
+            # (batch, n, 1) @ (batch, 1, seq) -> (batch, n, seq), transposed to (batch, seq, n)
+            freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
+            # The angles are duplicated so the last dimension is 2 * n.
+            emb = torch.cat((freqs, freqs), dim=-1)
+            cos = emb.cos()
+            sin = emb.sin()
+        # Advanced RoPE types (e.g. yarn) apply a post-processing scaling factor, equivalent to scaling attention
+        cos = cos * self.attention_scaling
+        sin = sin * self.attention_scaling
+        return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
+def rms_norm(hidden_states, eps = 1e-6):
+    #print('ugyuugyu')
+    input_dtype = hidden_states.dtype
+    hidden_states = hidden_states.to(torch.float32)
+    variance = hidden_states.pow(2).mean(-1, keepdim=True)
+    hidden_states = hidden_states * torch.rsqrt(variance + eps)
+    return hidden_states.to(input_dtype)
+def generate_rotary_embedding(max_seqlen:int, dim:int, theta:float = 10000.0, scale:float = 1):
+    #inv_freq = 1.0 / (theta ** (torch.arange(0, dim, 2, dtype=torch.float).to(device) / dim))
+    angular_velocity = theta ** -(torch.arange(0, dim, 2, dtype=torch.float) / dim) / scale # frequencies from 1.0 ... 1/theta
+    angles = torch.outer(torch.arange(max_seqlen), angular_velocity)
+    # Different from paper, but it uses a different permutation in order to obtain the same calculation
+    emb = torch.cat((angles, angles), dim=-1)
+    return torch.stack([emb.cos(), emb.sin()], dim=0)
+    #return torch.polar(torch.ones_like(angles), angles)
+# Copied from transformers.models.llama.modeling_llama.rotate_half
+def rotate_half(x):
+    """Rotates half the hidden dims of the input."""
+    x1 = x[..., : x.shape[-1] // 2]
+    x2 = x[..., x.shape[-1] // 2 :]
+    return torch.cat((-x2, x1), dim=-1)
+def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
+    """Applies Rotary Position Embedding to the query and key tensors.
+    Args:
+        q (`torch.Tensor`): The query tensor.
+        k (`torch.Tensor`): The key tensor.
+        cos (`torch.Tensor`): The cosine part of the rotary embedding.
+        sin (`torch.Tensor`): The sine part of the rotary embedding.
+        position_ids (`torch.Tensor`, *optional*):
+            Deprecated and unused.
+        unsqueeze_dim (`int`, *optional*, defaults to 1):
+            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
+            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
+            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
+            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
+            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
+            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
+    Returns:
+        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
+    """
+    cos = cos.unsqueeze(unsqueeze_dim)
+    sin = sin.unsqueeze(unsqueeze_dim)
+    q_embed = (q * cos) + (rotate_half(q) * sin)
+    k_embed = (k * cos) + (rotate_half(k) * sin)
+    return q_embed, k_embed
+def apply_rotary_pos_emb_single(x, cos, sin, unsqueeze_dim=1):
+    return (x * cos.unsqueeze(unsqueeze_dim)) + (rotate_half(x) * sin.unsqueeze(unsqueeze_dim))
+from typing import Callable, Optional, Tuple, Union
+from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
+from transformers.processing_utils import Unpack
+from transformers.modeling_flash_attention_utils import FlashAttentionKwargs
+def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
+    """
+    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
+    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
+    """
+    batch, num_key_value_heads, slen, head_dim = hidden_states.shape
+    if n_rep == 1:
+        return hidden_states
+    hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
+    return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
+def eager_attention_forward(
+    module: nn.Module,
+    query: torch.Tensor,
+    key: torch.Tensor,
+    value: torch.Tensor,
+    attention_mask: Optional[torch.Tensor],
+    scaling: float,
+    dropout: float = 0.0,
+    **kwargs,
+):
+    key_states = repeat_kv(key, module.num_key_value_groups)
+    value_states = repeat_kv(value, module.num_key_value_groups)
+    attn_weights = torch.matmul(query, key_states.transpose(2, 3)) * scaling
+    if attention_mask is not None:
+        causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
+        attn_weights = attn_weights + causal_mask
+    attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
+    attn_weights = attn_weights.masked_fill(attn_weights.isnan(), 0) # IMPORTANT FOR BATCHED INFERENCE IN LM EVAL!
+    attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)
+    attn_output = torch.matmul(attn_weights, value_states)
+    attn_output = attn_output.transpose(1, 2).contiguous()
+    return attn_output, attn_weights
+from torch.nn.attention.flex_attention import create_block_mask, flex_attention, create_mask
+from functools import lru_cache
+block_mask = None
+def scaled_dot_product_attention(query, key, value, attn_mask=None, dropout_p=0.0,
+        is_causal=False, scale=None, enable_gqa=False) -> torch.Tensor:
+    L, S = query.size(-2), key.size(-2)
+    scale_factor = 1 / math.sqrt(query.size(-1)) if scale is None else scale
+    attn_bias = torch.zeros(L, S, dtype=query.dtype, device=query.device)
+    if is_causal:
+        assert attn_mask is None
+        temp_mask = torch.ones(L, S, dtype=torch.bool).tril(diagonal=0)
+        attn_bias.masked_fill_(temp_mask.logical_not(), float("-inf"))
+        attn_bias.to(query.dtype)
+    if attn_mask is not None:
+        if attn_mask.dtype == torch.bool:
+            attn_bias.masked_fill_(attn_mask.logical_not(), float("-inf"))
+        else:
+            attn_bias = attn_mask + attn_bias
+    if enable_gqa:
+        key = key.repeat_interleave(query.size(-3)//key.size(-3), -3)
+        value = value.repeat_interleave(query.size(-3)//value.size(-3), -3)
+    attn_weight = query.float() @ key.float().transpose(-2, -1) * scale_factor
+    attn_weight += attn_bias.float()
+    #attn_weight = stable_softmax(attn_weight, dim=-1)
+    attn_weight = torch.softmax(attn_weight, dim=-1)
+    attn_weight = attn_weight.masked_fill(attn_weight.isnan(), 0) # IMPORTANT FOR BATCHED INFERENCE IN LM EVAL!
+    #attn_weight = torch.dropout(attn_weight, dropout_p, train=True)
+    return attn_weight @ value.float()
+class Qwen3AttentionNoPE_Causal(Qwen3Attention):
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        frozen_residual: torch.Tensor,
+        v_first: Optional[torch.Tensor] = None,
+        k_first: Optional[torch.Tensor] = None,
+        position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        past_key_values: Optional[Cache] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        **kwargs: Unpack[FlashAttentionKwargs],
+    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor]]:
+        x = hidden_states
+        B, L, D = x.size()
+        input_shape = x.shape[:-1]
+        hidden_shape = (*input_shape, -1, self.head_dim)
+        if self.config.enable_qk_norm:
+            q = self.q_norm(self.q_proj(x).view(hidden_shape)).transpose(1, 2)
+            k = self.k_norm(self.k_proj(x).view(hidden_shape)).transpose(1, 2)
+        else:
+            q = self.q_proj(x).view(hidden_shape).transpose(1, 2)
+            k = self.k_proj(x).view(hidden_shape).transpose(1, 2)
+        v = self.v_proj(x).view(hidden_shape).transpose(1, 2)
+        if past_key_values is not None:
+            # sin and cos are specific to RoPE models; cache_position needed for the static cache
+            cache_kwargs = {"cache_position": cache_position}
+            k, v = past_key_values.update(k, v, self.layer_idx, cache_kwargs)
+        # repeat k/v heads if n_kv_heads < n_heads
+        k = repeat_kv(k, self.num_key_value_groups)
+        v = repeat_kv(v, self.num_key_value_groups)
+        S = k.size(-2)
+        y = nn.functional.scaled_dot_product_attention(q, k, v, dropout_p=0.0, attn_mask=attention_mask, is_causal=attention_mask is None and L==S)
+        y = y.transpose(1,2)
+        y = y.reshape(*input_shape, -1)#.contiguous()
+        y = self.o_proj(y)
+        attn_weights = None
+        return y, v_first, k_first
+class RWKV07AAttention(nn.Module):
+    """
+    This is a simplified RWKV block that prioritizes inference efficiency.
+    Decay and Gate are increased to minimize performance degradation.
+    from RWKV v7
+    1. delete Tokenshift
+    2. delete GroupNorm
+    3. delete r_k
+    4. delete v_first
+    5. changed iclr 1-w+a
+    6. big decaysize
+    """
+    def __init__(self, config, layer_idx: Optional[int] = None):
+        super().__init__()
+        self.config = config
+        self.layer_idx = layer_idx
+        # Short aliases used for the parameter shapes below: C = hidden size, H = query heads, N = head dim.
+        C = self.hidden_size = config.hidden_size
+        H = self.num_heads = config.num_attention_heads
+        H_kv = config.num_key_value_heads
+        # Falls back to hidden_size // num_heads when the config has no explicit head_dim.
+        N = self.head_dim = getattr(config, 'head_dim', self.hidden_size // self.num_heads)
+        self.num_key_value_heads = config.num_key_value_heads
+        self.num_key_value_groups = self.num_heads // self.num_key_value_heads
+        self.attention_dropout = config.attention_dropout
+        if self.hidden_size % self.num_heads != 0:
+            raise ValueError(
+                f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}"
+                f" and `num_heads`: {self.num_heads})."
+            )
+        # receptance plays the role of the query projection; key/value use the (smaller) KV head count.
+        self.receptance = nn.Linear(
+            config.hidden_size, config.num_attention_heads * self.head_dim, bias=config.attention_bias
+        )
+        self.key = nn.Linear(
+            config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias
+        )
+        self.value = nn.Linear(
+            config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias
+        )
+        self.output = nn.Linear(
+            config.num_attention_heads * self.head_dim, config.hidden_size, bias=config.attention_out_bias
+        )
+        lora_rank_decay = config.lora_rank_decay
+        lora_rank_iclr = config.lora_rank_iclr
+        lora_rank_gate = config.lora_rank_gate
+        # Low-rank parameter pairs (C -> rank -> H*N). They are allocated with torch.empty, i.e.
+        # uninitialised here; presumably they are filled from the checkpoint — verify.
+        # w*: decay, a*: in-context learning rate ("iclr"), g*: output gate.
+        self.w0 = nn.Parameter(torch.empty(1,1,H*N))
+        self.w1 = nn.Parameter(torch.empty(C, lora_rank_decay))
+        self.w2 = nn.Parameter(torch.empty(lora_rank_decay, H*N))
+        self.a0 = nn.Parameter(torch.empty(1,1,H*N))
+        self.a1 = nn.Parameter(torch.empty(C, lora_rank_iclr))
+        self.a2 = nn.Parameter(torch.empty(lora_rank_iclr, H*N))
+        self.g1 = nn.Parameter(torch.empty(C, lora_rank_gate))
+        self.g2 = nn.Parameter(torch.empty(lora_rank_gate, H*N))
+        if self.config.enable_qk_norm:
+            self.r_norm = Qwen3RMSNorm(self.head_dim, eps=config.rms_norm_eps)  # unlike olmo, only on the head dim!
+            self.k_norm = Qwen3RMSNorm(self.head_dim, eps=config.rms_norm_eps)  # thus post q_norm does not need reshape
+    # Runs the RWKV-7 recurrence over `hidden_states` (B, T, C) and returns
+    # (output, v_first, k_first); v_first and k_first are passed through unchanged.
+    # frozen_residual, position_ids, output_attentions and cache_position are accepted but not read.
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        frozen_residual: torch.Tensor,
+        v_first: Optional[torch.Tensor] = None,
+        k_first: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[RWKV07AState] = None,
+        output_attentions: bool = False,
+        use_cache: bool = False,
+        cache_position: Optional[torch.LongTensor] = None,
+        position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
+        **kwargs,
+    ):
+        if attention_mask is not None:
+            assert len(attention_mask.shape) in (2, 4)
+        # Detached copy of the last position's hidden state; stored in the cache as this layer's shift state.
+        output_shift_state = hidden_states[:, -1:].detach().clone()
+        x = hidden_states
+        B, T, C = hidden_states.shape
+        H = self.num_heads
+        N = self.head_dim
+        q_len = T
+        # Resume from the cached state when this layer already has one, otherwise start from zeros.
+        # input_shift_state is not read again in this method.
+        if use_cache and past_key_values is not None and len(past_key_values) > self.layer_idx:
+            #print(f'use past state layer {self.layer_idx}')
+            input_vk_state, input_shift_state = past_key_values[self.layer_idx]
+        else:
+            # NOTE(review): the zero initial state is hard-coded to bfloat16 regardless of the input dtype — confirm.
+            input_vk_state, input_shift_state = torch.zeros(B,H,N,N, dtype=torch.bfloat16,device=x.device), torch.zeros_like(x[:, -1:])
+        # No token shift: every projection reads the same input.
+        xr = xw = xk = xv = xa = xg = x
+        r = self.receptance(xr).view(B,T,-1,N)
+        # Decay pre-activation from the low-rank (w1, w2) path plus bias w0; -softplus(-z) - 0.5 keeps it below -0.5.
+        w = -F.softplus(-(self.w0 + torch.tanh(xw @ self.w1) @ self.w2)) -0.5
+        k = self.key(xk).view(B,T,-1,N)
+        if self.config.enable_qk_norm:
+            r = self.r_norm(r)
+            k = self.k_norm(k)
+        v = self.value(xv).view(B,T,-1,N)
+        # In-context learning rate in (0, 1).
+        a = torch.sigmoid(self.a0 + (xa @ self.a1) @ self.a2)
+        # Output gate: sigmoid on the rank-reduced projection, then expanded by g2 (no activation after g2).
+        g = torch.sigmoid(xg @ self.g1) @ self.g2
+        if position_embeddings is not None:
+            cos, sin = position_embeddings
+            # r and k are (B, T, heads, N) here, hence unsqueeze_dim=2.
+            r, k = apply_rotary_pos_emb(r, k, cos, sin, unsqueeze_dim=2)
+        #for left-padding
+        # The values of masked positions are multiplied by the mask entry.
+        # NOTE(review): this assumes the mask holds 1 for real tokens and 0 for padding; an additive
+        # 4-D mask (0 / -inf) would not behave as intended — confirm what callers pass.
+        # The nested check below repeats the outer condition.
+        if attention_mask is not None:
+            if attention_mask is not None:
+                if attention_mask.ndim == 2:
+                    # [B, S]
+                    mask = attention_mask[:, -T:]             # [B, T]
+                    v = v * mask[:, :, None, None]            # expand to [B, T, 1, 1] and multiply
+                elif attention_mask.ndim == 4:
+                    # [B, 1, L, S]
+                    mask = attention_mask[:, 0, -1, -T:]      # [B, T]
+                    v = v * mask[:, :, None, None]            # same as above
+        # repeat k/v heads if n_kv_heads < n_heads
+        k = repeat_kv_rwkv(k, self.num_key_value_groups).view(B, T, -1)
+        v = repeat_kv_rwkv(v, self.num_key_value_groups).view(B, T, -1)
+        # dropout_rate is computed but not used further in this method.
+        dropout_rate = 0.0 if not self.training else self.attention_dropout
+        # kk: key L2-normalised per head (computed in float32, cast back to k's dtype).
+        kk = (k).view(B,T,H,-1).float()
+        kk = (kk / (torch.norm(kk, dim=-1, keepdim=True) + 1e-12)).view(B,T,-1).to(k.dtype)
+        # Key scaling uses w before it is exponentiated below.
+        k = k * (1.0 - w + a)
+        aa = -kk
+        bb = kk * a
+        # w becomes -exp(w); presumably the decay form fused_recurrent_rwkv7 expects — confirm against fla.
+        w = -w.exp()
+        r_,w_,k_,v_,aa_,bb_ = [i.view(B,T,H,N) for i in [r,w,k,v,aa,bb]]
+        # Returns the per-token output and the final recurrent state.
+        x, output_vk_state = fused_recurrent_rwkv7(r_, w_, k_, v_, aa_, bb_, scale=1.0, initial_state=input_vk_state, output_final_state=True, head_first=False)
+        # Scale by 1/sqrt(N) after the kernel (the kernel itself is called with scale=1.0), then gate.
+        x = x.view(B,T,-1) * (float(N) ** -0.5)
+        x = x * g
+        x = self.output(x)
+        # The cache is updated whenever one is supplied, independently of use_cache.
+        if past_key_values is not None:
+            past_key_values.update(output_vk_state, output_shift_state, self.layer_idx, q_len, is_layer_attention(self.config, self.layer_idx))
+        return x, v_first, k_first
+class RWKV07AQwen3DecoderLayer(nn.Module):
+    def __init__(self, config: RWKV07AQwen3Config, layer_idx: int):
+        nn.Module.__init__(self)
+        self.hidden_size = config.hidden_size
+        self.layer_idx = layer_idx
+        if is_layer_attention(config, layer_idx):
+            print(f'layer {layer_idx} : attention')
+            att_fn = Qwen3AttentionNoPE_Causal #Qwen3KeyQuant #Qwen3SWAPrefill #Qwen3DropoutSWASink #Qwen3AttentionNoPE #Qwen3MOBA #Qwen3AttentionVerticalSparse # Qwen3DoubleAttention # Qwen3SymPow #Qwen3Chunk #Qwen3Power #Qwen3MOBA #Qwen3Attention # Qwen3NewAttention # Qwen3AttentionAdapted
+        else:
+            print(f'layer {layer_idx} : rwkv')
+            att_fn = RWKV07AAttention
+        self.self_attn = att_fn(config, layer_idx)
+        self.mlp = Qwen3MLP(config)
+        self.input_layernorm = Qwen3RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = Qwen3RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.attention_type = config.layer_types[layer_idx]
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        frozen_residual: torch.Tensor,
+        v_first: Optional[torch.Tensor],
+        k_first: Optional[torch.Tensor],
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        output_attentions: Optional[bool] = False,
+        use_cache: Optional[bool] = False,
+        cache_position: Optional[torch.LongTensor] = None,
+        position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
+        **kwargs,
+    ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
+        residual = hidden_states
+        hidden_states = self.input_layernorm(hidden_states)
+        # Self Attention
+        hidden_states, v_first, k_first = self.self_attn(
+            hidden_states=hidden_states,
+            frozen_residual=frozen_residual,
+            v_first=v_first,
+            k_first=k_first,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            output_attentions=output_attentions,
+            use_cache=use_cache,
+            cache_position=cache_position,
+            position_embeddings=position_embeddings,
+            #is_causal=True,
+        )
+        hidden_states = residual + hidden_states
+        # Fully Connected
+        residual = hidden_states
+        hidden_states = self.post_attention_layernorm(hidden_states)
+        hidden_states = self.mlp(hidden_states)
+        hidden_states = residual + hidden_states
+        outputs = (hidden_states, v_first,k_first,)
+        if output_attentions:
+            outputs += (self_attn_weights,)
+        return outputs
+@auto_docstring
+class RWKV07AQwen3PreTrainedModel(PreTrainedModel):
+    config: RWKV07AQwen3Config
+    config_class = RWKV07AQwen3Config
+    base_model_prefix = "model"
+    supports_gradient_checkpointing = True
+    _no_split_modules = ["RWKV07AQwen3DecoderLayer"]
+    _skip_keys_device_placement = "past_key_values"
+    _supports_flash_attn_2 = True
+    _supports_sdpa = True
+    _supports_flex_attn = True
+    _supports_cache_class = True
+    _supports_quantized_cache = True
+    _supports_static_cache = True
+    # def _init_weights(self, module):
+    #     std = self.config.initializer_range
+    #     if isinstance(module, nn.Linear):
+    #         module.weight.data.normal_(mean=0.0, std=std)
+    #         if module.bias is not None:
+    #             module.bias.data.zero_()
+    #     elif isinstance(module, nn.Embedding):
+    #         module.weight.data.normal_(mean=0.0, std=std)
+    #         if module.padding_idx is not None:
+    #             module.weight.data[module.padding_idx].zero_()
+@auto_docstring
+class RWKV07AQwen3Model(RWKV07AQwen3PreTrainedModel):
+    """
+    Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`Qwen3DecoderLayer`]
+    Args:
+        config: RWKV07AQwen3Config
+    """
+    def __init__(self, config: RWKV07AQwen3Config):
+        super().__init__(config)
+        self.padding_idx = config.pad_token_id
+        self.vocab_size = config.vocab_size
+        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
+        self.layers = nn.ModuleList(
+            [RWKV07AQwen3DecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
+        )
+        self.norm = Qwen3RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.rotary_emb = Qwen3RotaryEmbedding(config=config)
+        self.gradient_checkpointing = False
+        self.has_sliding_layers = "sliding_attention" in self.config.layer_types
+        # Initialize weights and apply final processing
+        self.post_init()
+    #@check_model_inputs
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> Union[Tuple, BaseModelOutputWithPast]:
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        use_cache = use_cache if use_cache is not None else self.config.use_cache
+        if (input_ids is None) ^ (inputs_embeds is not None):
+            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
+        if self.gradient_checkpointing and self.training and use_cache:
+            logger.warning_once(
+                "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
+            )
+            use_cache = False
+        if inputs_embeds is None:
+            inputs_embeds = self.embed_tokens(input_ids)
+        if use_cache and not isinstance(past_key_values, RWKV07AState):
+            past_key_values = RWKV07AState()
+        if cache_position is None:
+            past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
+            cache_position = torch.arange(
+                past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
+            )
+        if position_ids is None:
+            position_ids = cache_position.unsqueeze(0)
+        # It may already have been prepared by e.g. `generate`
+        if not isinstance(causal_mask_mapping := attention_mask, dict):
+            # Prepare mask arguments
+            mask_kwargs = {
+                "config": self.config,
+                "input_embeds": inputs_embeds,
+                "attention_mask": attention_mask,
+                "cache_position": cache_position,
+                "past_key_values": past_key_values,
+                "position_ids": position_ids,
+            }
+            # Create the masks
+            causal_mask_mapping = {
+                "full_attention": create_causal_mask(**mask_kwargs),
+            }
+            # The sliding window alternating layers are not always activated depending on the config
+            if self.has_sliding_layers:
+                causal_mask_mapping["sliding_attention"] = create_sliding_window_causal_mask(**mask_kwargs)
+        hidden_states = inputs_embeds
+        # create position embeddings to be shared across the decoder layers
+        if self.config.use_rope:
+            position_embeddings = self.rotary_emb(hidden_states, position_ids)
+        else:
+            position_embeddings = None
+        # decoder layers
+        all_hidden_states = () if output_hidden_states else None
+        all_self_attns = () if output_attentions else None
+        next_decoder_cache = None
+        v_first = None
+        k_first = None
+        frozen_residual = None
+        for decoder_layer in self.layers:
+            if not is_layer_attention(self.config, decoder_layer.layer_idx):
+                frozen_residual = hidden_states#rms_norm(hidden_states)
+            if output_hidden_states:
+                all_hidden_states += (hidden_states,)
+            attention_mask = causal_mask_mapping[decoder_layer.attention_type]
+            if attention_mask is not None and attention_mask.ndim == 1:
+                attention_mask = None
+            #attention_mask = None
+            layer_outputs = decoder_layer(
+                hidden_states,
+                frozen_residual=frozen_residual,
+                attention_mask=attention_mask,
+                position_ids=position_ids,
+                past_key_values=past_key_values,
+                output_attentions=output_attentions,
+                use_cache=use_cache,
+                cache_position=cache_position,
+                position_embeddings=position_embeddings,
+                v_first=v_first,
+                k_first=k_first
+            )
+            hidden_states = layer_outputs[0]
+            v_first = layer_outputs[1]
+            k_first = layer_outputs[2]
+            if output_attentions:
+                all_self_attns += (layer_outputs[2],)
+        hidden_states = self.norm(hidden_states)
+        # add hidden states from the last decoder layer
+        if output_hidden_states:
+            all_hidden_states += (hidden_states,)
+        #if return_legacy_cache:
+        #    next_cache = next_cache.to_legacy_cache()
+        return BaseModelOutputWithPast(
+            last_hidden_state=hidden_states,
+            past_key_values=past_key_values if use_cache else None,
+            hidden_states=all_hidden_states,
+            attentions=all_self_attns,
+        )
+# Causal language model: the RWKV07AQwen3Model backbone followed by a bias-free linear
+# projection from hidden_size to vocab_size.
+class RWKV07AQwen3ForCausalLM(RWKV07AQwen3PreTrainedModel, GenerationMixin):
+    _tied_weights_keys = ["lm_head.weight"]
+    def __init__(self, config):
+        super().__init__(config)
+        self.model = RWKV07AQwen3Model(config)
+        self.vocab_size = config.vocab_size
+        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+        # Initialize weights and apply final processing
+        self.post_init()
+    @can_return_tuple
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[List[torch.FloatTensor]] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        logits_to_keep: Union[int, torch.Tensor] = 0,
+        **loss_kwargs,
+    ) -> Union[Tuple, CausalLMOutputWithPast]:
+        r"""
+        Args:
+            labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+                Labels for computing the language modeling loss. Indices should either be in `[0, ...,
+                config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+                (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
+            logits_to_keep (`int` or `torch.Tensor`, *optional*):
+                If an `int`, calculate logits for the last `logits_to_keep` tokens. If `0`, calculate logits for all
+                `input_ids` (special case). Only last token logits are needed for generation, and calculating them only for that
+                token can save memory, which becomes pretty significant for long sequences or large vocabulary size.
+                If a `torch.Tensor`, it is used directly to index the sequence dimension of the hidden states.
+        Returns:
+        Example:
+        ```python
+        >>> from transformers import AutoTokenizer, RWKV07AQwen3ForCausalLM
+        >>> model = RWKV07AQwen3ForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS)
+        >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER)
+        >>> prompt = "Hey, are you conscious? Can you talk to me?"
+        >>> inputs = tokenizer(prompt, return_tensors="pt")
+        >>> # Generate
+        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
+        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
+        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
+        ```"""
+        # --- Disabled code, kept for reference only -------------------------------------
+        # A two-pass prefill wrapper plus the header of a `forward_inner` method. Neither is
+        # live code, and the block refers to a `num_logits_to_keep` argument that this
+        # method does not have. The active body of `forward` resumes after this block.
+        # # run the prefill only up to the last token, then run one more for the actual result
+        # # we do this so that called code doesn't have to handle the dichotomy specially and can just check for L==1
+        # for i in range(2):
+        #     all_but_one = max(1, input_ids.size(-1)-1)
+        #     iid = input_ids[..., i*all_but_one:(i+1)*all_but_one]
+        #     if iid.size(-1) == 0:
+        #         continue
+        #     pids = position_ids
+        #     if pids is not None:
+        #         pids = position_ids[..., i*all_but_one:(i+1)*all_but_one]
+        #     cp = cache_position
+        #     if cp is not None:
+        #         cp = cache_position[..., i*all_but_one:(i+1)*all_but_one]
+        #     rv = self.forward_inner(iid, attention_mask=attention_mask, position_ids=pids, past_key_values=past_key_values, inputs_embeds=inputs_embeds, labels=labels, use_cache=use_cache, output_attentions=output_attentions, output_hidden_states=output_hidden_states, cache_position=cp, num_logits_to_keep=num_logits_to_keep, **loss_kwargs)
+        #     past_key_values = rv.past_key_values
+    #     return rv
+    # def forward_inner(
+    #     self,
+    #     input_ids: torch.LongTensor = None,
+    #     attention_mask: Optional[torch.Tensor] = None,
+    #     position_ids: Optional[torch.LongTensor] = None,
+    #     past_key_values: Optional[List[torch.FloatTensor]] = None,
+    #     inputs_embeds: Optional[torch.FloatTensor] = None,
+    #     labels: Optional[torch.LongTensor] = None,
+    #     use_cache: Optional[bool] = None,
+    #     output_attentions: Optional[bool] = None,
+    #     output_hidden_states: Optional[bool] = None,
+    #     cache_position: Optional[torch.LongTensor] = None,
+    #     num_logits_to_keep: int = 0,
+    #     **loss_kwargs,
+    # ) -> Union[Tuple, CausalLMOutputWithPast]:
+        # --- End of disabled code ---------------------------------------------------------
+        # Fall back to the config for any output switch the caller left unset.
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        # Run the backbone; `loss_kwargs` are not forwarded to it, only to the loss below.
+        outputs = self.model(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            cache_position=cache_position,
+        )
+        hidden_states = outputs.last_hidden_state
+        # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
+        # An int 0 gives slice(0, None), i.e. every position is kept.
+        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
+        logits = self.lm_head(hidden_states[:, slice_indices, :])
+        loss = None
+        if labels is not None:
+            loss = self.loss_function(logits=logits, labels=labels, vocab_size=self.vocab_size, **loss_kwargs)
+        return CausalLMOutputWithPast(
+            loss=loss,
+            logits=logits,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+# Placeholder: this definition adds no head or forward logic of its own.
+# NOTE(review): presumably declared only so the class name exists — confirm before using it.
+@auto_docstring
+class RWKV07AQwen3ForSequenceClassification(RWKV07AQwen3PreTrainedModel):
+    pass
+# Placeholder: this definition adds no head or forward logic of its own.
+# NOTE(review): presumably declared only so the class name exists — confirm before using it.
+@auto_docstring
+class RWKV07AQwen3ForTokenClassification(RWKV07AQwen3PreTrainedModel):
+    pass
+# Placeholder: this definition only overrides `base_model_prefix`; it adds no head or
+# forward logic of its own.
+@auto_docstring
+class RWKV07AQwen3ForQuestionAnswering(RWKV07AQwen3PreTrainedModel):
+    base_model_prefix = "transformer"  # For BC, where `transformer` was used instead of `model`

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,46 @@

+{
+  "additional_special_tokens": [
+    "<|endoftext|>",
+    "<|fim_prefix|>",
+    "<|fim_middle|>",
+    "<|fim_suffix|>",
+    "<|endofprompt|>",
+    "<|_unuse_missing_100256|>",
+    "<|_unuse_missing_100261|>",
+    "<|_unuse_missing_100262|>",
+    "<|_unuse_missing_100263|>",
+    "<|_unuse_missing_100264|>",
+    "<|_unuse_missing_100265|>",
+    "<|_unuse_missing_100266|>",
+    "<|_unuse_missing_100267|>",
+    "<|_unuse_missing_100268|>",
+    "<|_unuse_missing_100269|>",
+    "<|_unuse_missing_100270|>",
+    "<|_unuse_missing_100271|>",
+    "<|_unuse_missing_100272|>",
+    "<|_unuse_missing_100273|>",
+    "<|_unuse_missing_100274|>",
+    "<|_unuse_missing_100275|>"
+  ],
+  "bos_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

tokenization_rwkv07aqwen3.py ADDED Viewed

	@@ -0,0 +1,4 @@

+from transformers.models.qwen3.tokenization_qwen3 import Qwen3Tokenizer
+class RWKV6Qwen3Tokenizer(Qwen3Tokenizer):
+    pass

tokenization_rwkv07aqwen3_fast.py ADDED Viewed

	@@ -0,0 +1,4 @@

+from transformers.models.qwen2.tokenization_qwen3_fast import Qwen3TokenizerFast
+class RWKV6Qwen3TokenizerFast(Qwen3TokenizerFast):
+    pass

tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,204 @@

+{
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "100256": {
+      "content": "<|_unuse_missing_100256|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100257": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100258": {
+      "content": "<|fim_prefix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100259": {
+      "content": "<|fim_middle|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100260": {
+      "content": "<|fim_suffix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100261": {
+      "content": "<|_unuse_missing_100261|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100262": {
+      "content": "<|_unuse_missing_100262|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100263": {
+      "content": "<|_unuse_missing_100263|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100264": {
+      "content": "<|_unuse_missing_100264|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100265": {
+      "content": "<|_unuse_missing_100265|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100266": {
+      "content": "<|_unuse_missing_100266|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100267": {
+      "content": "<|_unuse_missing_100267|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100268": {
+      "content": "<|_unuse_missing_100268|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100269": {
+      "content": "<|_unuse_missing_100269|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100270": {
+      "content": "<|_unuse_missing_100270|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100271": {
+      "content": "<|_unuse_missing_100271|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100272": {
+      "content": "<|_unuse_missing_100272|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100273": {
+      "content": "<|_unuse_missing_100273|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100274": {
+      "content": "<|_unuse_missing_100274|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100275": {
+      "content": "<|_unuse_missing_100275|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100276": {
+      "content": "<|endofprompt|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "additional_special_tokens": [
+    "<|endoftext|>",
+    "<|fim_prefix|>",
+    "<|fim_middle|>",
+    "<|fim_suffix|>",
+    "<|endofprompt|>",
+    "<|_unuse_missing_100256|>",
+    "<|_unuse_missing_100261|>",
+    "<|_unuse_missing_100262|>",
+    "<|_unuse_missing_100263|>",
+    "<|_unuse_missing_100264|>",
+    "<|_unuse_missing_100265|>",
+    "<|_unuse_missing_100266|>",
+    "<|_unuse_missing_100267|>",
+    "<|_unuse_missing_100268|>",
+    "<|_unuse_missing_100269|>",
+    "<|_unuse_missing_100270|>",
+    "<|_unuse_missing_100271|>",
+    "<|_unuse_missing_100272|>",
+    "<|_unuse_missing_100273|>",
+    "<|_unuse_missing_100274|>",
+    "<|_unuse_missing_100275|>"
+  ],
+  "bos_token": "<|endoftext|>",
+  "chat_template": "{% if messages[0]['role'] == 'system' %}{% set merged_content = messages[0]['content'] + ' ' + messages[1]['content'] %}{% set merged_messages = [{'role': messages[1]['role'], 'content': merged_content}] + messages[2:] %}{% else %}{% set merged_messages = messages %}{% endif %}{% for message in merged_messages %}{{('human' if message['role'] == 'user' else message['role']) + ': ' + (message['content'].split('<reasoning>')|first + message['content'].split('</reasoning>')|last if message['role'] == 'assistant' and '</reasoning>' in message['content'] else message['content'])}}{% if (loop.last and add_generation_prompt and merged_messages[-1]['role'] != 'assistant') or not loop.last %}{{ ' <sep> ' }}{% endif %}{% endfor %}{% if add_generation_prompt and merged_messages[-1]['role'] != 'assistant' %}{{ 'assistant:' }}{% endif %}",
+  "clean_up_tokenization_spaces": true,
+  "eos_token": "<|endoftext|>",
+  "extra_special_tokens": {},
+  "model_max_length": 32768,
+  "tokenizer_class": "GPT2Tokenizer",
+  "unk_token": "<|endoftext|>"
+}