YongganFu committed
Commit bf1039a · verified · 1 parent: 0401e1c

Upload JambaForCausalLM

config.json CHANGED
@@ -130,7 +130,7 @@
   "mamba_attnaug_config": null,
   "mamba_conv_bias": true,
   "mamba_d_conv": 4,
-  "mamba_d_state": 16,
+  "mamba_d_state": 128,
   "mamba_dt_rank": 192,
   "mamba_expand": 2,
   "mamba_inner_layernorms": true,
@@ -138,7 +138,7 @@
   "mamba_multihead_config": null,
   "mamba_proj_bias": false,
   "mamba_reuse_every_i_layer": -1,
-  "max_position_embeddings": 2048,
+  "max_position_embeddings": 22528,
   "memory_tokens_interspersed_every": 0,
   "mlp_hidden_act": "silu",
   "mod_topk": 2,
@@ -168,7 +168,7 @@
   "num_key_value_heads": 6,
   "num_mamba": 1,
   "num_memory_tokens": 256,
-  "orig_max_position_embeddings": 2048,
+  "orig_max_position_embeddings": 4096,
   "other_args": null,
   "output_router_logits": false,
   "pad_token_id": 0,
@@ -180,11 +180,11 @@
   "rms_norm_eps": 1e-06,
   "rope": true,
   "rope_theta": 10000.0,
-  "rope_type": null,
+  "rope_type": "ntk",
   "router_aux_loss_coef": 0.001,
   "save_input_output": false,
   "self_attn_type": null,
-  "seq_length": 2048,
+  "seq_length": 1024,
   "sequential_jamba": false,
   "share_kv": false,
   "shared_module_attn": "",
@@ -195,7 +195,7 @@
   "swa_full_head": false,
   "tie_word_embeddings": true,
   "torch_dtype": "bfloat16",
-  "transformers_version": "4.48.2",
+  "transformers_version": "4.45.0",
   "use_cache": false,
   "use_mamba2": false,
   "use_mamba_kernels": true,
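Note on the context-extension settings above: switching rope_type from null to "ntk" while raising max_position_embeddings to 22528 (with orig_max_position_embeddings at 4096) points to NTK-aware RoPE scaling. A minimal sketch of the common base-stretching variant follows; the head_dim of 128 is a hypothetical value for illustration and is not in the hunks above, and the authoritative formula lives in modeling_jamba.py:

```python
import torch

def ntk_scaled_theta(base_theta: float, orig_max_pos: int,
                     new_max_pos: int, head_dim: int) -> float:
    # "NTK-aware" RoPE: stretch the base so the lowest frequency spans the
    # extended window, instead of interpolating position ids.
    scale = new_max_pos / orig_max_pos  # 22528 / 4096 = 5.5
    return base_theta * scale ** (head_dim / (head_dim - 2))

def rope_inv_freq(theta: float, head_dim: int) -> torch.Tensor:
    # Standard RoPE inverse frequencies for a given base theta.
    return 1.0 / (theta ** (torch.arange(0, head_dim, 2, dtype=torch.float32) / head_dim))

# head_dim=128 is an assumption, not a value from this config.
theta = ntk_scaled_theta(10000.0, 4096, 22528, head_dim=128)
inv_freq = rope_inv_freq(theta, head_dim=128)
```

Under this variant the effective base grows from 10000.0 by roughly 5.6x, so the low-frequency dimensions rotate slowly enough to cover the 22528-token window without retraining position embeddings.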
generation_config.json CHANGED
@@ -3,6 +3,6 @@
   "bos_token_id": 1,
   "eos_token_id": 2,
   "pad_token_id": 0,
-  "transformers_version": "4.48.2",
+  "transformers_version": "4.45.0",
   "use_cache": false
 }
model-00001-of-00002.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:73eabedbcef6e7dd78560bb95aced0f5a86de6c067b738503213f634c96f4fbd
-size 4995785984
+oid sha256:5e8a0875ed4decf5cbbf676868cbba137f3248a5a592a85597f31614080a25c6
+size 4987939472
model-00002-of-00002.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:af6a0fc3b40b1c9d005467296052d6039bf6736955a54e0d2427445f791042b8
-size 491849664
+oid sha256:9f361fe0361ab0e101d95f5161ee1a724501ae664e0d27c28496d9288b71ebc3
+size 512102640
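The two .safetensors entries above are git-LFS pointer files: oid is the SHA-256 of the actual shard and size is its byte count. A small sketch for verifying a downloaded shard against its pointer, using the values from this commit and assuming you run it from a local checkout:

```python
import hashlib

def lfs_digest(path: str, chunk_bytes: int = 1 << 20) -> tuple[str, int]:
    # Stream the file and return (sha256 hex digest, size in bytes),
    # matching the oid/size fields of a git-LFS pointer.
    h = hashlib.sha256()
    size = 0
    with open(path, "rb") as f:
        while chunk := f.read(chunk_bytes):
            h.update(chunk)
            size += len(chunk)
    return h.hexdigest(), size

digest, size = lfs_digest("model-00001-of-00002.safetensors")
assert digest == "5e8a0875ed4decf5cbbf676868cbba137f3248a5a592a85597f31614080a25c6"
assert size == 4987939472
```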
model.safetensors.index.json CHANGED
@@ -1,6 +1,6 @@
 {
   "metadata": {
-    "total_size": 5487609216
+    "total_size": 5500015680
   },
   "weight_map": {
     "model.embed_tokens.weight": "model-00001-of-00002.safetensors",
@@ -174,7 +174,7 @@
   "model.layers.31.pre_moe_layernorm.weight": "model-00001-of-00002.safetensors",
   "model.layers.32.gla.b_proj.weight": "model-00002-of-00002.safetensors",
   "model.layers.32.gla.k_conv1d.weight": "model-00002-of-00002.safetensors",
-  "model.layers.32.gla.k_proj.weight": "model-00001-of-00002.safetensors",
+  "model.layers.32.gla.k_proj.weight": "model-00002-of-00002.safetensors",
   "model.layers.32.gla.o_norm.weight": "model-00002-of-00002.safetensors",
   "model.layers.32.gla.o_proj.weight": "model-00002-of-00002.safetensors",
   "model.layers.32.gla.q_conv1d.weight": "model-00002-of-00002.safetensors",
modeling_jamba.py CHANGED
@@ -59,6 +59,7 @@ from transformers.utils import (
 from transformers.utils.import_utils import is_torch_fx_available
 from .configuration_jamba import JambaConfig
 from torch.utils.checkpoint import checkpoint
+from transformers.generation.utils import GenerationMixin


 # try except block so it'll work with trust_remote_code. Later we can have `if is_flash_attn_2_available():`
@@ -3664,7 +3665,7 @@ class JambaModel(JambaPreTrainedModel):


 # Adapted from transformers.models.mixtral.modeling_mixtral.MixtralForCausalLM with MIXTRAL->JAMBA, Mixtral->Jamba
-class JambaForCausalLM(JambaPreTrainedModel):
+class JambaForCausalLM(JambaPreTrainedModel, GenerationMixin):
     _tied_weights_keys = ["lm_head.weight"]

     def __init__(self, config: JambaConfig):
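Adding GenerationMixin to the class bases is the forward-compatible pattern: recent transformers releases no longer have PreTrainedModel provide generate() implicitly, so models shipped via trust_remote_code must inherit the mixin explicitly. A quick check that the change took effect, assuming "." is a local checkout of this repo:

```python
from transformers import AutoModelForCausalLM
from transformers.generation.utils import GenerationMixin

# trust_remote_code is required because JambaForCausalLM is defined in
# this repo's modeling_jamba.py, not in the transformers library itself.
model = AutoModelForCausalLM.from_pretrained(".", trust_remote_code=True)

assert isinstance(model, GenerationMixin)  # holds after this commit
assert model.can_generate()                # GenerationMixin-backed capability check
```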