diff --git a/.gitattributes b/.gitattributes index b608da52cc01e1d5e396ff911cfc2f139e5d17be..37c424a39a90e613f2739522e2a4821347a4dae2 100644 --- a/.gitattributes +++ b/.gitattributes @@ -5262,3 +5262,12 @@ neuronxcc-2.21.18209.0+043b1bf7/MODULE_f9260d832dabcf299e0e+877608f3/model.neff neuronxcc-2.21.18209.0+043b1bf7/MODULE_1b5caf61147adc2d934e+747527b0/model.neff filter=lfs diff=lfs merge=lfs -text neuronxcc-2.21.18209.0+043b1bf7/MODULE_1b5caf61147adc2d934e+747527b0/wrapped_neff.hlo filter=lfs diff=lfs merge=lfs -text neuronxcc-2.21.18209.0+043b1bf7/MODULE_58bfab3ea35f7cda10d3+877608f3/model.neff filter=lfs diff=lfs merge=lfs -text +neuronxcc-2.21.18209.0+043b1bf7/MODULE_04def5b319953baacddd+a9d440f5/model.neff filter=lfs diff=lfs merge=lfs -text +neuronxcc-2.21.18209.0+043b1bf7/MODULE_04def5b319953baacddd+a9d440f5/wrapped_neff.hlo filter=lfs diff=lfs merge=lfs -text +neuronxcc-2.21.18209.0+043b1bf7/MODULE_63b08aa574a103e133be+a9d440f5/model.neff filter=lfs diff=lfs merge=lfs -text +neuronxcc-2.21.18209.0+043b1bf7/MODULE_63b08aa574a103e133be+a9d440f5/wrapped_neff.hlo filter=lfs diff=lfs merge=lfs -text +neuronxcc-2.21.18209.0+043b1bf7/MODULE_7585b7b81ecc283af772+a9d440f5/model.neff filter=lfs diff=lfs merge=lfs -text +neuronxcc-2.21.18209.0+043b1bf7/MODULE_7585b7b81ecc283af772+a9d440f5/wrapped_neff.hlo filter=lfs diff=lfs merge=lfs -text +neuronxcc-2.21.18209.0+043b1bf7/MODULE_a89678b39464c33c1815+ed72d204/model.neff filter=lfs diff=lfs merge=lfs -text +neuronxcc-2.21.18209.0+043b1bf7/MODULE_bae931052fc7117dae12+a9d440f5/model.neff filter=lfs diff=lfs merge=lfs -text +neuronxcc-2.21.18209.0+043b1bf7/MODULE_bae931052fc7117dae12+a9d440f5/wrapped_neff.hlo filter=lfs diff=lfs merge=lfs -text diff --git a/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/granite/ibm-granite/granite-3.1-2b-instruct/1002e526666aa6d374df.json b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/granite/ibm-granite/granite-3.1-2b-instruct/1002e526666aa6d374df.json new file mode 100644 index 0000000000000000000000000000000000000000..e63e9bd705eac6f2e74a5a43a22b2b53c8a04c02 --- /dev/null +++ b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/granite/ibm-granite/granite-3.1-2b-instruct/1002e526666aa6d374df.json @@ -0,0 +1,58 @@ +{ + "_entry_class": "SingleModelCacheEntry", + "_model_id": "ibm-granite/granite-3.1-2b-instruct", + "_task": "text-generation", + "architectures": [ + "GraniteForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.1, + "attention_multiplier": 0.015625, + "embedding_multiplier": 12.0, + "hidden_act": "silu", + "hidden_size": 2048, + "initializer_range": 0.02, + "intermediate_size": 8192, + "logits_scaling": 8.0, + "max_position_embeddings": 131072, + "mlp_bias": false, + "model_type": "granite", + "neuron": { + "_serialized_key": "NxDNeuronConfig", + "batch_size": 8, + "capacity_factor": null, + "checkpoint_id": "ibm-granite/granite-3.1-2b-instruct", + "checkpoint_revision": "bbc2aed595bd38bd770263dc3ab831db9794441d", + "continuous_batching": true, + "enable_bucketing": false, + "ep_degree": 1, + "fused_qkv": true, + "glu_mlp": true, + "local_ranks_size": 2, + "max_batch_size": 8, + "max_context_length": 4096, + "max_topk": 256, + "n_active_tokens": 4096, + "neuronxcc_version": "2.21.18209.0+043b1bf7", + "on_device_sampling": true, + "optimum_neuron_version": "0.4.0", + "output_logits": false, + "pp_degree": 1, + "sequence_length": 4096, + "speculation_length": 0, + "start_rank_id": 0, + "target": "trn1", + "torch_dtype": "bfloat16", + "tp_degree": 2 + }, + "num_attention_heads": 32, + "num_hidden_layers": 40, + "num_key_value_heads": 8, + "residual_multiplier": 0.22, + "rms_norm_eps": 1e-05, + "rope_scaling": null, + "rope_theta": 5000000.0, + "tie_word_embeddings": true, + "use_cache": true, + "vocab_size": 49155 +} \ No newline at end of file diff --git a/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/granite/ibm-granite/granite-3.1-2b-instruct/415a488f8e9bfd810f69.json b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/granite/ibm-granite/granite-3.1-2b-instruct/415a488f8e9bfd810f69.json new file mode 100644 index 0000000000000000000000000000000000000000..51229028c94e1067132f8b5e97340014092ebcdd --- /dev/null +++ b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/granite/ibm-granite/granite-3.1-2b-instruct/415a488f8e9bfd810f69.json @@ -0,0 +1,58 @@ +{ + "_entry_class": "SingleModelCacheEntry", + "_model_id": "ibm-granite/granite-3.1-2b-instruct", + "_task": "text-generation", + "architectures": [ + "GraniteForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.1, + "attention_multiplier": 0.015625, + "embedding_multiplier": 12.0, + "hidden_act": "silu", + "hidden_size": 2048, + "initializer_range": 0.02, + "intermediate_size": 8192, + "logits_scaling": 8.0, + "max_position_embeddings": 131072, + "mlp_bias": false, + "model_type": "granite", + "neuron": { + "_serialized_key": "NxDNeuronConfig", + "batch_size": 1, + "capacity_factor": null, + "checkpoint_id": "ibm-granite/granite-3.1-2b-instruct", + "checkpoint_revision": "bbc2aed595bd38bd770263dc3ab831db9794441d", + "continuous_batching": false, + "enable_bucketing": false, + "ep_degree": 1, + "fused_qkv": true, + "glu_mlp": true, + "local_ranks_size": 8, + "max_batch_size": 1, + "max_context_length": 4096, + "max_topk": 256, + "n_active_tokens": 4096, + "neuronxcc_version": "2.21.18209.0+043b1bf7", + "on_device_sampling": true, + "optimum_neuron_version": "0.4.0", + "output_logits": false, + "pp_degree": 1, + "sequence_length": 4096, + "speculation_length": 0, + "start_rank_id": 0, + "target": "trn1", + "torch_dtype": "bfloat16", + "tp_degree": 8 + }, + "num_attention_heads": 32, + "num_hidden_layers": 40, + "num_key_value_heads": 8, + "residual_multiplier": 0.22, + "rms_norm_eps": 1e-05, + "rope_scaling": null, + "rope_theta": 5000000.0, + "tie_word_embeddings": true, + "use_cache": true, + "vocab_size": 49155 +} \ No newline at end of file diff --git a/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/granite/ibm-granite/granite-3.1-2b-instruct/83949fddd59377cbb674.json b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/granite/ibm-granite/granite-3.1-2b-instruct/83949fddd59377cbb674.json new file mode 100644 index 0000000000000000000000000000000000000000..bb3c96798046b480d0d16970d5b8ab3708ecede9 --- /dev/null +++ b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/granite/ibm-granite/granite-3.1-2b-instruct/83949fddd59377cbb674.json @@ -0,0 +1,58 @@ +{ + "_entry_class": "SingleModelCacheEntry", + "_model_id": "ibm-granite/granite-3.1-2b-instruct", + "_task": "text-generation", + "architectures": [ + "GraniteForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.1, + "attention_multiplier": 0.015625, + "embedding_multiplier": 12.0, + "hidden_act": "silu", + "hidden_size": 2048, + "initializer_range": 0.02, + "intermediate_size": 8192, + "logits_scaling": 8.0, + "max_position_embeddings": 131072, + "mlp_bias": false, + "model_type": "granite", + "neuron": { + "_serialized_key": "NxDNeuronConfig", + "batch_size": 1, + "capacity_factor": null, + "checkpoint_id": "ibm-granite/granite-3.1-2b-instruct", + "checkpoint_revision": "bbc2aed595bd38bd770263dc3ab831db9794441d", + "continuous_batching": false, + "enable_bucketing": false, + "ep_degree": 1, + "fused_qkv": true, + "glu_mlp": true, + "local_ranks_size": 2, + "max_batch_size": 1, + "max_context_length": 4096, + "max_topk": 256, + "n_active_tokens": 4096, + "neuronxcc_version": "2.21.18209.0+043b1bf7", + "on_device_sampling": true, + "optimum_neuron_version": "0.4.0", + "output_logits": false, + "pp_degree": 1, + "sequence_length": 4096, + "speculation_length": 0, + "start_rank_id": 0, + "target": "trn1", + "torch_dtype": "bfloat16", + "tp_degree": 2 + }, + "num_attention_heads": 32, + "num_hidden_layers": 40, + "num_key_value_heads": 8, + "residual_multiplier": 0.22, + "rms_norm_eps": 1e-05, + "rope_scaling": null, + "rope_theta": 5000000.0, + "tie_word_embeddings": true, + "use_cache": true, + "vocab_size": 49155 +} \ No newline at end of file diff --git a/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/granite/ibm-granite/granite-3.1-2b-instruct/a4e730448c44c446c2c5.json b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/granite/ibm-granite/granite-3.1-2b-instruct/a4e730448c44c446c2c5.json new file mode 100644 index 0000000000000000000000000000000000000000..6082f47736e645aeefc73034f7f80f64b57e84d5 --- /dev/null +++ b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/granite/ibm-granite/granite-3.1-2b-instruct/a4e730448c44c446c2c5.json @@ -0,0 +1,58 @@ +{ + "_entry_class": "SingleModelCacheEntry", + "_model_id": "ibm-granite/granite-3.1-2b-instruct", + "_task": "text-generation", + "architectures": [ + "GraniteForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.1, + "attention_multiplier": 0.015625, + "embedding_multiplier": 12.0, + "hidden_act": "silu", + "hidden_size": 2048, + "initializer_range": 0.02, + "intermediate_size": 8192, + "logits_scaling": 8.0, + "max_position_embeddings": 131072, + "mlp_bias": false, + "model_type": "granite", + "neuron": { + "_serialized_key": "NxDNeuronConfig", + "batch_size": 32, + "capacity_factor": null, + "checkpoint_id": "ibm-granite/granite-3.1-2b-instruct", + "checkpoint_revision": "bbc2aed595bd38bd770263dc3ab831db9794441d", + "continuous_batching": true, + "enable_bucketing": false, + "ep_degree": 1, + "fused_qkv": true, + "glu_mlp": true, + "local_ranks_size": 8, + "max_batch_size": 32, + "max_context_length": 4096, + "max_topk": 256, + "n_active_tokens": 4096, + "neuronxcc_version": "2.21.18209.0+043b1bf7", + "on_device_sampling": true, + "optimum_neuron_version": "0.4.0", + "output_logits": false, + "pp_degree": 1, + "sequence_length": 4096, + "speculation_length": 0, + "start_rank_id": 0, + "target": "trn1", + "torch_dtype": "bfloat16", + "tp_degree": 8 + }, + "num_attention_heads": 32, + "num_hidden_layers": 40, + "num_key_value_heads": 8, + "residual_multiplier": 0.22, + "rms_norm_eps": 1e-05, + "rope_scaling": null, + "rope_theta": 5000000.0, + "tie_word_embeddings": true, + "use_cache": true, + "vocab_size": 49155 +} \ No newline at end of file diff --git a/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/granite/ibm-granite/granite-3.1-8b-instruct/08eeed134fa4e527271c.json b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/granite/ibm-granite/granite-3.1-8b-instruct/08eeed134fa4e527271c.json new file mode 100644 index 0000000000000000000000000000000000000000..9252ef159c857644e6bd528d516e7d241d47b9fd --- /dev/null +++ b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/granite/ibm-granite/granite-3.1-8b-instruct/08eeed134fa4e527271c.json @@ -0,0 +1,58 @@ +{ + "_entry_class": "SingleModelCacheEntry", + "_model_id": "ibm-granite/granite-3.1-8b-instruct", + "_task": "text-generation", + "architectures": [ + "GraniteForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.1, + "attention_multiplier": 0.0078125, + "embedding_multiplier": 12.0, + "hidden_act": "silu", + "hidden_size": 4096, + "initializer_range": 0.02, + "intermediate_size": 12800, + "logits_scaling": 16.0, + "max_position_embeddings": 131072, + "mlp_bias": false, + "model_type": "granite", + "neuron": { + "_serialized_key": "NxDNeuronConfig", + "batch_size": 4, + "capacity_factor": null, + "checkpoint_id": "ibm-granite/granite-3.1-8b-instruct", + "checkpoint_revision": "4009206d5fc95d2e65a7b7633e159d6e97e25d35", + "continuous_batching": true, + "enable_bucketing": false, + "ep_degree": 1, + "fused_qkv": true, + "glu_mlp": true, + "local_ranks_size": 2, + "max_batch_size": 4, + "max_context_length": 4096, + "max_topk": 256, + "n_active_tokens": 4096, + "neuronxcc_version": "2.21.18209.0+043b1bf7", + "on_device_sampling": true, + "optimum_neuron_version": "0.4.0", + "output_logits": false, + "pp_degree": 1, + "sequence_length": 4096, + "speculation_length": 0, + "start_rank_id": 0, + "target": "trn1", + "torch_dtype": "bfloat16", + "tp_degree": 2 + }, + "num_attention_heads": 32, + "num_hidden_layers": 40, + "num_key_value_heads": 8, + "residual_multiplier": 0.22, + "rms_norm_eps": 1e-05, + "rope_scaling": null, + "rope_theta": 10000000.0, + "tie_word_embeddings": true, + "use_cache": true, + "vocab_size": 49155 +} \ No newline at end of file diff --git a/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/granite/ibm-granite/granite-3.1-8b-instruct/1fe296005f1eff947583.json b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/granite/ibm-granite/granite-3.1-8b-instruct/1fe296005f1eff947583.json new file mode 100644 index 0000000000000000000000000000000000000000..cd7840b6318be9018f4855f071ec5c9d72eff8b9 --- /dev/null +++ b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/granite/ibm-granite/granite-3.1-8b-instruct/1fe296005f1eff947583.json @@ -0,0 +1,58 @@ +{ + "_entry_class": "SingleModelCacheEntry", + "_model_id": "ibm-granite/granite-3.1-8b-instruct", + "_task": "text-generation", + "architectures": [ + "GraniteForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.1, + "attention_multiplier": 0.0078125, + "embedding_multiplier": 12.0, + "hidden_act": "silu", + "hidden_size": 4096, + "initializer_range": 0.02, + "intermediate_size": 12800, + "logits_scaling": 16.0, + "max_position_embeddings": 131072, + "mlp_bias": false, + "model_type": "granite", + "neuron": { + "_serialized_key": "NxDNeuronConfig", + "batch_size": 1, + "capacity_factor": null, + "checkpoint_id": "ibm-granite/granite-3.1-8b-instruct", + "checkpoint_revision": "4009206d5fc95d2e65a7b7633e159d6e97e25d35", + "continuous_batching": false, + "enable_bucketing": false, + "ep_degree": 1, + "fused_qkv": true, + "glu_mlp": true, + "local_ranks_size": 2, + "max_batch_size": 1, + "max_context_length": 4096, + "max_topk": 256, + "n_active_tokens": 4096, + "neuronxcc_version": "2.21.18209.0+043b1bf7", + "on_device_sampling": true, + "optimum_neuron_version": "0.4.0", + "output_logits": false, + "pp_degree": 1, + "sequence_length": 4096, + "speculation_length": 0, + "start_rank_id": 0, + "target": "trn1", + "torch_dtype": "bfloat16", + "tp_degree": 2 + }, + "num_attention_heads": 32, + "num_hidden_layers": 40, + "num_key_value_heads": 8, + "residual_multiplier": 0.22, + "rms_norm_eps": 1e-05, + "rope_scaling": null, + "rope_theta": 10000000.0, + "tie_word_embeddings": true, + "use_cache": true, + "vocab_size": 49155 +} \ No newline at end of file diff --git a/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/granite/ibm-granite/granite-3.1-8b-instruct/30dc5285b1aae437b520.json b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/granite/ibm-granite/granite-3.1-8b-instruct/30dc5285b1aae437b520.json new file mode 100644 index 0000000000000000000000000000000000000000..62c747e4b0f8ed8a2f2ae5c1114631e855b1d2c3 --- /dev/null +++ b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/granite/ibm-granite/granite-3.1-8b-instruct/30dc5285b1aae437b520.json @@ -0,0 +1,58 @@ +{ + "_entry_class": "SingleModelCacheEntry", + "_model_id": "ibm-granite/granite-3.1-8b-instruct", + "_task": "text-generation", + "architectures": [ + "GraniteForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.1, + "attention_multiplier": 0.0078125, + "embedding_multiplier": 12.0, + "hidden_act": "silu", + "hidden_size": 4096, + "initializer_range": 0.02, + "intermediate_size": 12800, + "logits_scaling": 16.0, + "max_position_embeddings": 131072, + "mlp_bias": false, + "model_type": "granite", + "neuron": { + "_serialized_key": "NxDNeuronConfig", + "batch_size": 8, + "capacity_factor": null, + "checkpoint_id": "ibm-granite/granite-3.1-8b-instruct", + "checkpoint_revision": "4009206d5fc95d2e65a7b7633e159d6e97e25d35", + "continuous_batching": true, + "enable_bucketing": false, + "ep_degree": 1, + "fused_qkv": true, + "glu_mlp": true, + "local_ranks_size": 2, + "max_batch_size": 8, + "max_context_length": 4096, + "max_topk": 256, + "n_active_tokens": 4096, + "neuronxcc_version": "2.21.18209.0+043b1bf7", + "on_device_sampling": true, + "optimum_neuron_version": "0.4.0", + "output_logits": false, + "pp_degree": 1, + "sequence_length": 4096, + "speculation_length": 0, + "start_rank_id": 0, + "target": "trn1", + "torch_dtype": "bfloat16", + "tp_degree": 2 + }, + "num_attention_heads": 32, + "num_hidden_layers": 40, + "num_key_value_heads": 8, + "residual_multiplier": 0.22, + "rms_norm_eps": 1e-05, + "rope_scaling": null, + "rope_theta": 10000000.0, + "tie_word_embeddings": true, + "use_cache": true, + "vocab_size": 49155 +} \ No newline at end of file diff --git a/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/granite/ibm-granite/granite-3.1-8b-instruct/64c646616b24b2a8d43c.json b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/granite/ibm-granite/granite-3.1-8b-instruct/64c646616b24b2a8d43c.json new file mode 100644 index 0000000000000000000000000000000000000000..fc40aa125eba130ec493ca7dc05a4d27cb2e7f77 --- /dev/null +++ b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/granite/ibm-granite/granite-3.1-8b-instruct/64c646616b24b2a8d43c.json @@ -0,0 +1,58 @@ +{ + "_entry_class": "SingleModelCacheEntry", + "_model_id": "ibm-granite/granite-3.1-8b-instruct", + "_task": "text-generation", + "architectures": [ + "GraniteForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.1, + "attention_multiplier": 0.0078125, + "embedding_multiplier": 12.0, + "hidden_act": "silu", + "hidden_size": 4096, + "initializer_range": 0.02, + "intermediate_size": 12800, + "logits_scaling": 16.0, + "max_position_embeddings": 131072, + "mlp_bias": false, + "model_type": "granite", + "neuron": { + "_serialized_key": "NxDNeuronConfig", + "batch_size": 1, + "capacity_factor": null, + "checkpoint_id": "ibm-granite/granite-3.1-8b-instruct", + "checkpoint_revision": "4009206d5fc95d2e65a7b7633e159d6e97e25d35", + "continuous_batching": false, + "enable_bucketing": false, + "ep_degree": 1, + "fused_qkv": true, + "glu_mlp": true, + "local_ranks_size": 8, + "max_batch_size": 1, + "max_context_length": 4096, + "max_topk": 256, + "n_active_tokens": 4096, + "neuronxcc_version": "2.21.18209.0+043b1bf7", + "on_device_sampling": true, + "optimum_neuron_version": "0.4.0", + "output_logits": false, + "pp_degree": 1, + "sequence_length": 4096, + "speculation_length": 0, + "start_rank_id": 0, + "target": "trn1", + "torch_dtype": "bfloat16", + "tp_degree": 8 + }, + "num_attention_heads": 32, + "num_hidden_layers": 40, + "num_key_value_heads": 8, + "residual_multiplier": 0.22, + "rms_norm_eps": 1e-05, + "rope_scaling": null, + "rope_theta": 10000000.0, + "tie_word_embeddings": true, + "use_cache": true, + "vocab_size": 49155 +} \ No newline at end of file diff --git a/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/granite/ibm-granite/granite-3.1-8b-instruct/9aed265427cc6cb86d4b.json b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/granite/ibm-granite/granite-3.1-8b-instruct/9aed265427cc6cb86d4b.json new file mode 100644 index 0000000000000000000000000000000000000000..ea4dcbcf1d8a6f8eaf35bb0116d8d367d55323e4 --- /dev/null +++ b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/granite/ibm-granite/granite-3.1-8b-instruct/9aed265427cc6cb86d4b.json @@ -0,0 +1,58 @@ +{ + "_entry_class": "SingleModelCacheEntry", + "_model_id": "ibm-granite/granite-3.1-8b-instruct", + "_task": "text-generation", + "architectures": [ + "GraniteForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.1, + "attention_multiplier": 0.0078125, + "embedding_multiplier": 12.0, + "hidden_act": "silu", + "hidden_size": 4096, + "initializer_range": 0.02, + "intermediate_size": 12800, + "logits_scaling": 16.0, + "max_position_embeddings": 131072, + "mlp_bias": false, + "model_type": "granite", + "neuron": { + "_serialized_key": "NxDNeuronConfig", + "batch_size": 32, + "capacity_factor": null, + "checkpoint_id": "ibm-granite/granite-3.1-8b-instruct", + "checkpoint_revision": "4009206d5fc95d2e65a7b7633e159d6e97e25d35", + "continuous_batching": true, + "enable_bucketing": false, + "ep_degree": 1, + "fused_qkv": true, + "glu_mlp": true, + "local_ranks_size": 8, + "max_batch_size": 32, + "max_context_length": 4096, + "max_topk": 256, + "n_active_tokens": 4096, + "neuronxcc_version": "2.21.18209.0+043b1bf7", + "on_device_sampling": true, + "optimum_neuron_version": "0.4.0", + "output_logits": false, + "pp_degree": 1, + "sequence_length": 4096, + "speculation_length": 0, + "start_rank_id": 0, + "target": "trn1", + "torch_dtype": "bfloat16", + "tp_degree": 8 + }, + "num_attention_heads": 32, + "num_hidden_layers": 40, + "num_key_value_heads": 8, + "residual_multiplier": 0.22, + "rms_norm_eps": 1e-05, + "rope_scaling": null, + "rope_theta": 10000000.0, + "tie_word_embeddings": true, + "use_cache": true, + "vocab_size": 49155 +} \ No newline at end of file diff --git a/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/llama/deepseek-ai/DeepSeek-R1-Distill-Llama-8B/0740ab092d02484487fb.json b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/llama/deepseek-ai/DeepSeek-R1-Distill-Llama-8B/0740ab092d02484487fb.json new file mode 100644 index 0000000000000000000000000000000000000000..6ac0daeead181972c85e3577821adb519e09c64a --- /dev/null +++ b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/llama/deepseek-ai/DeepSeek-R1-Distill-Llama-8B/0740ab092d02484487fb.json @@ -0,0 +1,62 @@ +{ + "_entry_class": "SingleModelCacheEntry", + "_model_id": "deepseek-ai/DeepSeek-R1-Distill-Llama-8B", + "_task": "text-generation", + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 4096, + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 131072, + "mlp_bias": false, + "model_type": "llama", + "neuron": { + "_serialized_key": "NxDNeuronConfig", + "batch_size": 4, + "capacity_factor": null, + "checkpoint_id": "deepseek-ai/DeepSeek-R1-Distill-Llama-8B", + "checkpoint_revision": "6a6f4aa4197940add57724a7707d069478df56b1", + "continuous_batching": true, + "enable_bucketing": false, + "ep_degree": 1, + "fused_qkv": true, + "glu_mlp": true, + "local_ranks_size": 2, + "max_batch_size": 4, + "max_context_length": 4096, + "max_topk": 256, + "n_active_tokens": 4096, + "neuronxcc_version": "2.21.18209.0+043b1bf7", + "on_device_sampling": true, + "optimum_neuron_version": "0.4.0", + "output_logits": false, + "pp_degree": 1, + "sequence_length": 4096, + "speculation_length": 0, + "start_rank_id": 0, + "target": "trn1", + "torch_dtype": "bfloat16", + "tp_degree": 2 + }, + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 8.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3" + }, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "use_cache": true, + "vocab_size": 128256 +} \ No newline at end of file diff --git a/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/llama/deepseek-ai/DeepSeek-R1-Distill-Llama-8B/0fc67cd324a7c1a05100.json b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/llama/deepseek-ai/DeepSeek-R1-Distill-Llama-8B/0fc67cd324a7c1a05100.json new file mode 100644 index 0000000000000000000000000000000000000000..1b1ee958323f2a039284f6539ac869df6c375c62 --- /dev/null +++ b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/llama/deepseek-ai/DeepSeek-R1-Distill-Llama-8B/0fc67cd324a7c1a05100.json @@ -0,0 +1,62 @@ +{ + "_entry_class": "SingleModelCacheEntry", + "_model_id": "deepseek-ai/DeepSeek-R1-Distill-Llama-8B", + "_task": "text-generation", + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 4096, + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 131072, + "mlp_bias": false, + "model_type": "llama", + "neuron": { + "_serialized_key": "NxDNeuronConfig", + "batch_size": 1, + "capacity_factor": null, + "checkpoint_id": "deepseek-ai/DeepSeek-R1-Distill-Llama-8B", + "checkpoint_revision": "6a6f4aa4197940add57724a7707d069478df56b1", + "continuous_batching": false, + "enable_bucketing": false, + "ep_degree": 1, + "fused_qkv": true, + "glu_mlp": true, + "local_ranks_size": 2, + "max_batch_size": 1, + "max_context_length": 4096, + "max_topk": 256, + "n_active_tokens": 4096, + "neuronxcc_version": "2.21.18209.0+043b1bf7", + "on_device_sampling": true, + "optimum_neuron_version": "0.4.0", + "output_logits": false, + "pp_degree": 1, + "sequence_length": 4096, + "speculation_length": 0, + "start_rank_id": 0, + "target": "trn1", + "torch_dtype": "bfloat16", + "tp_degree": 2 + }, + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 8.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3" + }, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "use_cache": true, + "vocab_size": 128256 +} \ No newline at end of file diff --git a/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/llama/deepseek-ai/DeepSeek-R1-Distill-Llama-8B/226856c4c5cdfd69aa89.json b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/llama/deepseek-ai/DeepSeek-R1-Distill-Llama-8B/226856c4c5cdfd69aa89.json new file mode 100644 index 0000000000000000000000000000000000000000..1c2a6b9bed2b0977515f9dbb98dc709376eeba91 --- /dev/null +++ b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/llama/deepseek-ai/DeepSeek-R1-Distill-Llama-8B/226856c4c5cdfd69aa89.json @@ -0,0 +1,62 @@ +{ + "_entry_class": "SingleModelCacheEntry", + "_model_id": "deepseek-ai/DeepSeek-R1-Distill-Llama-8B", + "_task": "text-generation", + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 4096, + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 131072, + "mlp_bias": false, + "model_type": "llama", + "neuron": { + "_serialized_key": "NxDNeuronConfig", + "batch_size": 32, + "capacity_factor": null, + "checkpoint_id": "deepseek-ai/DeepSeek-R1-Distill-Llama-8B", + "checkpoint_revision": "6a6f4aa4197940add57724a7707d069478df56b1", + "continuous_batching": true, + "enable_bucketing": false, + "ep_degree": 1, + "fused_qkv": true, + "glu_mlp": true, + "local_ranks_size": 8, + "max_batch_size": 32, + "max_context_length": 4096, + "max_topk": 256, + "n_active_tokens": 4096, + "neuronxcc_version": "2.21.18209.0+043b1bf7", + "on_device_sampling": true, + "optimum_neuron_version": "0.4.0", + "output_logits": false, + "pp_degree": 1, + "sequence_length": 4096, + "speculation_length": 0, + "start_rank_id": 0, + "target": "trn1", + "torch_dtype": "bfloat16", + "tp_degree": 8 + }, + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 8.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3" + }, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "use_cache": true, + "vocab_size": 128256 +} \ No newline at end of file diff --git a/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/llama/deepseek-ai/DeepSeek-R1-Distill-Llama-8B/341a9cc68e1b4eded838.json b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/llama/deepseek-ai/DeepSeek-R1-Distill-Llama-8B/341a9cc68e1b4eded838.json new file mode 100644 index 0000000000000000000000000000000000000000..6cd253d8be9d9f364bad8c752cb351df659e1df6 --- /dev/null +++ b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/llama/deepseek-ai/DeepSeek-R1-Distill-Llama-8B/341a9cc68e1b4eded838.json @@ -0,0 +1,62 @@ +{ + "_entry_class": "SingleModelCacheEntry", + "_model_id": "deepseek-ai/DeepSeek-R1-Distill-Llama-8B", + "_task": "text-generation", + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 4096, + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 131072, + "mlp_bias": false, + "model_type": "llama", + "neuron": { + "_serialized_key": "NxDNeuronConfig", + "batch_size": 8, + "capacity_factor": null, + "checkpoint_id": "deepseek-ai/DeepSeek-R1-Distill-Llama-8B", + "checkpoint_revision": "6a6f4aa4197940add57724a7707d069478df56b1", + "continuous_batching": true, + "enable_bucketing": false, + "ep_degree": 1, + "fused_qkv": true, + "glu_mlp": true, + "local_ranks_size": 8, + "max_batch_size": 8, + "max_context_length": 4096, + "max_topk": 256, + "n_active_tokens": 4096, + "neuronxcc_version": "2.21.18209.0+043b1bf7", + "on_device_sampling": true, + "optimum_neuron_version": "0.4.0", + "output_logits": false, + "pp_degree": 1, + "sequence_length": 4096, + "speculation_length": 0, + "start_rank_id": 0, + "target": "trn1", + "torch_dtype": "bfloat16", + "tp_degree": 8 + }, + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 8.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3" + }, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "use_cache": true, + "vocab_size": 128256 +} \ No newline at end of file diff --git a/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/llama/deepseek-ai/DeepSeek-R1-Distill-Llama-8B/3ac9f00c63887961a784.json b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/llama/deepseek-ai/DeepSeek-R1-Distill-Llama-8B/3ac9f00c63887961a784.json new file mode 100644 index 0000000000000000000000000000000000000000..8bf08519a91196d240190ee390fb591ff33a2d62 --- /dev/null +++ b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/llama/deepseek-ai/DeepSeek-R1-Distill-Llama-8B/3ac9f00c63887961a784.json @@ -0,0 +1,62 @@ +{ + "_entry_class": "SingleModelCacheEntry", + "_model_id": "deepseek-ai/DeepSeek-R1-Distill-Llama-8B", + "_task": "text-generation", + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 4096, + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 131072, + "mlp_bias": false, + "model_type": "llama", + "neuron": { + "_serialized_key": "NxDNeuronConfig", + "batch_size": 16, + "capacity_factor": null, + "checkpoint_id": "deepseek-ai/DeepSeek-R1-Distill-Llama-8B", + "checkpoint_revision": "6a6f4aa4197940add57724a7707d069478df56b1", + "continuous_batching": true, + "enable_bucketing": false, + "ep_degree": 1, + "fused_qkv": true, + "glu_mlp": true, + "local_ranks_size": 8, + "max_batch_size": 16, + "max_context_length": 4096, + "max_topk": 256, + "n_active_tokens": 4096, + "neuronxcc_version": "2.21.18209.0+043b1bf7", + "on_device_sampling": true, + "optimum_neuron_version": "0.4.0", + "output_logits": false, + "pp_degree": 1, + "sequence_length": 4096, + "speculation_length": 0, + "start_rank_id": 0, + "target": "trn1", + "torch_dtype": "bfloat16", + "tp_degree": 8 + }, + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 8.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3" + }, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "use_cache": true, + "vocab_size": 128256 +} \ No newline at end of file diff --git a/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/llama/deepseek-ai/DeepSeek-R1-Distill-Llama-8B/8d70cd76e737aaa4eaa4.json b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/llama/deepseek-ai/DeepSeek-R1-Distill-Llama-8B/8d70cd76e737aaa4eaa4.json new file mode 100644 index 0000000000000000000000000000000000000000..6189d59a41bb4bf011838e04fdddbcb67f0f4c29 --- /dev/null +++ b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/llama/deepseek-ai/DeepSeek-R1-Distill-Llama-8B/8d70cd76e737aaa4eaa4.json @@ -0,0 +1,62 @@ +{ + "_entry_class": "SingleModelCacheEntry", + "_model_id": "deepseek-ai/DeepSeek-R1-Distill-Llama-8B", + "_task": "text-generation", + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 4096, + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 131072, + "mlp_bias": false, + "model_type": "llama", + "neuron": { + "_serialized_key": "NxDNeuronConfig", + "batch_size": 4, + "capacity_factor": null, + "checkpoint_id": "deepseek-ai/DeepSeek-R1-Distill-Llama-8B", + "checkpoint_revision": "6a6f4aa4197940add57724a7707d069478df56b1", + "continuous_batching": true, + "enable_bucketing": false, + "ep_degree": 1, + "fused_qkv": true, + "glu_mlp": true, + "local_ranks_size": 8, + "max_batch_size": 4, + "max_context_length": 4096, + "max_topk": 256, + "n_active_tokens": 4096, + "neuronxcc_version": "2.21.18209.0+043b1bf7", + "on_device_sampling": true, + "optimum_neuron_version": "0.4.0", + "output_logits": false, + "pp_degree": 1, + "sequence_length": 4096, + "speculation_length": 0, + "start_rank_id": 0, + "target": "trn1", + "torch_dtype": "bfloat16", + "tp_degree": 8 + }, + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 8.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3" + }, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "use_cache": true, + "vocab_size": 128256 +} \ No newline at end of file diff --git a/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/llama/deepseek-ai/DeepSeek-R1-Distill-Llama-8B/ad6ef9f317fb8e1ab4f1.json b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/llama/deepseek-ai/DeepSeek-R1-Distill-Llama-8B/ad6ef9f317fb8e1ab4f1.json new file mode 100644 index 0000000000000000000000000000000000000000..c556fe95e4e659edb081a73aaabb34c77a7f621f --- /dev/null +++ b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/llama/deepseek-ai/DeepSeek-R1-Distill-Llama-8B/ad6ef9f317fb8e1ab4f1.json @@ -0,0 +1,62 @@ +{ + "_entry_class": "SingleModelCacheEntry", + "_model_id": "deepseek-ai/DeepSeek-R1-Distill-Llama-8B", + "_task": "text-generation", + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 4096, + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 131072, + "mlp_bias": false, + "model_type": "llama", + "neuron": { + "_serialized_key": "NxDNeuronConfig", + "batch_size": 8, + "capacity_factor": null, + "checkpoint_id": "deepseek-ai/DeepSeek-R1-Distill-Llama-8B", + "checkpoint_revision": "6a6f4aa4197940add57724a7707d069478df56b1", + "continuous_batching": true, + "enable_bucketing": false, + "ep_degree": 1, + "fused_qkv": true, + "glu_mlp": true, + "local_ranks_size": 2, + "max_batch_size": 8, + "max_context_length": 4096, + "max_topk": 256, + "n_active_tokens": 4096, + "neuronxcc_version": "2.21.18209.0+043b1bf7", + "on_device_sampling": true, + "optimum_neuron_version": "0.4.0", + "output_logits": false, + "pp_degree": 1, + "sequence_length": 4096, + "speculation_length": 0, + "start_rank_id": 0, + "target": "trn1", + "torch_dtype": "bfloat16", + "tp_degree": 2 + }, + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 8.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3" + }, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "use_cache": true, + "vocab_size": 128256 +} \ No newline at end of file diff --git a/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/llama/lmsys/vicuna-7b-v1.5/e6eb0587815d37abaf03.json b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/llama/lmsys/vicuna-7b-v1.5/e6eb0587815d37abaf03.json new file mode 100644 index 0000000000000000000000000000000000000000..72d10bff218df8270a23e5a3359d734325722cdf --- /dev/null +++ b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/llama/lmsys/vicuna-7b-v1.5/e6eb0587815d37abaf03.json @@ -0,0 +1,56 @@ +{ + "_entry_class": "SingleModelCacheEntry", + "_model_id": "lmsys/vicuna-7b-v1.5", + "_task": "text-generation", + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 4096, + "initializer_range": 0.02, + "intermediate_size": 11008, + "max_position_embeddings": 4096, + "mlp_bias": false, + "model_type": "llama", + "neuron": { + "_serialized_key": "NxDNeuronConfig", + "batch_size": 4, + "capacity_factor": null, + "checkpoint_id": "lmsys/vicuna-7b-v1.5", + "checkpoint_revision": "3321f76e3f527bd14065daf69dad9344000a201d", + "continuous_batching": true, + "enable_bucketing": false, + "ep_degree": 1, + "fused_qkv": true, + "glu_mlp": true, + "local_ranks_size": 2, + "max_batch_size": 4, + "max_context_length": 4096, + "max_topk": 256, + "n_active_tokens": 4096, + "neuronxcc_version": "2.21.18209.0+043b1bf7", + "on_device_sampling": true, + "optimum_neuron_version": "0.4.0", + "output_logits": false, + "pp_degree": 1, + "sequence_length": 4096, + "speculation_length": 0, + "start_rank_id": 0, + "target": "trn1", + "torch_dtype": "float16", + "tp_degree": 2 + }, + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 32, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": null, + "rope_theta": 10000.0, + "tie_word_embeddings": false, + "use_cache": true, + "vocab_size": 32000 +} \ No newline at end of file diff --git a/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/llama/lmsys/vicuna-7b-v1.5/eb93c62140353ba54657.json b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/llama/lmsys/vicuna-7b-v1.5/eb93c62140353ba54657.json new file mode 100644 index 0000000000000000000000000000000000000000..95a3fe70710290772537d010c27ed46377f23620 --- /dev/null +++ b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/llama/lmsys/vicuna-7b-v1.5/eb93c62140353ba54657.json @@ -0,0 +1,56 @@ +{ + "_entry_class": "SingleModelCacheEntry", + "_model_id": "lmsys/vicuna-7b-v1.5", + "_task": "text-generation", + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 4096, + "initializer_range": 0.02, + "intermediate_size": 11008, + "max_position_embeddings": 4096, + "mlp_bias": false, + "model_type": "llama", + "neuron": { + "_serialized_key": "NxDNeuronConfig", + "batch_size": 1, + "capacity_factor": null, + "checkpoint_id": "lmsys/vicuna-7b-v1.5", + "checkpoint_revision": "3321f76e3f527bd14065daf69dad9344000a201d", + "continuous_batching": false, + "enable_bucketing": false, + "ep_degree": 1, + "fused_qkv": true, + "glu_mlp": true, + "local_ranks_size": 2, + "max_batch_size": 1, + "max_context_length": 4096, + "max_topk": 256, + "n_active_tokens": 4096, + "neuronxcc_version": "2.21.18209.0+043b1bf7", + "on_device_sampling": true, + "optimum_neuron_version": "0.4.0", + "output_logits": false, + "pp_degree": 1, + "sequence_length": 4096, + "speculation_length": 0, + "start_rank_id": 0, + "target": "trn1", + "torch_dtype": "float16", + "tp_degree": 2 + }, + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 32, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": null, + "rope_theta": 10000.0, + "tie_word_embeddings": false, + "use_cache": true, + "vocab_size": 32000 +} \ No newline at end of file diff --git a/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/llama/princeton-nlp/Sheared-LLaMA-1.3B/8a8971a0da11451cb8a9.json b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/llama/princeton-nlp/Sheared-LLaMA-1.3B/8a8971a0da11451cb8a9.json new file mode 100644 index 0000000000000000000000000000000000000000..bae7ce7d084da97a632278a591992cc31fbc44e3 --- /dev/null +++ b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/llama/princeton-nlp/Sheared-LLaMA-1.3B/8a8971a0da11451cb8a9.json @@ -0,0 +1,56 @@ +{ + "_entry_class": "SingleModelCacheEntry", + "_model_id": "princeton-nlp/Sheared-LLaMA-1.3B", + "_task": "text-generation", + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 2048, + "initializer_range": 0.02, + "intermediate_size": 5504, + "max_position_embeddings": 4096, + "mlp_bias": false, + "model_type": "llama", + "neuron": { + "_serialized_key": "NxDNeuronConfig", + "batch_size": 4, + "capacity_factor": null, + "checkpoint_id": "princeton-nlp/Sheared-LLaMA-1.3B", + "checkpoint_revision": "a4b76938edbf571ea7d7d9904861cbdca08809b4", + "continuous_batching": true, + "enable_bucketing": false, + "ep_degree": 1, + "fused_qkv": true, + "glu_mlp": true, + "local_ranks_size": 2, + "max_batch_size": 4, + "max_context_length": 4096, + "max_topk": 256, + "n_active_tokens": 4096, + "neuronxcc_version": "2.21.18209.0+043b1bf7", + "on_device_sampling": true, + "optimum_neuron_version": "0.4.0", + "output_logits": false, + "pp_degree": 1, + "sequence_length": 4096, + "speculation_length": 0, + "start_rank_id": 0, + "target": "trn1", + "torch_dtype": "float16", + "tp_degree": 2 + }, + "num_attention_heads": 16, + "num_hidden_layers": 24, + "num_key_value_heads": 16, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": null, + "rope_theta": 10000.0, + "tie_word_embeddings": false, + "use_cache": true, + "vocab_size": 32000 +} \ No newline at end of file diff --git a/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/llama/princeton-nlp/Sheared-LLaMA-1.3B/d0e265b870b2f9fc91c5.json b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/llama/princeton-nlp/Sheared-LLaMA-1.3B/d0e265b870b2f9fc91c5.json new file mode 100644 index 0000000000000000000000000000000000000000..d6d04b382edef94cfa85fd62cfd83987f0154381 --- /dev/null +++ b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/llama/princeton-nlp/Sheared-LLaMA-1.3B/d0e265b870b2f9fc91c5.json @@ -0,0 +1,56 @@ +{ + "_entry_class": "SingleModelCacheEntry", + "_model_id": "princeton-nlp/Sheared-LLaMA-1.3B", + "_task": "text-generation", + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 2048, + "initializer_range": 0.02, + "intermediate_size": 5504, + "max_position_embeddings": 4096, + "mlp_bias": false, + "model_type": "llama", + "neuron": { + "_serialized_key": "NxDNeuronConfig", + "batch_size": 1, + "capacity_factor": null, + "checkpoint_id": "princeton-nlp/Sheared-LLaMA-1.3B", + "checkpoint_revision": "a4b76938edbf571ea7d7d9904861cbdca08809b4", + "continuous_batching": false, + "enable_bucketing": false, + "ep_degree": 1, + "fused_qkv": true, + "glu_mlp": true, + "local_ranks_size": 2, + "max_batch_size": 1, + "max_context_length": 4096, + "max_topk": 256, + "n_active_tokens": 4096, + "neuronxcc_version": "2.21.18209.0+043b1bf7", + "on_device_sampling": true, + "optimum_neuron_version": "0.4.0", + "output_logits": false, + "pp_degree": 1, + "sequence_length": 4096, + "speculation_length": 0, + "start_rank_id": 0, + "target": "trn1", + "torch_dtype": "float16", + "tp_degree": 2 + }, + "num_attention_heads": 16, + "num_hidden_layers": 24, + "num_key_value_heads": 16, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": null, + "rope_theta": 10000.0, + "tie_word_embeddings": false, + "use_cache": true, + "vocab_size": 32000 +} \ No newline at end of file diff --git a/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/llama/unsloth/Llama-3.2-1B/62b8172ee838a29e1e7f.json b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/llama/unsloth/Llama-3.2-1B/62b8172ee838a29e1e7f.json new file mode 100644 index 0000000000000000000000000000000000000000..3e26f24e6b43f0abcd0a42529a0be8ed25e1fb5c --- /dev/null +++ b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/llama/unsloth/Llama-3.2-1B/62b8172ee838a29e1e7f.json @@ -0,0 +1,63 @@ +{ + "_entry_class": "SingleModelCacheEntry", + "_model_id": "unsloth/Llama-3.2-1B", + "_task": "text-generation", + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "head_dim": 64, + "hidden_act": "silu", + "hidden_size": 2048, + "initializer_range": 0.02, + "intermediate_size": 8192, + "max_position_embeddings": 131072, + "mlp_bias": false, + "model_type": "llama", + "neuron": { + "_serialized_key": "NxDNeuronConfig", + "batch_size": 1, + "capacity_factor": null, + "checkpoint_id": "unsloth/Llama-3.2-1B", + "checkpoint_revision": "9535bd9b1d1dea6acafbdc4813b728796aeb28da", + "continuous_batching": false, + "enable_bucketing": false, + "ep_degree": 1, + "fused_qkv": true, + "glu_mlp": true, + "local_ranks_size": 2, + "max_batch_size": 1, + "max_context_length": 4096, + "max_topk": 256, + "n_active_tokens": 4096, + "neuronxcc_version": "2.21.18209.0+043b1bf7", + "on_device_sampling": true, + "optimum_neuron_version": "0.4.0", + "output_logits": false, + "pp_degree": 1, + "sequence_length": 4096, + "speculation_length": 0, + "start_rank_id": 0, + "target": "trn1", + "torch_dtype": "bfloat16", + "tp_degree": 2 + }, + "num_attention_heads": 32, + "num_hidden_layers": 16, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 32.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3" + }, + "rope_theta": 500000.0, + "tie_word_embeddings": true, + "unsloth_fixed": true, + "use_cache": true, + "vocab_size": 128256 +} \ No newline at end of file diff --git a/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/llama/unsloth/Llama-3.2-1B/f19511a53b988b95bb49.json b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/llama/unsloth/Llama-3.2-1B/f19511a53b988b95bb49.json new file mode 100644 index 0000000000000000000000000000000000000000..895a806f12520114ce0d5b43bf8c751bbf07faea --- /dev/null +++ b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/llama/unsloth/Llama-3.2-1B/f19511a53b988b95bb49.json @@ -0,0 +1,63 @@ +{ + "_entry_class": "SingleModelCacheEntry", + "_model_id": "unsloth/Llama-3.2-1B", + "_task": "text-generation", + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "head_dim": 64, + "hidden_act": "silu", + "hidden_size": 2048, + "initializer_range": 0.02, + "intermediate_size": 8192, + "max_position_embeddings": 131072, + "mlp_bias": false, + "model_type": "llama", + "neuron": { + "_serialized_key": "NxDNeuronConfig", + "batch_size": 4, + "capacity_factor": null, + "checkpoint_id": "unsloth/Llama-3.2-1B", + "checkpoint_revision": "9535bd9b1d1dea6acafbdc4813b728796aeb28da", + "continuous_batching": true, + "enable_bucketing": false, + "ep_degree": 1, + "fused_qkv": true, + "glu_mlp": true, + "local_ranks_size": 2, + "max_batch_size": 4, + "max_context_length": 4096, + "max_topk": 256, + "n_active_tokens": 4096, + "neuronxcc_version": "2.21.18209.0+043b1bf7", + "on_device_sampling": true, + "optimum_neuron_version": "0.4.0", + "output_logits": false, + "pp_degree": 1, + "sequence_length": 4096, + "speculation_length": 0, + "start_rank_id": 0, + "target": "trn1", + "torch_dtype": "bfloat16", + "tp_degree": 2 + }, + "num_attention_heads": 32, + "num_hidden_layers": 16, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 32.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3" + }, + "rope_theta": 500000.0, + "tie_word_embeddings": true, + "unsloth_fixed": true, + "use_cache": true, + "vocab_size": 128256 +} \ No newline at end of file diff --git a/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/llama4_text/meta-llama/Llama-4-Maverick-17B-128E-Instruct/115ac93cb9174db4e67f.json b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/llama4_text/meta-llama/Llama-4-Maverick-17B-128E-Instruct/115ac93cb9174db4e67f.json new file mode 100644 index 0000000000000000000000000000000000000000..861553713c607dfe9a8201373e72fd3ace2c7beb --- /dev/null +++ b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/llama4_text/meta-llama/Llama-4-Maverick-17B-128E-Instruct/115ac93cb9174db4e67f.json @@ -0,0 +1,190 @@ +{ + "_entry_class": "SingleModelCacheEntry", + "_model_id": "meta-llama/Llama-4-Maverick-17B-128E-Instruct", + "_task": "text-generation", + "attention_bias": false, + "attention_chunk_size": 8192, + "attention_dropout": 0.0, + "attn_scale": 0.1, + "attn_temperature_tuning": true, + "floor_scale": 8192, + "for_llm_compressor": false, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 5120, + "initializer_range": 0.02, + "interleave_moe_layer_step": 2, + "intermediate_size": 8192, + "intermediate_size_mlp": 16384, + "layer_types": [ + "chunked_attention", + "chunked_attention", + "chunked_attention", + "full_attention", + "chunked_attention", + "chunked_attention", + "chunked_attention", + "full_attention", + "chunked_attention", + "chunked_attention", + "chunked_attention", + "full_attention", + "chunked_attention", + "chunked_attention", + "chunked_attention", + "full_attention", + "chunked_attention", + "chunked_attention", + "chunked_attention", + "full_attention", + "chunked_attention", + "chunked_attention", + "chunked_attention", + "full_attention", + "chunked_attention", + "chunked_attention", + "chunked_attention", + "full_attention", + "chunked_attention", + "chunked_attention", + "chunked_attention", + "full_attention", + "chunked_attention", + "chunked_attention", + "chunked_attention", + "full_attention", + "chunked_attention", + "chunked_attention", + "chunked_attention", + "full_attention", + "chunked_attention", + "chunked_attention", + "chunked_attention", + "full_attention", + "chunked_attention", + "chunked_attention", + "chunked_attention", + "full_attention" + ], + "max_position_embeddings": 1048576, + "model_type": "llama4_text", + "moe_layers": [ + 1, + 3, + 5, + 7, + 9, + 11, + 13, + 15, + 17, + 19, + 21, + 23, + 25, + 27, + 29, + 31, + 33, + 35, + 37, + 39, + 41, + 43, + 45, + 47 + ], + "neuron": { + "_serialized_key": "NxDNeuronConfig", + "batch_size": 1, + "capacity_factor": null, + "checkpoint_id": "meta-llama/Llama-4-Maverick-17B-128E-Instruct", + "checkpoint_revision": "73d14711bcc77c16df3470856949c3764056b617", + "continuous_batching": false, + "enable_bucketing": false, + "ep_degree": 1, + "fused_qkv": false, + "glu_mlp": true, + "local_ranks_size": 64, + "max_batch_size": 1, + "max_context_length": 4096, + "max_topk": 256, + "n_active_tokens": 4096, + "neuronxcc_version": "2.21.18209.0+043b1bf7", + "on_device_sampling": true, + "optimum_neuron_version": "0.4.0", + "output_logits": false, + "pp_degree": 1, + "sequence_length": 4096, + "speculation_length": 0, + "start_rank_id": 0, + "target": "trn2", + "torch_dtype": "bfloat16", + "tp_degree": 64 + }, + "no_rope_layers": [ + 1, + 1, + 1, + 0, + 1, + 1, + 1, + 0, + 1, + 1, + 1, + 0, + 1, + 1, + 1, + 0, + 1, + 1, + 1, + 0, + 1, + 1, + 1, + 0, + 1, + 1, + 1, + 0, + 1, + 1, + 1, + 0, + 1, + 1, + 1, + 0, + 1, + 1, + 1, + 0, + 1, + 1, + 1, + 0, + 1, + 1, + 1, + 0 + ], + "num_attention_heads": 40, + "num_experts_per_tok": 1, + "num_hidden_layers": 48, + "num_key_value_heads": 8, + "num_local_experts": 128, + "output_router_logits": false, + "rms_norm_eps": 1e-05, + "rope_scaling": null, + "rope_theta": 500000.0, + "router_aux_loss_coef": 0.001, + "router_jitter_noise": 0.0, + "tie_word_embeddings": false, + "use_cache": true, + "use_qk_norm": false, + "vocab_size": 202048 +} \ No newline at end of file diff --git a/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/llama4_text/meta-llama/Llama-4-Maverick-17B-128E-Instruct/f70dea2be77b8d1dc8ed.json b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/llama4_text/meta-llama/Llama-4-Maverick-17B-128E-Instruct/f70dea2be77b8d1dc8ed.json new file mode 100644 index 0000000000000000000000000000000000000000..493c86b0a85b41d82554bdd577c3966417db47cd --- /dev/null +++ b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/llama4_text/meta-llama/Llama-4-Maverick-17B-128E-Instruct/f70dea2be77b8d1dc8ed.json @@ -0,0 +1,190 @@ +{ + "_entry_class": "SingleModelCacheEntry", + "_model_id": "meta-llama/Llama-4-Maverick-17B-128E-Instruct", + "_task": "text-generation", + "attention_bias": false, + "attention_chunk_size": 8192, + "attention_dropout": 0.0, + "attn_scale": 0.1, + "attn_temperature_tuning": true, + "floor_scale": 8192, + "for_llm_compressor": false, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 5120, + "initializer_range": 0.02, + "interleave_moe_layer_step": 2, + "intermediate_size": 8192, + "intermediate_size_mlp": 16384, + "layer_types": [ + "chunked_attention", + "chunked_attention", + "chunked_attention", + "full_attention", + "chunked_attention", + "chunked_attention", + "chunked_attention", + "full_attention", + "chunked_attention", + "chunked_attention", + "chunked_attention", + "full_attention", + "chunked_attention", + "chunked_attention", + "chunked_attention", + "full_attention", + "chunked_attention", + "chunked_attention", + "chunked_attention", + "full_attention", + "chunked_attention", + "chunked_attention", + "chunked_attention", + "full_attention", + "chunked_attention", + "chunked_attention", + "chunked_attention", + "full_attention", + "chunked_attention", + "chunked_attention", + "chunked_attention", + "full_attention", + "chunked_attention", + "chunked_attention", + "chunked_attention", + "full_attention", + "chunked_attention", + "chunked_attention", + "chunked_attention", + "full_attention", + "chunked_attention", + "chunked_attention", + "chunked_attention", + "full_attention", + "chunked_attention", + "chunked_attention", + "chunked_attention", + "full_attention" + ], + "max_position_embeddings": 1048576, + "model_type": "llama4_text", + "moe_layers": [ + 1, + 3, + 5, + 7, + 9, + 11, + 13, + 15, + 17, + 19, + 21, + 23, + 25, + 27, + 29, + 31, + 33, + 35, + 37, + 39, + 41, + 43, + 45, + 47 + ], + "neuron": { + "_serialized_key": "NxDNeuronConfig", + "batch_size": 4, + "capacity_factor": null, + "checkpoint_id": "meta-llama/Llama-4-Maverick-17B-128E-Instruct", + "checkpoint_revision": "73d14711bcc77c16df3470856949c3764056b617", + "continuous_batching": true, + "enable_bucketing": false, + "ep_degree": 1, + "fused_qkv": false, + "glu_mlp": true, + "local_ranks_size": 64, + "max_batch_size": 4, + "max_context_length": 4096, + "max_topk": 256, + "n_active_tokens": 4096, + "neuronxcc_version": "2.21.18209.0+043b1bf7", + "on_device_sampling": true, + "optimum_neuron_version": "0.4.0", + "output_logits": false, + "pp_degree": 1, + "sequence_length": 4096, + "speculation_length": 0, + "start_rank_id": 0, + "target": "trn2", + "torch_dtype": "bfloat16", + "tp_degree": 64 + }, + "no_rope_layers": [ + 1, + 1, + 1, + 0, + 1, + 1, + 1, + 0, + 1, + 1, + 1, + 0, + 1, + 1, + 1, + 0, + 1, + 1, + 1, + 0, + 1, + 1, + 1, + 0, + 1, + 1, + 1, + 0, + 1, + 1, + 1, + 0, + 1, + 1, + 1, + 0, + 1, + 1, + 1, + 0, + 1, + 1, + 1, + 0, + 1, + 1, + 1, + 0 + ], + "num_attention_heads": 40, + "num_experts_per_tok": 1, + "num_hidden_layers": 48, + "num_key_value_heads": 8, + "num_local_experts": 128, + "output_router_logits": false, + "rms_norm_eps": 1e-05, + "rope_scaling": null, + "rope_theta": 500000.0, + "router_aux_loss_coef": 0.001, + "router_jitter_noise": 0.0, + "tie_word_embeddings": false, + "use_cache": true, + "use_qk_norm": false, + "vocab_size": 202048 +} \ No newline at end of file diff --git a/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/llama4_text/meta-llama/Llama-4-Scout-17B-16E-Instruct/07656ae2a159358e76ff.json b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/llama4_text/meta-llama/Llama-4-Scout-17B-16E-Instruct/07656ae2a159358e76ff.json new file mode 100644 index 0000000000000000000000000000000000000000..a5a95cdc17c08377cacdfba42de86303d5bbcf71 --- /dev/null +++ b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/llama4_text/meta-llama/Llama-4-Scout-17B-16E-Instruct/07656ae2a159358e76ff.json @@ -0,0 +1,220 @@ +{ + "_entry_class": "SingleModelCacheEntry", + "_model_id": "meta-llama/Llama-4-Scout-17B-16E-Instruct", + "_task": "text-generation", + "attention_bias": false, + "attention_chunk_size": 8192, + "attention_dropout": 0.0, + "attn_scale": 0.1, + "attn_temperature_tuning": true, + "floor_scale": 8192, + "for_llm_compressor": false, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 5120, + "initializer_range": 0.02, + "interleave_moe_layer_step": 1, + "intermediate_size": 8192, + "intermediate_size_mlp": 16384, + "layer_types": [ + "chunked_attention", + "chunked_attention", + "chunked_attention", + "full_attention", + "chunked_attention", + "chunked_attention", + "chunked_attention", + "full_attention", + "chunked_attention", + "chunked_attention", + "chunked_attention", + "full_attention", + "chunked_attention", + "chunked_attention", + "chunked_attention", + "full_attention", + "chunked_attention", + "chunked_attention", + "chunked_attention", + "full_attention", + "chunked_attention", + "chunked_attention", + "chunked_attention", + "full_attention", + "chunked_attention", + "chunked_attention", + "chunked_attention", + "full_attention", + "chunked_attention", + "chunked_attention", + "chunked_attention", + "full_attention", + "chunked_attention", + "chunked_attention", + "chunked_attention", + "full_attention", + "chunked_attention", + "chunked_attention", + "chunked_attention", + "full_attention", + "chunked_attention", + "chunked_attention", + "chunked_attention", + "full_attention", + "chunked_attention", + "chunked_attention", + "chunked_attention", + "full_attention" + ], + "max_position_embeddings": 10485760, + "model_type": "llama4_text", + "moe_layers": [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + 16, + 17, + 18, + 19, + 20, + 21, + 22, + 23, + 24, + 25, + 26, + 27, + 28, + 29, + 30, + 31, + 32, + 33, + 34, + 35, + 36, + 37, + 38, + 39, + 40, + 41, + 42, + 43, + 44, + 45, + 46, + 47 + ], + "neuron": { + "_serialized_key": "NxDNeuronConfig", + "batch_size": 4, + "capacity_factor": null, + "checkpoint_id": "meta-llama/Llama-4-Scout-17B-16E-Instruct", + "checkpoint_revision": "92f3b1597a195b523d8d9e5700e57e4fbb8f20d3", + "continuous_batching": true, + "enable_bucketing": false, + "ep_degree": 1, + "fused_qkv": false, + "glu_mlp": true, + "local_ranks_size": 32, + "max_batch_size": 4, + "max_context_length": 4096, + "max_topk": 256, + "n_active_tokens": 4096, + "neuronxcc_version": "2.21.18209.0+043b1bf7", + "on_device_sampling": true, + "optimum_neuron_version": "0.4.0", + "output_logits": false, + "pp_degree": 1, + "sequence_length": 4096, + "speculation_length": 0, + "start_rank_id": 0, + "target": "trn1", + "torch_dtype": "bfloat16", + "tp_degree": 32 + }, + "no_rope_layers": [ + 1, + 1, + 1, + 0, + 1, + 1, + 1, + 0, + 1, + 1, + 1, + 0, + 1, + 1, + 1, + 0, + 1, + 1, + 1, + 0, + 1, + 1, + 1, + 0, + 1, + 1, + 1, + 0, + 1, + 1, + 1, + 0, + 1, + 1, + 1, + 0, + 1, + 1, + 1, + 0, + 1, + 1, + 1, + 0, + 1, + 1, + 1, + 0 + ], + "num_attention_heads": 40, + "num_experts_per_tok": 1, + "num_hidden_layers": 48, + "num_key_value_heads": 8, + "num_local_experts": 16, + "output_router_logits": false, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 16.0, + "high_freq_factor": 1.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3" + }, + "rope_theta": 500000.0, + "router_aux_loss_coef": 0.001, + "router_jitter_noise": 0.0, + "tie_word_embeddings": false, + "use_cache": true, + "use_qk_norm": true, + "vocab_size": 202048 +} \ No newline at end of file diff --git a/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/llama4_text/meta-llama/Llama-4-Scout-17B-16E-Instruct/0b9d19926bec30ac4419.json b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/llama4_text/meta-llama/Llama-4-Scout-17B-16E-Instruct/0b9d19926bec30ac4419.json new file mode 100644 index 0000000000000000000000000000000000000000..81ae6148b26eef1b5cc187dde3d1cf5dd50b9365 --- /dev/null +++ b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/llama4_text/meta-llama/Llama-4-Scout-17B-16E-Instruct/0b9d19926bec30ac4419.json @@ -0,0 +1,220 @@ +{ + "_entry_class": "SingleModelCacheEntry", + "_model_id": "meta-llama/Llama-4-Scout-17B-16E-Instruct", + "_task": "text-generation", + "attention_bias": false, + "attention_chunk_size": 8192, + "attention_dropout": 0.0, + "attn_scale": 0.1, + "attn_temperature_tuning": true, + "floor_scale": 8192, + "for_llm_compressor": false, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 5120, + "initializer_range": 0.02, + "interleave_moe_layer_step": 1, + "intermediate_size": 8192, + "intermediate_size_mlp": 16384, + "layer_types": [ + "chunked_attention", + "chunked_attention", + "chunked_attention", + "full_attention", + "chunked_attention", + "chunked_attention", + "chunked_attention", + "full_attention", + "chunked_attention", + "chunked_attention", + "chunked_attention", + "full_attention", + "chunked_attention", + "chunked_attention", + "chunked_attention", + "full_attention", + "chunked_attention", + "chunked_attention", + "chunked_attention", + "full_attention", + "chunked_attention", + "chunked_attention", + "chunked_attention", + "full_attention", + "chunked_attention", + "chunked_attention", + "chunked_attention", + "full_attention", + "chunked_attention", + "chunked_attention", + "chunked_attention", + "full_attention", + "chunked_attention", + "chunked_attention", + "chunked_attention", + "full_attention", + "chunked_attention", + "chunked_attention", + "chunked_attention", + "full_attention", + "chunked_attention", + "chunked_attention", + "chunked_attention", + "full_attention", + "chunked_attention", + "chunked_attention", + "chunked_attention", + "full_attention" + ], + "max_position_embeddings": 10485760, + "model_type": "llama4_text", + "moe_layers": [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + 16, + 17, + 18, + 19, + 20, + 21, + 22, + 23, + 24, + 25, + 26, + 27, + 28, + 29, + 30, + 31, + 32, + 33, + 34, + 35, + 36, + 37, + 38, + 39, + 40, + 41, + 42, + 43, + 44, + 45, + 46, + 47 + ], + "neuron": { + "_serialized_key": "NxDNeuronConfig", + "batch_size": 1, + "capacity_factor": null, + "checkpoint_id": "meta-llama/Llama-4-Scout-17B-16E-Instruct", + "checkpoint_revision": "92f3b1597a195b523d8d9e5700e57e4fbb8f20d3", + "continuous_batching": false, + "enable_bucketing": false, + "ep_degree": 1, + "fused_qkv": false, + "glu_mlp": true, + "local_ranks_size": 32, + "max_batch_size": 1, + "max_context_length": 4096, + "max_topk": 256, + "n_active_tokens": 4096, + "neuronxcc_version": "2.21.18209.0+043b1bf7", + "on_device_sampling": true, + "optimum_neuron_version": "0.4.0", + "output_logits": false, + "pp_degree": 1, + "sequence_length": 4096, + "speculation_length": 0, + "start_rank_id": 0, + "target": "trn1", + "torch_dtype": "bfloat16", + "tp_degree": 32 + }, + "no_rope_layers": [ + 1, + 1, + 1, + 0, + 1, + 1, + 1, + 0, + 1, + 1, + 1, + 0, + 1, + 1, + 1, + 0, + 1, + 1, + 1, + 0, + 1, + 1, + 1, + 0, + 1, + 1, + 1, + 0, + 1, + 1, + 1, + 0, + 1, + 1, + 1, + 0, + 1, + 1, + 1, + 0, + 1, + 1, + 1, + 0, + 1, + 1, + 1, + 0 + ], + "num_attention_heads": 40, + "num_experts_per_tok": 1, + "num_hidden_layers": 48, + "num_key_value_heads": 8, + "num_local_experts": 16, + "output_router_logits": false, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 16.0, + "high_freq_factor": 1.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3" + }, + "rope_theta": 500000.0, + "router_aux_loss_coef": 0.001, + "router_jitter_noise": 0.0, + "tie_word_embeddings": false, + "use_cache": true, + "use_qk_norm": true, + "vocab_size": 202048 +} \ No newline at end of file diff --git a/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/llama4_text/meta-llama/Llama-4-Scout-17B-16E-Instruct/236b23417ad1c79fbb5f.json b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/llama4_text/meta-llama/Llama-4-Scout-17B-16E-Instruct/236b23417ad1c79fbb5f.json new file mode 100644 index 0000000000000000000000000000000000000000..760ef03fab37a08b8b4e9581c03feedb905f3291 --- /dev/null +++ b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/llama4_text/meta-llama/Llama-4-Scout-17B-16E-Instruct/236b23417ad1c79fbb5f.json @@ -0,0 +1,220 @@ +{ + "_entry_class": "SingleModelCacheEntry", + "_model_id": "meta-llama/Llama-4-Scout-17B-16E-Instruct", + "_task": "text-generation", + "attention_bias": false, + "attention_chunk_size": 8192, + "attention_dropout": 0.0, + "attn_scale": 0.1, + "attn_temperature_tuning": true, + "floor_scale": 8192, + "for_llm_compressor": false, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 5120, + "initializer_range": 0.02, + "interleave_moe_layer_step": 1, + "intermediate_size": 8192, + "intermediate_size_mlp": 16384, + "layer_types": [ + "chunked_attention", + "chunked_attention", + "chunked_attention", + "full_attention", + "chunked_attention", + "chunked_attention", + "chunked_attention", + "full_attention", + "chunked_attention", + "chunked_attention", + "chunked_attention", + "full_attention", + "chunked_attention", + "chunked_attention", + "chunked_attention", + "full_attention", + "chunked_attention", + "chunked_attention", + "chunked_attention", + "full_attention", + "chunked_attention", + "chunked_attention", + "chunked_attention", + "full_attention", + "chunked_attention", + "chunked_attention", + "chunked_attention", + "full_attention", + "chunked_attention", + "chunked_attention", + "chunked_attention", + "full_attention", + "chunked_attention", + "chunked_attention", + "chunked_attention", + "full_attention", + "chunked_attention", + "chunked_attention", + "chunked_attention", + "full_attention", + "chunked_attention", + "chunked_attention", + "chunked_attention", + "full_attention", + "chunked_attention", + "chunked_attention", + "chunked_attention", + "full_attention" + ], + "max_position_embeddings": 10485760, + "model_type": "llama4_text", + "moe_layers": [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + 16, + 17, + 18, + 19, + 20, + 21, + 22, + 23, + 24, + 25, + 26, + 27, + 28, + 29, + 30, + 31, + 32, + 33, + 34, + 35, + 36, + 37, + 38, + 39, + 40, + 41, + 42, + 43, + 44, + 45, + 46, + 47 + ], + "neuron": { + "_serialized_key": "NxDNeuronConfig", + "batch_size": 8, + "capacity_factor": null, + "checkpoint_id": "meta-llama/Llama-4-Scout-17B-16E-Instruct", + "checkpoint_revision": "92f3b1597a195b523d8d9e5700e57e4fbb8f20d3", + "continuous_batching": true, + "enable_bucketing": false, + "ep_degree": 1, + "fused_qkv": false, + "glu_mlp": true, + "local_ranks_size": 32, + "max_batch_size": 8, + "max_context_length": 4096, + "max_topk": 256, + "n_active_tokens": 4096, + "neuronxcc_version": "2.21.18209.0+043b1bf7", + "on_device_sampling": true, + "optimum_neuron_version": "0.4.0", + "output_logits": false, + "pp_degree": 1, + "sequence_length": 4096, + "speculation_length": 0, + "start_rank_id": 0, + "target": "trn2", + "torch_dtype": "bfloat16", + "tp_degree": 32 + }, + "no_rope_layers": [ + 1, + 1, + 1, + 0, + 1, + 1, + 1, + 0, + 1, + 1, + 1, + 0, + 1, + 1, + 1, + 0, + 1, + 1, + 1, + 0, + 1, + 1, + 1, + 0, + 1, + 1, + 1, + 0, + 1, + 1, + 1, + 0, + 1, + 1, + 1, + 0, + 1, + 1, + 1, + 0, + 1, + 1, + 1, + 0, + 1, + 1, + 1, + 0 + ], + "num_attention_heads": 40, + "num_experts_per_tok": 1, + "num_hidden_layers": 48, + "num_key_value_heads": 8, + "num_local_experts": 16, + "output_router_logits": false, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 16.0, + "high_freq_factor": 1.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3" + }, + "rope_theta": 500000.0, + "router_aux_loss_coef": 0.001, + "router_jitter_noise": 0.0, + "tie_word_embeddings": false, + "use_cache": true, + "use_qk_norm": true, + "vocab_size": 202048 +} \ No newline at end of file diff --git a/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/llama4_text/meta-llama/Llama-4-Scout-17B-16E-Instruct/2a96ee4639be3796f16b.json b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/llama4_text/meta-llama/Llama-4-Scout-17B-16E-Instruct/2a96ee4639be3796f16b.json new file mode 100644 index 0000000000000000000000000000000000000000..f4319a8fb3b08f7b48387c3a06c0824f6238db3b --- /dev/null +++ b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/llama4_text/meta-llama/Llama-4-Scout-17B-16E-Instruct/2a96ee4639be3796f16b.json @@ -0,0 +1,220 @@ +{ + "_entry_class": "SingleModelCacheEntry", + "_model_id": "meta-llama/Llama-4-Scout-17B-16E-Instruct", + "_task": "text-generation", + "attention_bias": false, + "attention_chunk_size": 8192, + "attention_dropout": 0.0, + "attn_scale": 0.1, + "attn_temperature_tuning": true, + "floor_scale": 8192, + "for_llm_compressor": false, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 5120, + "initializer_range": 0.02, + "interleave_moe_layer_step": 1, + "intermediate_size": 8192, + "intermediate_size_mlp": 16384, + "layer_types": [ + "chunked_attention", + "chunked_attention", + "chunked_attention", + "full_attention", + "chunked_attention", + "chunked_attention", + "chunked_attention", + "full_attention", + "chunked_attention", + "chunked_attention", + "chunked_attention", + "full_attention", + "chunked_attention", + "chunked_attention", + "chunked_attention", + "full_attention", + "chunked_attention", + "chunked_attention", + "chunked_attention", + "full_attention", + "chunked_attention", + "chunked_attention", + "chunked_attention", + "full_attention", + "chunked_attention", + "chunked_attention", + "chunked_attention", + "full_attention", + "chunked_attention", + "chunked_attention", + "chunked_attention", + "full_attention", + "chunked_attention", + "chunked_attention", + "chunked_attention", + "full_attention", + "chunked_attention", + "chunked_attention", + "chunked_attention", + "full_attention", + "chunked_attention", + "chunked_attention", + "chunked_attention", + "full_attention", + "chunked_attention", + "chunked_attention", + "chunked_attention", + "full_attention" + ], + "max_position_embeddings": 10485760, + "model_type": "llama4_text", + "moe_layers": [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + 16, + 17, + 18, + 19, + 20, + 21, + 22, + 23, + 24, + 25, + 26, + 27, + 28, + 29, + 30, + 31, + 32, + 33, + 34, + 35, + 36, + 37, + 38, + 39, + 40, + 41, + 42, + 43, + 44, + 45, + 46, + 47 + ], + "neuron": { + "_serialized_key": "NxDNeuronConfig", + "batch_size": 16, + "capacity_factor": null, + "checkpoint_id": "meta-llama/Llama-4-Scout-17B-16E-Instruct", + "checkpoint_revision": "92f3b1597a195b523d8d9e5700e57e4fbb8f20d3", + "continuous_batching": true, + "enable_bucketing": false, + "ep_degree": 1, + "fused_qkv": false, + "glu_mlp": true, + "local_ranks_size": 32, + "max_batch_size": 16, + "max_context_length": 4096, + "max_topk": 256, + "n_active_tokens": 4096, + "neuronxcc_version": "2.21.18209.0+043b1bf7", + "on_device_sampling": true, + "optimum_neuron_version": "0.4.0", + "output_logits": false, + "pp_degree": 1, + "sequence_length": 4096, + "speculation_length": 0, + "start_rank_id": 0, + "target": "trn2", + "torch_dtype": "bfloat16", + "tp_degree": 32 + }, + "no_rope_layers": [ + 1, + 1, + 1, + 0, + 1, + 1, + 1, + 0, + 1, + 1, + 1, + 0, + 1, + 1, + 1, + 0, + 1, + 1, + 1, + 0, + 1, + 1, + 1, + 0, + 1, + 1, + 1, + 0, + 1, + 1, + 1, + 0, + 1, + 1, + 1, + 0, + 1, + 1, + 1, + 0, + 1, + 1, + 1, + 0, + 1, + 1, + 1, + 0 + ], + "num_attention_heads": 40, + "num_experts_per_tok": 1, + "num_hidden_layers": 48, + "num_key_value_heads": 8, + "num_local_experts": 16, + "output_router_logits": false, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 16.0, + "high_freq_factor": 1.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3" + }, + "rope_theta": 500000.0, + "router_aux_loss_coef": 0.001, + "router_jitter_noise": 0.0, + "tie_word_embeddings": false, + "use_cache": true, + "use_qk_norm": true, + "vocab_size": 202048 +} \ No newline at end of file diff --git a/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/llama4_text/meta-llama/Llama-4-Scout-17B-16E-Instruct/621024dbf42a03b7babc.json b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/llama4_text/meta-llama/Llama-4-Scout-17B-16E-Instruct/621024dbf42a03b7babc.json new file mode 100644 index 0000000000000000000000000000000000000000..9bf1357634971428503b4ebbcf53eb99d7686320 --- /dev/null +++ b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/llama4_text/meta-llama/Llama-4-Scout-17B-16E-Instruct/621024dbf42a03b7babc.json @@ -0,0 +1,220 @@ +{ + "_entry_class": "SingleModelCacheEntry", + "_model_id": "meta-llama/Llama-4-Scout-17B-16E-Instruct", + "_task": "text-generation", + "attention_bias": false, + "attention_chunk_size": 8192, + "attention_dropout": 0.0, + "attn_scale": 0.1, + "attn_temperature_tuning": true, + "floor_scale": 8192, + "for_llm_compressor": false, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 5120, + "initializer_range": 0.02, + "interleave_moe_layer_step": 1, + "intermediate_size": 8192, + "intermediate_size_mlp": 16384, + "layer_types": [ + "chunked_attention", + "chunked_attention", + "chunked_attention", + "full_attention", + "chunked_attention", + "chunked_attention", + "chunked_attention", + "full_attention", + "chunked_attention", + "chunked_attention", + "chunked_attention", + "full_attention", + "chunked_attention", + "chunked_attention", + "chunked_attention", + "full_attention", + "chunked_attention", + "chunked_attention", + "chunked_attention", + "full_attention", + "chunked_attention", + "chunked_attention", + "chunked_attention", + "full_attention", + "chunked_attention", + "chunked_attention", + "chunked_attention", + "full_attention", + "chunked_attention", + "chunked_attention", + "chunked_attention", + "full_attention", + "chunked_attention", + "chunked_attention", + "chunked_attention", + "full_attention", + "chunked_attention", + "chunked_attention", + "chunked_attention", + "full_attention", + "chunked_attention", + "chunked_attention", + "chunked_attention", + "full_attention", + "chunked_attention", + "chunked_attention", + "chunked_attention", + "full_attention" + ], + "max_position_embeddings": 10485760, + "model_type": "llama4_text", + "moe_layers": [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + 16, + 17, + 18, + 19, + 20, + 21, + 22, + 23, + 24, + 25, + 26, + 27, + 28, + 29, + 30, + 31, + 32, + 33, + 34, + 35, + 36, + 37, + 38, + 39, + 40, + 41, + 42, + 43, + 44, + 45, + 46, + 47 + ], + "neuron": { + "_serialized_key": "NxDNeuronConfig", + "batch_size": 4, + "capacity_factor": null, + "checkpoint_id": "meta-llama/Llama-4-Scout-17B-16E-Instruct", + "checkpoint_revision": "92f3b1597a195b523d8d9e5700e57e4fbb8f20d3", + "continuous_batching": true, + "enable_bucketing": false, + "ep_degree": 1, + "fused_qkv": false, + "glu_mlp": true, + "local_ranks_size": 32, + "max_batch_size": 4, + "max_context_length": 4096, + "max_topk": 256, + "n_active_tokens": 4096, + "neuronxcc_version": "2.21.18209.0+043b1bf7", + "on_device_sampling": true, + "optimum_neuron_version": "0.4.0", + "output_logits": false, + "pp_degree": 1, + "sequence_length": 4096, + "speculation_length": 0, + "start_rank_id": 0, + "target": "trn2", + "torch_dtype": "bfloat16", + "tp_degree": 32 + }, + "no_rope_layers": [ + 1, + 1, + 1, + 0, + 1, + 1, + 1, + 0, + 1, + 1, + 1, + 0, + 1, + 1, + 1, + 0, + 1, + 1, + 1, + 0, + 1, + 1, + 1, + 0, + 1, + 1, + 1, + 0, + 1, + 1, + 1, + 0, + 1, + 1, + 1, + 0, + 1, + 1, + 1, + 0, + 1, + 1, + 1, + 0, + 1, + 1, + 1, + 0 + ], + "num_attention_heads": 40, + "num_experts_per_tok": 1, + "num_hidden_layers": 48, + "num_key_value_heads": 8, + "num_local_experts": 16, + "output_router_logits": false, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 16.0, + "high_freq_factor": 1.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3" + }, + "rope_theta": 500000.0, + "router_aux_loss_coef": 0.001, + "router_jitter_noise": 0.0, + "tie_word_embeddings": false, + "use_cache": true, + "use_qk_norm": true, + "vocab_size": 202048 +} \ No newline at end of file diff --git a/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/llama4_text/meta-llama/Llama-4-Scout-17B-16E-Instruct/923fcd4cf259579b5e4a.json b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/llama4_text/meta-llama/Llama-4-Scout-17B-16E-Instruct/923fcd4cf259579b5e4a.json new file mode 100644 index 0000000000000000000000000000000000000000..8e5b96845d7e5ac136db6516082ed7ef15b0404c --- /dev/null +++ b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/llama4_text/meta-llama/Llama-4-Scout-17B-16E-Instruct/923fcd4cf259579b5e4a.json @@ -0,0 +1,220 @@ +{ + "_entry_class": "SingleModelCacheEntry", + "_model_id": "meta-llama/Llama-4-Scout-17B-16E-Instruct", + "_task": "text-generation", + "attention_bias": false, + "attention_chunk_size": 8192, + "attention_dropout": 0.0, + "attn_scale": 0.1, + "attn_temperature_tuning": true, + "floor_scale": 8192, + "for_llm_compressor": false, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 5120, + "initializer_range": 0.02, + "interleave_moe_layer_step": 1, + "intermediate_size": 8192, + "intermediate_size_mlp": 16384, + "layer_types": [ + "chunked_attention", + "chunked_attention", + "chunked_attention", + "full_attention", + "chunked_attention", + "chunked_attention", + "chunked_attention", + "full_attention", + "chunked_attention", + "chunked_attention", + "chunked_attention", + "full_attention", + "chunked_attention", + "chunked_attention", + "chunked_attention", + "full_attention", + "chunked_attention", + "chunked_attention", + "chunked_attention", + "full_attention", + "chunked_attention", + "chunked_attention", + "chunked_attention", + "full_attention", + "chunked_attention", + "chunked_attention", + "chunked_attention", + "full_attention", + "chunked_attention", + "chunked_attention", + "chunked_attention", + "full_attention", + "chunked_attention", + "chunked_attention", + "chunked_attention", + "full_attention", + "chunked_attention", + "chunked_attention", + "chunked_attention", + "full_attention", + "chunked_attention", + "chunked_attention", + "chunked_attention", + "full_attention", + "chunked_attention", + "chunked_attention", + "chunked_attention", + "full_attention" + ], + "max_position_embeddings": 10485760, + "model_type": "llama4_text", + "moe_layers": [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + 16, + 17, + 18, + 19, + 20, + 21, + 22, + 23, + 24, + 25, + 26, + 27, + 28, + 29, + 30, + 31, + 32, + 33, + 34, + 35, + 36, + 37, + 38, + 39, + 40, + 41, + 42, + 43, + 44, + 45, + 46, + 47 + ], + "neuron": { + "_serialized_key": "NxDNeuronConfig", + "batch_size": 1, + "capacity_factor": null, + "checkpoint_id": "meta-llama/Llama-4-Scout-17B-16E-Instruct", + "checkpoint_revision": "92f3b1597a195b523d8d9e5700e57e4fbb8f20d3", + "continuous_batching": false, + "enable_bucketing": false, + "ep_degree": 1, + "fused_qkv": false, + "glu_mlp": true, + "local_ranks_size": 32, + "max_batch_size": 1, + "max_context_length": 4096, + "max_topk": 256, + "n_active_tokens": 4096, + "neuronxcc_version": "2.21.18209.0+043b1bf7", + "on_device_sampling": true, + "optimum_neuron_version": "0.4.0", + "output_logits": false, + "pp_degree": 1, + "sequence_length": 4096, + "speculation_length": 0, + "start_rank_id": 0, + "target": "trn2", + "torch_dtype": "bfloat16", + "tp_degree": 32 + }, + "no_rope_layers": [ + 1, + 1, + 1, + 0, + 1, + 1, + 1, + 0, + 1, + 1, + 1, + 0, + 1, + 1, + 1, + 0, + 1, + 1, + 1, + 0, + 1, + 1, + 1, + 0, + 1, + 1, + 1, + 0, + 1, + 1, + 1, + 0, + 1, + 1, + 1, + 0, + 1, + 1, + 1, + 0, + 1, + 1, + 1, + 0, + 1, + 1, + 1, + 0 + ], + "num_attention_heads": 40, + "num_experts_per_tok": 1, + "num_hidden_layers": 48, + "num_key_value_heads": 8, + "num_local_experts": 16, + "output_router_logits": false, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 16.0, + "high_freq_factor": 1.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3" + }, + "rope_theta": 500000.0, + "router_aux_loss_coef": 0.001, + "router_jitter_noise": 0.0, + "tie_word_embeddings": false, + "use_cache": true, + "use_qk_norm": true, + "vocab_size": 202048 +} \ No newline at end of file diff --git a/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/llama4_text/meta-llama/Llama-4-Scout-17B-16E-Instruct/a90ff1e995579ec8deee.json b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/llama4_text/meta-llama/Llama-4-Scout-17B-16E-Instruct/a90ff1e995579ec8deee.json new file mode 100644 index 0000000000000000000000000000000000000000..d91c1bf4ab1fd4cc8c76fb1db3d82ccff976cc56 --- /dev/null +++ b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/llama4_text/meta-llama/Llama-4-Scout-17B-16E-Instruct/a90ff1e995579ec8deee.json @@ -0,0 +1,220 @@ +{ + "_entry_class": "SingleModelCacheEntry", + "_model_id": "meta-llama/Llama-4-Scout-17B-16E-Instruct", + "_task": "text-generation", + "attention_bias": false, + "attention_chunk_size": 8192, + "attention_dropout": 0.0, + "attn_scale": 0.1, + "attn_temperature_tuning": true, + "floor_scale": 8192, + "for_llm_compressor": false, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 5120, + "initializer_range": 0.02, + "interleave_moe_layer_step": 1, + "intermediate_size": 8192, + "intermediate_size_mlp": 16384, + "layer_types": [ + "chunked_attention", + "chunked_attention", + "chunked_attention", + "full_attention", + "chunked_attention", + "chunked_attention", + "chunked_attention", + "full_attention", + "chunked_attention", + "chunked_attention", + "chunked_attention", + "full_attention", + "chunked_attention", + "chunked_attention", + "chunked_attention", + "full_attention", + "chunked_attention", + "chunked_attention", + "chunked_attention", + "full_attention", + "chunked_attention", + "chunked_attention", + "chunked_attention", + "full_attention", + "chunked_attention", + "chunked_attention", + "chunked_attention", + "full_attention", + "chunked_attention", + "chunked_attention", + "chunked_attention", + "full_attention", + "chunked_attention", + "chunked_attention", + "chunked_attention", + "full_attention", + "chunked_attention", + "chunked_attention", + "chunked_attention", + "full_attention", + "chunked_attention", + "chunked_attention", + "chunked_attention", + "full_attention", + "chunked_attention", + "chunked_attention", + "chunked_attention", + "full_attention" + ], + "max_position_embeddings": 10485760, + "model_type": "llama4_text", + "moe_layers": [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + 16, + 17, + 18, + 19, + 20, + 21, + 22, + 23, + 24, + 25, + 26, + 27, + 28, + 29, + 30, + 31, + 32, + 33, + 34, + 35, + 36, + 37, + 38, + 39, + 40, + 41, + 42, + 43, + 44, + 45, + 46, + 47 + ], + "neuron": { + "_serialized_key": "NxDNeuronConfig", + "batch_size": 4, + "capacity_factor": null, + "checkpoint_id": "meta-llama/Llama-4-Scout-17B-16E-Instruct", + "checkpoint_revision": "92f3b1597a195b523d8d9e5700e57e4fbb8f20d3", + "continuous_batching": true, + "enable_bucketing": false, + "ep_degree": 1, + "fused_qkv": false, + "glu_mlp": true, + "local_ranks_size": 16, + "max_batch_size": 4, + "max_context_length": 4096, + "max_topk": 256, + "n_active_tokens": 4096, + "neuronxcc_version": "2.21.18209.0+043b1bf7", + "on_device_sampling": true, + "optimum_neuron_version": "0.4.0", + "output_logits": false, + "pp_degree": 1, + "sequence_length": 4096, + "speculation_length": 0, + "start_rank_id": 0, + "target": "trn1", + "torch_dtype": "bfloat16", + "tp_degree": 16 + }, + "no_rope_layers": [ + 1, + 1, + 1, + 0, + 1, + 1, + 1, + 0, + 1, + 1, + 1, + 0, + 1, + 1, + 1, + 0, + 1, + 1, + 1, + 0, + 1, + 1, + 1, + 0, + 1, + 1, + 1, + 0, + 1, + 1, + 1, + 0, + 1, + 1, + 1, + 0, + 1, + 1, + 1, + 0, + 1, + 1, + 1, + 0, + 1, + 1, + 1, + 0 + ], + "num_attention_heads": 40, + "num_experts_per_tok": 1, + "num_hidden_layers": 48, + "num_key_value_heads": 8, + "num_local_experts": 16, + "output_router_logits": false, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 16.0, + "high_freq_factor": 1.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3" + }, + "rope_theta": 500000.0, + "router_aux_loss_coef": 0.001, + "router_jitter_noise": 0.0, + "tie_word_embeddings": false, + "use_cache": true, + "use_qk_norm": true, + "vocab_size": 202048 +} \ No newline at end of file diff --git a/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/llama4_text/meta-llama/Llama-4-Scout-17B-16E-Instruct/c3450e1affaca20e05e3.json b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/llama4_text/meta-llama/Llama-4-Scout-17B-16E-Instruct/c3450e1affaca20e05e3.json new file mode 100644 index 0000000000000000000000000000000000000000..9365e0aefa067bb8348641e15a55b9a868bfc4e3 --- /dev/null +++ b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/llama4_text/meta-llama/Llama-4-Scout-17B-16E-Instruct/c3450e1affaca20e05e3.json @@ -0,0 +1,220 @@ +{ + "_entry_class": "SingleModelCacheEntry", + "_model_id": "meta-llama/Llama-4-Scout-17B-16E-Instruct", + "_task": "text-generation", + "attention_bias": false, + "attention_chunk_size": 8192, + "attention_dropout": 0.0, + "attn_scale": 0.1, + "attn_temperature_tuning": true, + "floor_scale": 8192, + "for_llm_compressor": false, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 5120, + "initializer_range": 0.02, + "interleave_moe_layer_step": 1, + "intermediate_size": 8192, + "intermediate_size_mlp": 16384, + "layer_types": [ + "chunked_attention", + "chunked_attention", + "chunked_attention", + "full_attention", + "chunked_attention", + "chunked_attention", + "chunked_attention", + "full_attention", + "chunked_attention", + "chunked_attention", + "chunked_attention", + "full_attention", + "chunked_attention", + "chunked_attention", + "chunked_attention", + "full_attention", + "chunked_attention", + "chunked_attention", + "chunked_attention", + "full_attention", + "chunked_attention", + "chunked_attention", + "chunked_attention", + "full_attention", + "chunked_attention", + "chunked_attention", + "chunked_attention", + "full_attention", + "chunked_attention", + "chunked_attention", + "chunked_attention", + "full_attention", + "chunked_attention", + "chunked_attention", + "chunked_attention", + "full_attention", + "chunked_attention", + "chunked_attention", + "chunked_attention", + "full_attention", + "chunked_attention", + "chunked_attention", + "chunked_attention", + "full_attention", + "chunked_attention", + "chunked_attention", + "chunked_attention", + "full_attention" + ], + "max_position_embeddings": 10485760, + "model_type": "llama4_text", + "moe_layers": [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + 16, + 17, + 18, + 19, + 20, + 21, + 22, + 23, + 24, + 25, + 26, + 27, + 28, + 29, + 30, + 31, + 32, + 33, + 34, + 35, + 36, + 37, + 38, + 39, + 40, + 41, + 42, + 43, + 44, + 45, + 46, + 47 + ], + "neuron": { + "_serialized_key": "NxDNeuronConfig", + "batch_size": 1, + "capacity_factor": null, + "checkpoint_id": "meta-llama/Llama-4-Scout-17B-16E-Instruct", + "checkpoint_revision": "92f3b1597a195b523d8d9e5700e57e4fbb8f20d3", + "continuous_batching": false, + "enable_bucketing": false, + "ep_degree": 1, + "fused_qkv": false, + "glu_mlp": true, + "local_ranks_size": 16, + "max_batch_size": 1, + "max_context_length": 4096, + "max_topk": 256, + "n_active_tokens": 4096, + "neuronxcc_version": "2.21.18209.0+043b1bf7", + "on_device_sampling": true, + "optimum_neuron_version": "0.4.0", + "output_logits": false, + "pp_degree": 1, + "sequence_length": 4096, + "speculation_length": 0, + "start_rank_id": 0, + "target": "trn1", + "torch_dtype": "bfloat16", + "tp_degree": 16 + }, + "no_rope_layers": [ + 1, + 1, + 1, + 0, + 1, + 1, + 1, + 0, + 1, + 1, + 1, + 0, + 1, + 1, + 1, + 0, + 1, + 1, + 1, + 0, + 1, + 1, + 1, + 0, + 1, + 1, + 1, + 0, + 1, + 1, + 1, + 0, + 1, + 1, + 1, + 0, + 1, + 1, + 1, + 0, + 1, + 1, + 1, + 0, + 1, + 1, + 1, + 0 + ], + "num_attention_heads": 40, + "num_experts_per_tok": 1, + "num_hidden_layers": 48, + "num_key_value_heads": 8, + "num_local_experts": 16, + "output_router_logits": false, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 16.0, + "high_freq_factor": 1.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3" + }, + "rope_theta": 500000.0, + "router_aux_loss_coef": 0.001, + "router_jitter_noise": 0.0, + "tie_word_embeddings": false, + "use_cache": true, + "use_qk_norm": true, + "vocab_size": 202048 +} \ No newline at end of file diff --git a/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/llama4_text/meta-llama/Llama-4-Scout-17B-16E-Instruct/d33cefe3ad2c77e0544b.json b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/llama4_text/meta-llama/Llama-4-Scout-17B-16E-Instruct/d33cefe3ad2c77e0544b.json new file mode 100644 index 0000000000000000000000000000000000000000..2a6d74f8be469a5504c206bf2fb6304acdf23743 --- /dev/null +++ b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/llama4_text/meta-llama/Llama-4-Scout-17B-16E-Instruct/d33cefe3ad2c77e0544b.json @@ -0,0 +1,220 @@ +{ + "_entry_class": "SingleModelCacheEntry", + "_model_id": "meta-llama/Llama-4-Scout-17B-16E-Instruct", + "_task": "text-generation", + "attention_bias": false, + "attention_chunk_size": 8192, + "attention_dropout": 0.0, + "attn_scale": 0.1, + "attn_temperature_tuning": true, + "floor_scale": 8192, + "for_llm_compressor": false, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 5120, + "initializer_range": 0.02, + "interleave_moe_layer_step": 1, + "intermediate_size": 8192, + "intermediate_size_mlp": 16384, + "layer_types": [ + "chunked_attention", + "chunked_attention", + "chunked_attention", + "full_attention", + "chunked_attention", + "chunked_attention", + "chunked_attention", + "full_attention", + "chunked_attention", + "chunked_attention", + "chunked_attention", + "full_attention", + "chunked_attention", + "chunked_attention", + "chunked_attention", + "full_attention", + "chunked_attention", + "chunked_attention", + "chunked_attention", + "full_attention", + "chunked_attention", + "chunked_attention", + "chunked_attention", + "full_attention", + "chunked_attention", + "chunked_attention", + "chunked_attention", + "full_attention", + "chunked_attention", + "chunked_attention", + "chunked_attention", + "full_attention", + "chunked_attention", + "chunked_attention", + "chunked_attention", + "full_attention", + "chunked_attention", + "chunked_attention", + "chunked_attention", + "full_attention", + "chunked_attention", + "chunked_attention", + "chunked_attention", + "full_attention", + "chunked_attention", + "chunked_attention", + "chunked_attention", + "full_attention" + ], + "max_position_embeddings": 10485760, + "model_type": "llama4_text", + "moe_layers": [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + 16, + 17, + 18, + 19, + 20, + 21, + 22, + 23, + 24, + 25, + 26, + 27, + 28, + 29, + 30, + 31, + 32, + 33, + 34, + 35, + 36, + 37, + 38, + 39, + 40, + 41, + 42, + 43, + 44, + 45, + 46, + 47 + ], + "neuron": { + "_serialized_key": "NxDNeuronConfig", + "batch_size": 32, + "capacity_factor": null, + "checkpoint_id": "meta-llama/Llama-4-Scout-17B-16E-Instruct", + "checkpoint_revision": "92f3b1597a195b523d8d9e5700e57e4fbb8f20d3", + "continuous_batching": true, + "enable_bucketing": false, + "ep_degree": 1, + "fused_qkv": false, + "glu_mlp": true, + "local_ranks_size": 32, + "max_batch_size": 32, + "max_context_length": 4096, + "max_topk": 256, + "n_active_tokens": 4096, + "neuronxcc_version": "2.21.18209.0+043b1bf7", + "on_device_sampling": true, + "optimum_neuron_version": "0.4.0", + "output_logits": false, + "pp_degree": 1, + "sequence_length": 4096, + "speculation_length": 0, + "start_rank_id": 0, + "target": "trn2", + "torch_dtype": "bfloat16", + "tp_degree": 32 + }, + "no_rope_layers": [ + 1, + 1, + 1, + 0, + 1, + 1, + 1, + 0, + 1, + 1, + 1, + 0, + 1, + 1, + 1, + 0, + 1, + 1, + 1, + 0, + 1, + 1, + 1, + 0, + 1, + 1, + 1, + 0, + 1, + 1, + 1, + 0, + 1, + 1, + 1, + 0, + 1, + 1, + 1, + 0, + 1, + 1, + 1, + 0, + 1, + 1, + 1, + 0 + ], + "num_attention_heads": 40, + "num_experts_per_tok": 1, + "num_hidden_layers": 48, + "num_key_value_heads": 8, + "num_local_experts": 16, + "output_router_logits": false, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 16.0, + "high_freq_factor": 1.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3" + }, + "rope_theta": 500000.0, + "router_aux_loss_coef": 0.001, + "router_jitter_noise": 0.0, + "tie_word_embeddings": false, + "use_cache": true, + "use_qk_norm": true, + "vocab_size": 202048 +} \ No newline at end of file diff --git a/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/llama4_text/meta-llama/Llama-4-Scout-17B-16E-Instruct/f71e619c760aaf9e2888.json b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/llama4_text/meta-llama/Llama-4-Scout-17B-16E-Instruct/f71e619c760aaf9e2888.json new file mode 100644 index 0000000000000000000000000000000000000000..dd3b35e83cfcc95cf76fe811bdd930a6de3b7915 --- /dev/null +++ b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/llama4_text/meta-llama/Llama-4-Scout-17B-16E-Instruct/f71e619c760aaf9e2888.json @@ -0,0 +1,220 @@ +{ + "_entry_class": "SingleModelCacheEntry", + "_model_id": "meta-llama/Llama-4-Scout-17B-16E-Instruct", + "_task": "text-generation", + "attention_bias": false, + "attention_chunk_size": 8192, + "attention_dropout": 0.0, + "attn_scale": 0.1, + "attn_temperature_tuning": true, + "floor_scale": 8192, + "for_llm_compressor": false, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 5120, + "initializer_range": 0.02, + "interleave_moe_layer_step": 1, + "intermediate_size": 8192, + "intermediate_size_mlp": 16384, + "layer_types": [ + "chunked_attention", + "chunked_attention", + "chunked_attention", + "full_attention", + "chunked_attention", + "chunked_attention", + "chunked_attention", + "full_attention", + "chunked_attention", + "chunked_attention", + "chunked_attention", + "full_attention", + "chunked_attention", + "chunked_attention", + "chunked_attention", + "full_attention", + "chunked_attention", + "chunked_attention", + "chunked_attention", + "full_attention", + "chunked_attention", + "chunked_attention", + "chunked_attention", + "full_attention", + "chunked_attention", + "chunked_attention", + "chunked_attention", + "full_attention", + "chunked_attention", + "chunked_attention", + "chunked_attention", + "full_attention", + "chunked_attention", + "chunked_attention", + "chunked_attention", + "full_attention", + "chunked_attention", + "chunked_attention", + "chunked_attention", + "full_attention", + "chunked_attention", + "chunked_attention", + "chunked_attention", + "full_attention", + "chunked_attention", + "chunked_attention", + "chunked_attention", + "full_attention" + ], + "max_position_embeddings": 10485760, + "model_type": "llama4_text", + "moe_layers": [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + 16, + 17, + 18, + 19, + 20, + 21, + 22, + 23, + 24, + 25, + 26, + 27, + 28, + 29, + 30, + 31, + 32, + 33, + 34, + 35, + 36, + 37, + 38, + 39, + 40, + 41, + 42, + 43, + 44, + 45, + 46, + 47 + ], + "neuron": { + "_serialized_key": "NxDNeuronConfig", + "batch_size": 8, + "capacity_factor": null, + "checkpoint_id": "meta-llama/Llama-4-Scout-17B-16E-Instruct", + "checkpoint_revision": "92f3b1597a195b523d8d9e5700e57e4fbb8f20d3", + "continuous_batching": true, + "enable_bucketing": false, + "ep_degree": 1, + "fused_qkv": false, + "glu_mlp": true, + "local_ranks_size": 32, + "max_batch_size": 8, + "max_context_length": 4096, + "max_topk": 256, + "n_active_tokens": 4096, + "neuronxcc_version": "2.21.18209.0+043b1bf7", + "on_device_sampling": true, + "optimum_neuron_version": "0.4.0", + "output_logits": false, + "pp_degree": 1, + "sequence_length": 4096, + "speculation_length": 0, + "start_rank_id": 0, + "target": "trn1", + "torch_dtype": "bfloat16", + "tp_degree": 32 + }, + "no_rope_layers": [ + 1, + 1, + 1, + 0, + 1, + 1, + 1, + 0, + 1, + 1, + 1, + 0, + 1, + 1, + 1, + 0, + 1, + 1, + 1, + 0, + 1, + 1, + 1, + 0, + 1, + 1, + 1, + 0, + 1, + 1, + 1, + 0, + 1, + 1, + 1, + 0, + 1, + 1, + 1, + 0, + 1, + 1, + 1, + 0, + 1, + 1, + 1, + 0 + ], + "num_attention_heads": 40, + "num_experts_per_tok": 1, + "num_hidden_layers": 48, + "num_key_value_heads": 8, + "num_local_experts": 16, + "output_router_logits": false, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 16.0, + "high_freq_factor": 1.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3" + }, + "rope_theta": 500000.0, + "router_aux_loss_coef": 0.001, + "router_jitter_noise": 0.0, + "tie_word_embeddings": false, + "use_cache": true, + "use_qk_norm": true, + "vocab_size": 202048 +} \ No newline at end of file diff --git a/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/phi3/microsoft/Phi-3-mini-4k-instruct/3558b5ac7259b6bcc01a.json b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/phi3/microsoft/Phi-3-mini-4k-instruct/3558b5ac7259b6bcc01a.json new file mode 100644 index 0000000000000000000000000000000000000000..8cd403157e7e8ccb064801a9273ab0138381edf8 --- /dev/null +++ b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/phi3/microsoft/Phi-3-mini-4k-instruct/3558b5ac7259b6bcc01a.json @@ -0,0 +1,62 @@ +{ + "_entry_class": "SingleModelCacheEntry", + "_model_id": "microsoft/Phi-3-mini-4k-instruct", + "_task": "text-generation", + "architectures": [ + "Phi3ForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "auto_map": { + "AutoConfig": "configuration_phi3.Phi3Config", + "AutoModelForCausalLM": "modeling_phi3.Phi3ForCausalLM" + }, + "embd_pdrop": 0.0, + "hidden_act": "silu", + "hidden_size": 3072, + "initializer_range": 0.02, + "intermediate_size": 8192, + "max_position_embeddings": 4096, + "model_type": "phi3", + "neuron": { + "_serialized_key": "NxDNeuronConfig", + "batch_size": 1, + "capacity_factor": null, + "checkpoint_id": "microsoft/Phi-3-mini-4k-instruct", + "checkpoint_revision": "0a67737cc96d2554230f90338b163bc6380a2a85", + "continuous_batching": false, + "enable_bucketing": false, + "ep_degree": 1, + "fused_qkv": true, + "glu_mlp": true, + "local_ranks_size": 2, + "max_batch_size": 1, + "max_context_length": 4096, + "max_topk": 256, + "n_active_tokens": 4096, + "neuronxcc_version": "2.21.18209.0+043b1bf7", + "on_device_sampling": true, + "optimum_neuron_version": "0.4.0", + "output_logits": false, + "pp_degree": 1, + "sequence_length": 4096, + "speculation_length": 0, + "start_rank_id": 0, + "target": "trn1", + "torch_dtype": "bfloat16", + "tp_degree": 2 + }, + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 32, + "original_max_position_embeddings": 4096, + "partial_rotary_factor": 1.0, + "resid_pdrop": 0.0, + "rms_norm_eps": 1e-05, + "rope_scaling": null, + "rope_theta": 10000.0, + "sliding_window": 2047, + "tie_word_embeddings": false, + "use_cache": true, + "vocab_size": 32064 +} \ No newline at end of file diff --git a/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/phi3/microsoft/phi-4/38f87915d107c55b7651.json b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/phi3/microsoft/phi-4/38f87915d107c55b7651.json new file mode 100644 index 0000000000000000000000000000000000000000..9a6f35a64b384dd657e89eee83087de40bc083c2 --- /dev/null +++ b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/phi3/microsoft/phi-4/38f87915d107c55b7651.json @@ -0,0 +1,58 @@ +{ + "_entry_class": "SingleModelCacheEntry", + "_model_id": "microsoft/phi-4", + "_task": "text-generation", + "architectures": [ + "Phi3ForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "embd_pdrop": 0.0, + "hidden_act": "silu", + "hidden_size": 5120, + "initializer_range": 0.02, + "intermediate_size": 17920, + "max_position_embeddings": 16384, + "model_type": "phi3", + "neuron": { + "_serialized_key": "NxDNeuronConfig", + "batch_size": 16, + "capacity_factor": null, + "checkpoint_id": "microsoft/phi-4", + "checkpoint_revision": "187ef0342fff0eb3333be9f00389385e95ef0b61", + "continuous_batching": true, + "enable_bucketing": false, + "ep_degree": 1, + "fused_qkv": true, + "glu_mlp": true, + "local_ranks_size": 10, + "max_batch_size": 16, + "max_context_length": 4096, + "max_topk": 256, + "n_active_tokens": 4096, + "neuronxcc_version": "2.21.18209.0+043b1bf7", + "on_device_sampling": true, + "optimum_neuron_version": "0.4.0", + "output_logits": false, + "pp_degree": 1, + "sequence_length": 4096, + "speculation_length": 0, + "start_rank_id": 0, + "target": "trn1", + "torch_dtype": "bfloat16", + "tp_degree": 10 + }, + "num_attention_heads": 40, + "num_hidden_layers": 40, + "num_key_value_heads": 10, + "original_max_position_embeddings": 16384, + "partial_rotary_factor": 1.0, + "resid_pdrop": 0.0, + "rms_norm_eps": 1e-05, + "rope_scaling": null, + "rope_theta": 250000, + "sliding_window": null, + "tie_word_embeddings": false, + "use_cache": true, + "vocab_size": 100352 +} \ No newline at end of file diff --git a/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/phi3/microsoft/phi-4/473a4f2462bcd8b3f136.json b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/phi3/microsoft/phi-4/473a4f2462bcd8b3f136.json new file mode 100644 index 0000000000000000000000000000000000000000..8255e424fe12b554bb94482cbc0c34d257b65202 --- /dev/null +++ b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/phi3/microsoft/phi-4/473a4f2462bcd8b3f136.json @@ -0,0 +1,58 @@ +{ + "_entry_class": "SingleModelCacheEntry", + "_model_id": "microsoft/phi-4", + "_task": "text-generation", + "architectures": [ + "Phi3ForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "embd_pdrop": 0.0, + "hidden_act": "silu", + "hidden_size": 5120, + "initializer_range": 0.02, + "intermediate_size": 17920, + "max_position_embeddings": 16384, + "model_type": "phi3", + "neuron": { + "_serialized_key": "NxDNeuronConfig", + "batch_size": 1, + "capacity_factor": null, + "checkpoint_id": "microsoft/phi-4", + "checkpoint_revision": "187ef0342fff0eb3333be9f00389385e95ef0b61", + "continuous_batching": false, + "enable_bucketing": false, + "ep_degree": 1, + "fused_qkv": true, + "glu_mlp": true, + "local_ranks_size": 10, + "max_batch_size": 1, + "max_context_length": 4096, + "max_topk": 256, + "n_active_tokens": 4096, + "neuronxcc_version": "2.21.18209.0+043b1bf7", + "on_device_sampling": true, + "optimum_neuron_version": "0.4.0", + "output_logits": false, + "pp_degree": 1, + "sequence_length": 4096, + "speculation_length": 0, + "start_rank_id": 0, + "target": "trn1", + "torch_dtype": "bfloat16", + "tp_degree": 10 + }, + "num_attention_heads": 40, + "num_hidden_layers": 40, + "num_key_value_heads": 10, + "original_max_position_embeddings": 16384, + "partial_rotary_factor": 1.0, + "resid_pdrop": 0.0, + "rms_norm_eps": 1e-05, + "rope_scaling": null, + "rope_theta": 250000, + "sliding_window": null, + "tie_word_embeddings": false, + "use_cache": true, + "vocab_size": 100352 +} \ No newline at end of file diff --git a/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen2/Qwen/Qwen2.5-0.5B/300b37dace1ce2c0b783.json b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen2/Qwen/Qwen2.5-0.5B/300b37dace1ce2c0b783.json new file mode 100644 index 0000000000000000000000000000000000000000..9b4983e35c80749c0f0d3592202d3c798be20918 --- /dev/null +++ b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen2/Qwen/Qwen2.5-0.5B/300b37dace1ce2c0b783.json @@ -0,0 +1,82 @@ +{ + "_entry_class": "SingleModelCacheEntry", + "_model_id": "Qwen/Qwen2.5-0.5B", + "_task": "text-generation", + "architectures": [ + "Qwen2ForCausalLM" + ], + "attention_dropout": 0.0, + "hidden_act": "silu", + "hidden_size": 896, + "initializer_range": 0.02, + "intermediate_size": 4864, + "layer_types": [ + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention" + ], + "max_position_embeddings": 32768, + "max_window_layers": 24, + "model_type": "qwen2", + "neuron": { + "_serialized_key": "NxDNeuronConfig", + "batch_size": 4, + "capacity_factor": null, + "checkpoint_id": "Qwen/Qwen2.5-0.5B", + "checkpoint_revision": "060db6499f32faf8b98477b0a26969ef7d8b9987", + "continuous_batching": true, + "enable_bucketing": false, + "ep_degree": 1, + "fused_qkv": false, + "glu_mlp": true, + "local_ranks_size": 2, + "max_batch_size": 4, + "max_context_length": 4096, + "max_topk": 256, + "n_active_tokens": 4096, + "neuronxcc_version": "2.21.18209.0+043b1bf7", + "on_device_sampling": false, + "optimum_neuron_version": "0.4.0", + "output_logits": false, + "pp_degree": 1, + "sequence_length": 4096, + "speculation_length": 0, + "start_rank_id": 0, + "target": "trn1", + "torch_dtype": "bfloat16", + "tp_degree": 2 + }, + "num_attention_heads": 14, + "num_hidden_layers": 24, + "num_key_value_heads": 2, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 1000000.0, + "sliding_window": null, + "tie_word_embeddings": true, + "use_cache": true, + "use_mrope": false, + "use_sliding_window": false, + "vocab_size": 151936 +} \ No newline at end of file diff --git a/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen2/Qwen/Qwen2.5-1.5B/8d982941157412579546.json b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen2/Qwen/Qwen2.5-1.5B/8d982941157412579546.json new file mode 100644 index 0000000000000000000000000000000000000000..97c5cddc5ec36f40ba4152a7406837a20eb50b7b --- /dev/null +++ b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen2/Qwen/Qwen2.5-1.5B/8d982941157412579546.json @@ -0,0 +1,86 @@ +{ + "_entry_class": "SingleModelCacheEntry", + "_model_id": "Qwen/Qwen2.5-1.5B", + "_task": "text-generation", + "architectures": [ + "Qwen2ForCausalLM" + ], + "attention_dropout": 0.0, + "hidden_act": "silu", + "hidden_size": 1536, + "initializer_range": 0.02, + "intermediate_size": 8960, + "layer_types": [ + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention" + ], + "max_position_embeddings": 131072, + "max_window_layers": 28, + "model_type": "qwen2", + "neuron": { + "_serialized_key": "NxDNeuronConfig", + "batch_size": 4, + "capacity_factor": null, + "checkpoint_id": "Qwen/Qwen2.5-1.5B", + "checkpoint_revision": "8faed761d45a263340a0528343f099c05c9a4323", + "continuous_batching": true, + "enable_bucketing": false, + "ep_degree": 1, + "fused_qkv": false, + "glu_mlp": true, + "local_ranks_size": 2, + "max_batch_size": 4, + "max_context_length": 4096, + "max_topk": 256, + "n_active_tokens": 4096, + "neuronxcc_version": "2.21.18209.0+043b1bf7", + "on_device_sampling": false, + "optimum_neuron_version": "0.4.0", + "output_logits": false, + "pp_degree": 1, + "sequence_length": 4096, + "speculation_length": 0, + "start_rank_id": 0, + "target": "trn1", + "torch_dtype": "bfloat16", + "tp_degree": 2 + }, + "num_attention_heads": 12, + "num_hidden_layers": 28, + "num_key_value_heads": 2, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 1000000.0, + "sliding_window": null, + "tie_word_embeddings": true, + "use_cache": true, + "use_mrope": false, + "use_sliding_window": false, + "vocab_size": 151936 +} \ No newline at end of file diff --git a/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen2/Qwen/Qwen2.5-1.5B/dea81904d370c8b20332.json b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen2/Qwen/Qwen2.5-1.5B/dea81904d370c8b20332.json new file mode 100644 index 0000000000000000000000000000000000000000..86b25fa5a821fb3fe4d89df99f6417794cced673 --- /dev/null +++ b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen2/Qwen/Qwen2.5-1.5B/dea81904d370c8b20332.json @@ -0,0 +1,86 @@ +{ + "_entry_class": "SingleModelCacheEntry", + "_model_id": "Qwen/Qwen2.5-1.5B", + "_task": "text-generation", + "architectures": [ + "Qwen2ForCausalLM" + ], + "attention_dropout": 0.0, + "hidden_act": "silu", + "hidden_size": 1536, + "initializer_range": 0.02, + "intermediate_size": 8960, + "layer_types": [ + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention" + ], + "max_position_embeddings": 131072, + "max_window_layers": 28, + "model_type": "qwen2", + "neuron": { + "_serialized_key": "NxDNeuronConfig", + "batch_size": 1, + "capacity_factor": null, + "checkpoint_id": "Qwen/Qwen2.5-1.5B", + "checkpoint_revision": "8faed761d45a263340a0528343f099c05c9a4323", + "continuous_batching": false, + "enable_bucketing": false, + "ep_degree": 1, + "fused_qkv": false, + "glu_mlp": true, + "local_ranks_size": 2, + "max_batch_size": 1, + "max_context_length": 4096, + "max_topk": 256, + "n_active_tokens": 4096, + "neuronxcc_version": "2.21.18209.0+043b1bf7", + "on_device_sampling": true, + "optimum_neuron_version": "0.4.0", + "output_logits": false, + "pp_degree": 1, + "sequence_length": 4096, + "speculation_length": 0, + "start_rank_id": 0, + "target": "trn1", + "torch_dtype": "bfloat16", + "tp_degree": 2 + }, + "num_attention_heads": 12, + "num_hidden_layers": 28, + "num_key_value_heads": 2, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 1000000.0, + "sliding_window": null, + "tie_word_embeddings": true, + "use_cache": true, + "use_mrope": false, + "use_sliding_window": false, + "vocab_size": 151936 +} \ No newline at end of file diff --git a/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen2/Qwen/Qwen2.5-14B/877be4240e4a459b2a14.json b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen2/Qwen/Qwen2.5-14B/877be4240e4a459b2a14.json new file mode 100644 index 0000000000000000000000000000000000000000..be792c620aa1294fe9f00a9281699b601af0692b --- /dev/null +++ b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen2/Qwen/Qwen2.5-14B/877be4240e4a459b2a14.json @@ -0,0 +1,105 @@ +{ + "_entry_class": "SingleModelCacheEntry", + "_model_id": "Qwen/Qwen2.5-14B", + "_task": "text-generation", + "architectures": [ + "Qwen2ForCausalLM" + ], + "attention_dropout": 0.0, + "hidden_act": "silu", + "hidden_size": 5120, + "initializer_range": 0.02, + "intermediate_size": 13824, + "layer_types": [ + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention" + ], + "max_position_embeddings": 131072, + "max_window_layers": 48, + "model_type": "qwen2", + "neuron": { + "_serialized_key": "NxDNeuronConfig", + "batch_size": 1, + "capacity_factor": null, + "checkpoint_id": "Qwen/Qwen2.5-14B", + "checkpoint_revision": "97e1e76335b7017d8f67c08a19d103c0504298c9", + "continuous_batching": false, + "enable_bucketing": false, + "ep_degree": 1, + "fused_qkv": false, + "glu_mlp": true, + "local_ranks_size": 8, + "max_batch_size": 1, + "max_context_length": 4096, + "max_topk": 256, + "n_active_tokens": 4096, + "neuronxcc_version": "2.21.18209.0+043b1bf7", + "on_device_sampling": true, + "optimum_neuron_version": "0.4.0", + "output_logits": false, + "pp_degree": 1, + "sequence_length": 4096, + "speculation_length": 0, + "start_rank_id": 0, + "target": "trn1", + "torch_dtype": "bfloat16", + "tp_degree": 8 + }, + "num_attention_heads": 40, + "num_hidden_layers": 48, + "num_key_value_heads": 8, + "rms_norm_eps": 1e-05, + "rope_scaling": null, + "rope_theta": 1000000.0, + "sliding_window": null, + "tie_word_embeddings": false, + "use_cache": true, + "use_sliding_window": false, + "vocab_size": 152064 +} \ No newline at end of file diff --git a/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen2/Qwen/Qwen2.5-14B/c05ba11ec3a01458a2e6.json b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen2/Qwen/Qwen2.5-14B/c05ba11ec3a01458a2e6.json new file mode 100644 index 0000000000000000000000000000000000000000..9c580fbced4dd8ce67c708d11da17ea4c042cbb0 --- /dev/null +++ b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen2/Qwen/Qwen2.5-14B/c05ba11ec3a01458a2e6.json @@ -0,0 +1,105 @@ +{ + "_entry_class": "SingleModelCacheEntry", + "_model_id": "Qwen/Qwen2.5-14B", + "_task": "text-generation", + "architectures": [ + "Qwen2ForCausalLM" + ], + "attention_dropout": 0.0, + "hidden_act": "silu", + "hidden_size": 5120, + "initializer_range": 0.02, + "intermediate_size": 13824, + "layer_types": [ + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention" + ], + "max_position_embeddings": 131072, + "max_window_layers": 48, + "model_type": "qwen2", + "neuron": { + "_serialized_key": "NxDNeuronConfig", + "batch_size": 16, + "capacity_factor": null, + "checkpoint_id": "Qwen/Qwen2.5-14B", + "checkpoint_revision": "97e1e76335b7017d8f67c08a19d103c0504298c9", + "continuous_batching": true, + "enable_bucketing": false, + "ep_degree": 1, + "fused_qkv": false, + "glu_mlp": true, + "local_ranks_size": 8, + "max_batch_size": 16, + "max_context_length": 4096, + "max_topk": 256, + "n_active_tokens": 4096, + "neuronxcc_version": "2.21.18209.0+043b1bf7", + "on_device_sampling": true, + "optimum_neuron_version": "0.4.0", + "output_logits": false, + "pp_degree": 1, + "sequence_length": 4096, + "speculation_length": 0, + "start_rank_id": 0, + "target": "trn1", + "torch_dtype": "bfloat16", + "tp_degree": 8 + }, + "num_attention_heads": 40, + "num_hidden_layers": 48, + "num_key_value_heads": 8, + "rms_norm_eps": 1e-05, + "rope_scaling": null, + "rope_theta": 1000000.0, + "sliding_window": null, + "tie_word_embeddings": false, + "use_cache": true, + "use_sliding_window": false, + "vocab_size": 152064 +} \ No newline at end of file diff --git a/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen2/Qwen/Qwen2.5-32B-Instruct/2e5ba8f801dbc7a16c3c.json b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen2/Qwen/Qwen2.5-32B-Instruct/2e5ba8f801dbc7a16c3c.json new file mode 100644 index 0000000000000000000000000000000000000000..6d1b16452a48a71ca8c1622a6b38fd4a08350e5c --- /dev/null +++ b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen2/Qwen/Qwen2.5-32B-Instruct/2e5ba8f801dbc7a16c3c.json @@ -0,0 +1,121 @@ +{ + "_entry_class": "SingleModelCacheEntry", + "_model_id": "Qwen/Qwen2.5-32B-Instruct", + "_task": "text-generation", + "architectures": [ + "Qwen2ForCausalLM" + ], + "attention_dropout": 0.0, + "hidden_act": "silu", + "hidden_size": 5120, + "initializer_range": 0.02, + "intermediate_size": 27648, + "layer_types": [ + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention" + ], + "max_position_embeddings": 32768, + "max_window_layers": 70, + "model_type": "qwen2", + "neuron": { + "_serialized_key": "NxDNeuronConfig", + "batch_size": 8, + "capacity_factor": null, + "checkpoint_id": "Qwen/Qwen2.5-32B-Instruct", + "checkpoint_revision": "5ede1c97bbab6ce5cda5812749b4c0bdf79b18dd", + "continuous_batching": true, + "enable_bucketing": false, + "ep_degree": 1, + "fused_qkv": false, + "glu_mlp": true, + "local_ranks_size": 8, + "max_batch_size": 8, + "max_context_length": 4096, + "max_topk": 256, + "n_active_tokens": 4096, + "neuronxcc_version": "2.21.18209.0+043b1bf7", + "on_device_sampling": true, + "optimum_neuron_version": "0.4.0", + "output_logits": false, + "pp_degree": 1, + "sequence_length": 4096, + "speculation_length": 0, + "start_rank_id": 0, + "target": "trn1", + "torch_dtype": "bfloat16", + "tp_degree": 8 + }, + "num_attention_heads": 40, + "num_hidden_layers": 64, + "num_key_value_heads": 8, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 1000000.0, + "sliding_window": null, + "tie_word_embeddings": false, + "use_cache": true, + "use_sliding_window": false, + "vocab_size": 152064 +} \ No newline at end of file diff --git a/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen2/Qwen/Qwen2.5-32B-Instruct/7182911a8d43e7187430.json b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen2/Qwen/Qwen2.5-32B-Instruct/7182911a8d43e7187430.json new file mode 100644 index 0000000000000000000000000000000000000000..dc04b2f53f65275df716bbc8ed82e15662589be3 --- /dev/null +++ b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen2/Qwen/Qwen2.5-32B-Instruct/7182911a8d43e7187430.json @@ -0,0 +1,121 @@ +{ + "_entry_class": "SingleModelCacheEntry", + "_model_id": "Qwen/Qwen2.5-32B-Instruct", + "_task": "text-generation", + "architectures": [ + "Qwen2ForCausalLM" + ], + "attention_dropout": 0.0, + "hidden_act": "silu", + "hidden_size": 5120, + "initializer_range": 0.02, + "intermediate_size": 27648, + "layer_types": [ + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention" + ], + "max_position_embeddings": 32768, + "max_window_layers": 70, + "model_type": "qwen2", + "neuron": { + "_serialized_key": "NxDNeuronConfig", + "batch_size": 1, + "capacity_factor": null, + "checkpoint_id": "Qwen/Qwen2.5-32B-Instruct", + "checkpoint_revision": "5ede1c97bbab6ce5cda5812749b4c0bdf79b18dd", + "continuous_batching": false, + "enable_bucketing": false, + "ep_degree": 1, + "fused_qkv": false, + "glu_mlp": true, + "local_ranks_size": 8, + "max_batch_size": 1, + "max_context_length": 4096, + "max_topk": 256, + "n_active_tokens": 4096, + "neuronxcc_version": "2.21.18209.0+043b1bf7", + "on_device_sampling": true, + "optimum_neuron_version": "0.4.0", + "output_logits": false, + "pp_degree": 1, + "sequence_length": 4096, + "speculation_length": 0, + "start_rank_id": 0, + "target": "trn1", + "torch_dtype": "bfloat16", + "tp_degree": 8 + }, + "num_attention_heads": 40, + "num_hidden_layers": 64, + "num_key_value_heads": 8, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 1000000.0, + "sliding_window": null, + "tie_word_embeddings": false, + "use_cache": true, + "use_sliding_window": false, + "vocab_size": 152064 +} \ No newline at end of file diff --git a/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen2/Qwen/Qwen2.5-72B-Instruct/f3b6f76004dc3d143c7e.json b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen2/Qwen/Qwen2.5-72B-Instruct/f3b6f76004dc3d143c7e.json new file mode 100644 index 0000000000000000000000000000000000000000..091bf6e2289fa581cabed5bde5cae9a97f16bfde --- /dev/null +++ b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen2/Qwen/Qwen2.5-72B-Instruct/f3b6f76004dc3d143c7e.json @@ -0,0 +1,137 @@ +{ + "_entry_class": "SingleModelCacheEntry", + "_model_id": "Qwen/Qwen2.5-72B-Instruct", + "_task": "text-generation", + "architectures": [ + "Qwen2ForCausalLM" + ], + "attention_dropout": 0.0, + "hidden_act": "silu", + "hidden_size": 8192, + "initializer_range": 0.02, + "intermediate_size": 29568, + "layer_types": [ + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention" + ], + "max_position_embeddings": 32768, + "max_window_layers": 70, + "model_type": "qwen2", + "neuron": { + "_serialized_key": "NxDNeuronConfig", + "batch_size": 4, + "capacity_factor": null, + "checkpoint_id": "Qwen/Qwen2.5-72B-Instruct", + "checkpoint_revision": "495f39366efef23836d0cfae4fbe635880d2be31", + "continuous_batching": true, + "enable_bucketing": false, + "ep_degree": 1, + "fused_qkv": false, + "glu_mlp": true, + "local_ranks_size": 24, + "max_batch_size": 4, + "max_context_length": 4096, + "max_topk": 256, + "n_active_tokens": 4096, + "neuronxcc_version": "2.21.18209.0+043b1bf7", + "on_device_sampling": true, + "optimum_neuron_version": "0.4.0", + "output_logits": false, + "pp_degree": 1, + "sequence_length": 4096, + "speculation_length": 0, + "start_rank_id": 0, + "target": "trn1", + "torch_dtype": "bfloat16", + "tp_degree": 24 + }, + "num_attention_heads": 64, + "num_hidden_layers": 80, + "num_key_value_heads": 8, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 1000000.0, + "sliding_window": null, + "tie_word_embeddings": false, + "use_cache": true, + "use_sliding_window": false, + "vocab_size": 152064 +} \ No newline at end of file diff --git a/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen2/Qwen/Qwen2.5-7B-Instruct/2ffd57bb17f3a35919c6.json b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen2/Qwen/Qwen2.5-7B-Instruct/2ffd57bb17f3a35919c6.json new file mode 100644 index 0000000000000000000000000000000000000000..e6c12b99e4fdedbc8246497d044a61d3d1a46561 --- /dev/null +++ b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen2/Qwen/Qwen2.5-7B-Instruct/2ffd57bb17f3a35919c6.json @@ -0,0 +1,85 @@ +{ + "_entry_class": "SingleModelCacheEntry", + "_model_id": "Qwen/Qwen2.5-7B-Instruct", + "_task": "text-generation", + "architectures": [ + "Qwen2ForCausalLM" + ], + "attention_dropout": 0.0, + "hidden_act": "silu", + "hidden_size": 3584, + "initializer_range": 0.02, + "intermediate_size": 18944, + "layer_types": [ + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention" + ], + "max_position_embeddings": 32768, + "max_window_layers": 28, + "model_type": "qwen2", + "neuron": { + "_serialized_key": "NxDNeuronConfig", + "batch_size": 1, + "capacity_factor": null, + "checkpoint_id": "Qwen/Qwen2.5-7B-Instruct", + "checkpoint_revision": "a09a35458c702b33eeacc393d103063234e8bc28", + "continuous_batching": false, + "enable_bucketing": false, + "ep_degree": 1, + "fused_qkv": false, + "glu_mlp": true, + "local_ranks_size": 8, + "max_batch_size": 1, + "max_context_length": 4096, + "max_topk": 256, + "n_active_tokens": 4096, + "neuronxcc_version": "2.21.18209.0+043b1bf7", + "on_device_sampling": true, + "optimum_neuron_version": "0.4.0", + "output_logits": false, + "pp_degree": 1, + "sequence_length": 4096, + "speculation_length": 0, + "start_rank_id": 0, + "target": "trn1", + "torch_dtype": "bfloat16", + "tp_degree": 8 + }, + "num_attention_heads": 28, + "num_hidden_layers": 28, + "num_key_value_heads": 4, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 1000000.0, + "sliding_window": null, + "tie_word_embeddings": false, + "use_cache": true, + "use_sliding_window": false, + "vocab_size": 152064 +} \ No newline at end of file diff --git a/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen2/Qwen/Qwen2.5-7B-Instruct/85aeb3e82bb9189fa256.json b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen2/Qwen/Qwen2.5-7B-Instruct/85aeb3e82bb9189fa256.json new file mode 100644 index 0000000000000000000000000000000000000000..23996a547081dc3dce119f07724068645250b542 --- /dev/null +++ b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen2/Qwen/Qwen2.5-7B-Instruct/85aeb3e82bb9189fa256.json @@ -0,0 +1,85 @@ +{ + "_entry_class": "SingleModelCacheEntry", + "_model_id": "Qwen/Qwen2.5-7B-Instruct", + "_task": "text-generation", + "architectures": [ + "Qwen2ForCausalLM" + ], + "attention_dropout": 0.0, + "hidden_act": "silu", + "hidden_size": 3584, + "initializer_range": 0.02, + "intermediate_size": 18944, + "layer_types": [ + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention" + ], + "max_position_embeddings": 32768, + "max_window_layers": 28, + "model_type": "qwen2", + "neuron": { + "_serialized_key": "NxDNeuronConfig", + "batch_size": 8, + "capacity_factor": null, + "checkpoint_id": "Qwen/Qwen2.5-7B-Instruct", + "checkpoint_revision": "a09a35458c702b33eeacc393d103063234e8bc28", + "continuous_batching": true, + "enable_bucketing": false, + "ep_degree": 1, + "fused_qkv": false, + "glu_mlp": true, + "local_ranks_size": 2, + "max_batch_size": 8, + "max_context_length": 4096, + "max_topk": 256, + "n_active_tokens": 4096, + "neuronxcc_version": "2.21.18209.0+043b1bf7", + "on_device_sampling": false, + "optimum_neuron_version": "0.4.0", + "output_logits": false, + "pp_degree": 1, + "sequence_length": 4096, + "speculation_length": 0, + "start_rank_id": 0, + "target": "trn1", + "torch_dtype": "bfloat16", + "tp_degree": 2 + }, + "num_attention_heads": 28, + "num_hidden_layers": 28, + "num_key_value_heads": 4, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 1000000.0, + "sliding_window": null, + "tie_word_embeddings": false, + "use_cache": true, + "use_sliding_window": false, + "vocab_size": 152064 +} \ No newline at end of file diff --git a/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen2/Qwen/Qwen2.5-7B-Instruct/b256ce0e46280fedadb4.json b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen2/Qwen/Qwen2.5-7B-Instruct/b256ce0e46280fedadb4.json new file mode 100644 index 0000000000000000000000000000000000000000..fe2a29cda7f0d0a7f81014238bfd5ee20bc2de6f --- /dev/null +++ b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen2/Qwen/Qwen2.5-7B-Instruct/b256ce0e46280fedadb4.json @@ -0,0 +1,85 @@ +{ + "_entry_class": "SingleModelCacheEntry", + "_model_id": "Qwen/Qwen2.5-7B-Instruct", + "_task": "text-generation", + "architectures": [ + "Qwen2ForCausalLM" + ], + "attention_dropout": 0.0, + "hidden_act": "silu", + "hidden_size": 3584, + "initializer_range": 0.02, + "intermediate_size": 18944, + "layer_types": [ + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention" + ], + "max_position_embeddings": 32768, + "max_window_layers": 28, + "model_type": "qwen2", + "neuron": { + "_serialized_key": "NxDNeuronConfig", + "batch_size": 1, + "capacity_factor": null, + "checkpoint_id": "Qwen/Qwen2.5-7B-Instruct", + "checkpoint_revision": "a09a35458c702b33eeacc393d103063234e8bc28", + "continuous_batching": false, + "enable_bucketing": false, + "ep_degree": 1, + "fused_qkv": false, + "glu_mlp": true, + "local_ranks_size": 2, + "max_batch_size": 1, + "max_context_length": 4096, + "max_topk": 256, + "n_active_tokens": 4096, + "neuronxcc_version": "2.21.18209.0+043b1bf7", + "on_device_sampling": true, + "optimum_neuron_version": "0.4.0", + "output_logits": false, + "pp_degree": 1, + "sequence_length": 4096, + "speculation_length": 0, + "start_rank_id": 0, + "target": "trn1", + "torch_dtype": "bfloat16", + "tp_degree": 2 + }, + "num_attention_heads": 28, + "num_hidden_layers": 28, + "num_key_value_heads": 4, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 1000000.0, + "sliding_window": null, + "tie_word_embeddings": false, + "use_cache": true, + "use_sliding_window": false, + "vocab_size": 152064 +} \ No newline at end of file diff --git a/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen2/Qwen/Qwen2.5-7B-Instruct/bbe60ad043d0675f6bd9.json b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen2/Qwen/Qwen2.5-7B-Instruct/bbe60ad043d0675f6bd9.json new file mode 100644 index 0000000000000000000000000000000000000000..1b6d8048472985bacb9d5622fb4658ae2482467d --- /dev/null +++ b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen2/Qwen/Qwen2.5-7B-Instruct/bbe60ad043d0675f6bd9.json @@ -0,0 +1,85 @@ +{ + "_entry_class": "SingleModelCacheEntry", + "_model_id": "Qwen/Qwen2.5-7B-Instruct", + "_task": "text-generation", + "architectures": [ + "Qwen2ForCausalLM" + ], + "attention_dropout": 0.0, + "hidden_act": "silu", + "hidden_size": 3584, + "initializer_range": 0.02, + "intermediate_size": 18944, + "layer_types": [ + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention" + ], + "max_position_embeddings": 32768, + "max_window_layers": 28, + "model_type": "qwen2", + "neuron": { + "_serialized_key": "NxDNeuronConfig", + "batch_size": 32, + "capacity_factor": null, + "checkpoint_id": "Qwen/Qwen2.5-7B-Instruct", + "checkpoint_revision": "a09a35458c702b33eeacc393d103063234e8bc28", + "continuous_batching": true, + "enable_bucketing": false, + "ep_degree": 1, + "fused_qkv": false, + "glu_mlp": true, + "local_ranks_size": 8, + "max_batch_size": 32, + "max_context_length": 4096, + "max_topk": 256, + "n_active_tokens": 4096, + "neuronxcc_version": "2.21.18209.0+043b1bf7", + "on_device_sampling": true, + "optimum_neuron_version": "0.4.0", + "output_logits": false, + "pp_degree": 1, + "sequence_length": 4096, + "speculation_length": 0, + "start_rank_id": 0, + "target": "trn1", + "torch_dtype": "bfloat16", + "tp_degree": 8 + }, + "num_attention_heads": 28, + "num_hidden_layers": 28, + "num_key_value_heads": 4, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 1000000.0, + "sliding_window": null, + "tie_word_embeddings": false, + "use_cache": true, + "use_sliding_window": false, + "vocab_size": 152064 +} \ No newline at end of file diff --git a/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen2/Qwen/Qwen2.5-7B-Instruct/f5f95ccf7b688a5d5891.json b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen2/Qwen/Qwen2.5-7B-Instruct/f5f95ccf7b688a5d5891.json new file mode 100644 index 0000000000000000000000000000000000000000..331782ef0836c96c86b1e7b696af0cbd12f7c690 --- /dev/null +++ b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen2/Qwen/Qwen2.5-7B-Instruct/f5f95ccf7b688a5d5891.json @@ -0,0 +1,85 @@ +{ + "_entry_class": "SingleModelCacheEntry", + "_model_id": "Qwen/Qwen2.5-7B-Instruct", + "_task": "text-generation", + "architectures": [ + "Qwen2ForCausalLM" + ], + "attention_dropout": 0.0, + "hidden_act": "silu", + "hidden_size": 3584, + "initializer_range": 0.02, + "intermediate_size": 18944, + "layer_types": [ + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention" + ], + "max_position_embeddings": 32768, + "max_window_layers": 28, + "model_type": "qwen2", + "neuron": { + "_serialized_key": "NxDNeuronConfig", + "batch_size": 4, + "capacity_factor": null, + "checkpoint_id": "Qwen/Qwen2.5-7B-Instruct", + "checkpoint_revision": "a09a35458c702b33eeacc393d103063234e8bc28", + "continuous_batching": true, + "enable_bucketing": false, + "ep_degree": 1, + "fused_qkv": false, + "glu_mlp": true, + "local_ranks_size": 2, + "max_batch_size": 4, + "max_context_length": 4096, + "max_topk": 256, + "n_active_tokens": 4096, + "neuronxcc_version": "2.21.18209.0+043b1bf7", + "on_device_sampling": false, + "optimum_neuron_version": "0.4.0", + "output_logits": false, + "pp_degree": 1, + "sequence_length": 4096, + "speculation_length": 0, + "start_rank_id": 0, + "target": "trn1", + "torch_dtype": "bfloat16", + "tp_degree": 2 + }, + "num_attention_heads": 28, + "num_hidden_layers": 28, + "num_key_value_heads": 4, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 1000000.0, + "sliding_window": null, + "tie_word_embeddings": false, + "use_cache": true, + "use_sliding_window": false, + "vocab_size": 152064 +} \ No newline at end of file diff --git a/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen2/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B/620024a33245674faae6.json b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen2/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B/620024a33245674faae6.json new file mode 100644 index 0000000000000000000000000000000000000000..7798b445b14170988cd5e13bda886c9a184355bb --- /dev/null +++ b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen2/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B/620024a33245674faae6.json @@ -0,0 +1,86 @@ +{ + "_entry_class": "SingleModelCacheEntry", + "_model_id": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B", + "_task": "text-generation", + "architectures": [ + "Qwen2ForCausalLM" + ], + "attention_dropout": 0.0, + "hidden_act": "silu", + "hidden_size": 1536, + "initializer_range": 0.02, + "intermediate_size": 8960, + "layer_types": [ + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention" + ], + "max_position_embeddings": 131072, + "max_window_layers": 21, + "model_type": "qwen2", + "neuron": { + "_serialized_key": "NxDNeuronConfig", + "batch_size": 1, + "capacity_factor": null, + "checkpoint_id": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B", + "checkpoint_revision": "ad9f0ae0864d7fbcd1cd905e3c6c5b069cc8b562", + "continuous_batching": false, + "enable_bucketing": false, + "ep_degree": 1, + "fused_qkv": false, + "glu_mlp": true, + "local_ranks_size": 2, + "max_batch_size": 1, + "max_context_length": 4096, + "max_topk": 256, + "n_active_tokens": 4096, + "neuronxcc_version": "2.21.18209.0+043b1bf7", + "on_device_sampling": true, + "optimum_neuron_version": "0.4.0", + "output_logits": false, + "pp_degree": 1, + "sequence_length": 4096, + "speculation_length": 0, + "start_rank_id": 0, + "target": "trn1", + "torch_dtype": "bfloat16", + "tp_degree": 2 + }, + "num_attention_heads": 12, + "num_hidden_layers": 28, + "num_key_value_heads": 2, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 10000, + "sliding_window": null, + "tie_word_embeddings": false, + "use_cache": true, + "use_mrope": false, + "use_sliding_window": false, + "vocab_size": 151936 +} \ No newline at end of file diff --git a/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen2/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B/e4a7cd4f6d3a0a6376e6.json b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen2/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B/e4a7cd4f6d3a0a6376e6.json new file mode 100644 index 0000000000000000000000000000000000000000..45077bc825448945035e8f29be351cc07baa3c93 --- /dev/null +++ b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen2/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B/e4a7cd4f6d3a0a6376e6.json @@ -0,0 +1,86 @@ +{ + "_entry_class": "SingleModelCacheEntry", + "_model_id": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B", + "_task": "text-generation", + "architectures": [ + "Qwen2ForCausalLM" + ], + "attention_dropout": 0.0, + "hidden_act": "silu", + "hidden_size": 1536, + "initializer_range": 0.02, + "intermediate_size": 8960, + "layer_types": [ + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention" + ], + "max_position_embeddings": 131072, + "max_window_layers": 21, + "model_type": "qwen2", + "neuron": { + "_serialized_key": "NxDNeuronConfig", + "batch_size": 4, + "capacity_factor": null, + "checkpoint_id": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B", + "checkpoint_revision": "ad9f0ae0864d7fbcd1cd905e3c6c5b069cc8b562", + "continuous_batching": true, + "enable_bucketing": false, + "ep_degree": 1, + "fused_qkv": false, + "glu_mlp": true, + "local_ranks_size": 2, + "max_batch_size": 4, + "max_context_length": 4096, + "max_topk": 256, + "n_active_tokens": 4096, + "neuronxcc_version": "2.21.18209.0+043b1bf7", + "on_device_sampling": false, + "optimum_neuron_version": "0.4.0", + "output_logits": false, + "pp_degree": 1, + "sequence_length": 4096, + "speculation_length": 0, + "start_rank_id": 0, + "target": "trn1", + "torch_dtype": "bfloat16", + "tp_degree": 2 + }, + "num_attention_heads": 12, + "num_hidden_layers": 28, + "num_key_value_heads": 2, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 10000, + "sliding_window": null, + "tie_word_embeddings": false, + "use_cache": true, + "use_mrope": false, + "use_sliding_window": false, + "vocab_size": 151936 +} \ No newline at end of file diff --git a/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen2/deepseek-ai/DeepSeek-R1-Distill-Qwen-14B/103e1d0b4a1ea69c88de.json b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen2/deepseek-ai/DeepSeek-R1-Distill-Qwen-14B/103e1d0b4a1ea69c88de.json new file mode 100644 index 0000000000000000000000000000000000000000..3f6cd47f9f54410b307e57c352c2f0b4dc089c66 --- /dev/null +++ b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen2/deepseek-ai/DeepSeek-R1-Distill-Qwen-14B/103e1d0b4a1ea69c88de.json @@ -0,0 +1,105 @@ +{ + "_entry_class": "SingleModelCacheEntry", + "_model_id": "deepseek-ai/DeepSeek-R1-Distill-Qwen-14B", + "_task": "text-generation", + "architectures": [ + "Qwen2ForCausalLM" + ], + "attention_dropout": 0.0, + "hidden_act": "silu", + "hidden_size": 5120, + "initializer_range": 0.02, + "intermediate_size": 13824, + "layer_types": [ + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention" + ], + "max_position_embeddings": 131072, + "max_window_layers": 48, + "model_type": "qwen2", + "neuron": { + "_serialized_key": "NxDNeuronConfig", + "batch_size": 1, + "capacity_factor": null, + "checkpoint_id": "deepseek-ai/DeepSeek-R1-Distill-Qwen-14B", + "checkpoint_revision": "1df8507178afcc1bef68cd8c393f61a886323761", + "continuous_batching": false, + "enable_bucketing": false, + "ep_degree": 1, + "fused_qkv": false, + "glu_mlp": true, + "local_ranks_size": 8, + "max_batch_size": 1, + "max_context_length": 4096, + "max_topk": 256, + "n_active_tokens": 4096, + "neuronxcc_version": "2.21.18209.0+043b1bf7", + "on_device_sampling": true, + "optimum_neuron_version": "0.4.0", + "output_logits": false, + "pp_degree": 1, + "sequence_length": 4096, + "speculation_length": 0, + "start_rank_id": 0, + "target": "trn1", + "torch_dtype": "bfloat16", + "tp_degree": 8 + }, + "num_attention_heads": 40, + "num_hidden_layers": 48, + "num_key_value_heads": 8, + "rms_norm_eps": 1e-05, + "rope_scaling": null, + "rope_theta": 1000000.0, + "sliding_window": null, + "tie_word_embeddings": false, + "use_cache": true, + "use_sliding_window": false, + "vocab_size": 152064 +} \ No newline at end of file diff --git a/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen2/deepseek-ai/DeepSeek-R1-Distill-Qwen-14B/117c3927e485f748b0b3.json b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen2/deepseek-ai/DeepSeek-R1-Distill-Qwen-14B/117c3927e485f748b0b3.json new file mode 100644 index 0000000000000000000000000000000000000000..06dac799ff8fc66c5dc9f6f4ceadb2a9a4938fa8 --- /dev/null +++ b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen2/deepseek-ai/DeepSeek-R1-Distill-Qwen-14B/117c3927e485f748b0b3.json @@ -0,0 +1,105 @@ +{ + "_entry_class": "SingleModelCacheEntry", + "_model_id": "deepseek-ai/DeepSeek-R1-Distill-Qwen-14B", + "_task": "text-generation", + "architectures": [ + "Qwen2ForCausalLM" + ], + "attention_dropout": 0.0, + "hidden_act": "silu", + "hidden_size": 5120, + "initializer_range": 0.02, + "intermediate_size": 13824, + "layer_types": [ + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention" + ], + "max_position_embeddings": 131072, + "max_window_layers": 48, + "model_type": "qwen2", + "neuron": { + "_serialized_key": "NxDNeuronConfig", + "batch_size": 16, + "capacity_factor": null, + "checkpoint_id": "deepseek-ai/DeepSeek-R1-Distill-Qwen-14B", + "checkpoint_revision": "1df8507178afcc1bef68cd8c393f61a886323761", + "continuous_batching": true, + "enable_bucketing": false, + "ep_degree": 1, + "fused_qkv": false, + "glu_mlp": true, + "local_ranks_size": 8, + "max_batch_size": 16, + "max_context_length": 4096, + "max_topk": 256, + "n_active_tokens": 4096, + "neuronxcc_version": "2.21.18209.0+043b1bf7", + "on_device_sampling": true, + "optimum_neuron_version": "0.4.0", + "output_logits": false, + "pp_degree": 1, + "sequence_length": 4096, + "speculation_length": 0, + "start_rank_id": 0, + "target": "trn1", + "torch_dtype": "bfloat16", + "tp_degree": 8 + }, + "num_attention_heads": 40, + "num_hidden_layers": 48, + "num_key_value_heads": 8, + "rms_norm_eps": 1e-05, + "rope_scaling": null, + "rope_theta": 1000000.0, + "sliding_window": null, + "tie_word_embeddings": false, + "use_cache": true, + "use_sliding_window": false, + "vocab_size": 152064 +} \ No newline at end of file diff --git a/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen2/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B/6e1e7be6896b12bfe647.json b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen2/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B/6e1e7be6896b12bfe647.json new file mode 100644 index 0000000000000000000000000000000000000000..ad946f5f488a1d6c161b785bb177ad86b559774b --- /dev/null +++ b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen2/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B/6e1e7be6896b12bfe647.json @@ -0,0 +1,121 @@ +{ + "_entry_class": "SingleModelCacheEntry", + "_model_id": "deepseek-ai/DeepSeek-R1-Distill-Qwen-32B", + "_task": "text-generation", + "architectures": [ + "Qwen2ForCausalLM" + ], + "attention_dropout": 0.0, + "hidden_act": "silu", + "hidden_size": 5120, + "initializer_range": 0.02, + "intermediate_size": 27648, + "layer_types": [ + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention" + ], + "max_position_embeddings": 131072, + "max_window_layers": 64, + "model_type": "qwen2", + "neuron": { + "_serialized_key": "NxDNeuronConfig", + "batch_size": 1, + "capacity_factor": null, + "checkpoint_id": "deepseek-ai/DeepSeek-R1-Distill-Qwen-32B", + "checkpoint_revision": "711ad2ea6aa40cfca18895e8aca02ab92df1a746", + "continuous_batching": false, + "enable_bucketing": false, + "ep_degree": 1, + "fused_qkv": false, + "glu_mlp": true, + "local_ranks_size": 8, + "max_batch_size": 1, + "max_context_length": 4096, + "max_topk": 256, + "n_active_tokens": 4096, + "neuronxcc_version": "2.21.18209.0+043b1bf7", + "on_device_sampling": true, + "optimum_neuron_version": "0.4.0", + "output_logits": false, + "pp_degree": 1, + "sequence_length": 4096, + "speculation_length": 0, + "start_rank_id": 0, + "target": "trn1", + "torch_dtype": "bfloat16", + "tp_degree": 8 + }, + "num_attention_heads": 40, + "num_hidden_layers": 64, + "num_key_value_heads": 8, + "rms_norm_eps": 1e-05, + "rope_scaling": null, + "rope_theta": 1000000.0, + "sliding_window": null, + "tie_word_embeddings": false, + "use_cache": true, + "use_sliding_window": false, + "vocab_size": 152064 +} \ No newline at end of file diff --git a/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen2/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B/92ac2670e24578b76628.json b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen2/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B/92ac2670e24578b76628.json new file mode 100644 index 0000000000000000000000000000000000000000..7aaf0529953055f78884e94eba951ea5c3b704b9 --- /dev/null +++ b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen2/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B/92ac2670e24578b76628.json @@ -0,0 +1,121 @@ +{ + "_entry_class": "SingleModelCacheEntry", + "_model_id": "deepseek-ai/DeepSeek-R1-Distill-Qwen-32B", + "_task": "text-generation", + "architectures": [ + "Qwen2ForCausalLM" + ], + "attention_dropout": 0.0, + "hidden_act": "silu", + "hidden_size": 5120, + "initializer_range": 0.02, + "intermediate_size": 27648, + "layer_types": [ + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention" + ], + "max_position_embeddings": 131072, + "max_window_layers": 64, + "model_type": "qwen2", + "neuron": { + "_serialized_key": "NxDNeuronConfig", + "batch_size": 8, + "capacity_factor": null, + "checkpoint_id": "deepseek-ai/DeepSeek-R1-Distill-Qwen-32B", + "checkpoint_revision": "711ad2ea6aa40cfca18895e8aca02ab92df1a746", + "continuous_batching": true, + "enable_bucketing": false, + "ep_degree": 1, + "fused_qkv": false, + "glu_mlp": true, + "local_ranks_size": 8, + "max_batch_size": 8, + "max_context_length": 4096, + "max_topk": 256, + "n_active_tokens": 4096, + "neuronxcc_version": "2.21.18209.0+043b1bf7", + "on_device_sampling": true, + "optimum_neuron_version": "0.4.0", + "output_logits": false, + "pp_degree": 1, + "sequence_length": 4096, + "speculation_length": 0, + "start_rank_id": 0, + "target": "trn1", + "torch_dtype": "bfloat16", + "tp_degree": 8 + }, + "num_attention_heads": 40, + "num_hidden_layers": 64, + "num_key_value_heads": 8, + "rms_norm_eps": 1e-05, + "rope_scaling": null, + "rope_theta": 1000000.0, + "sliding_window": null, + "tie_word_embeddings": false, + "use_cache": true, + "use_sliding_window": false, + "vocab_size": 152064 +} \ No newline at end of file diff --git a/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen2/deepseek-ai/DeepSeek-R1-Distill-Qwen-7B/4fc34a0fe0b2ddb70eeb.json b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen2/deepseek-ai/DeepSeek-R1-Distill-Qwen-7B/4fc34a0fe0b2ddb70eeb.json new file mode 100644 index 0000000000000000000000000000000000000000..0dcc90ba297937bf1237a4e57c9937219a7f58df --- /dev/null +++ b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen2/deepseek-ai/DeepSeek-R1-Distill-Qwen-7B/4fc34a0fe0b2ddb70eeb.json @@ -0,0 +1,86 @@ +{ + "_entry_class": "SingleModelCacheEntry", + "_model_id": "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B", + "_task": "text-generation", + "architectures": [ + "Qwen2ForCausalLM" + ], + "attention_dropout": 0.0, + "hidden_act": "silu", + "hidden_size": 3584, + "initializer_range": 0.02, + "intermediate_size": 18944, + "layer_types": [ + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention" + ], + "max_position_embeddings": 131072, + "max_window_layers": 28, + "model_type": "qwen2", + "neuron": { + "_serialized_key": "NxDNeuronConfig", + "batch_size": 1, + "capacity_factor": null, + "checkpoint_id": "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B", + "checkpoint_revision": "916b56a44061fd5cd7d6a8fb632557ed4f724f60", + "continuous_batching": false, + "enable_bucketing": false, + "ep_degree": 1, + "fused_qkv": false, + "glu_mlp": true, + "local_ranks_size": 2, + "max_batch_size": 1, + "max_context_length": 4096, + "max_topk": 256, + "n_active_tokens": 4096, + "neuronxcc_version": "2.21.18209.0+043b1bf7", + "on_device_sampling": true, + "optimum_neuron_version": "0.4.0", + "output_logits": false, + "pp_degree": 1, + "sequence_length": 4096, + "speculation_length": 0, + "start_rank_id": 0, + "target": "trn1", + "torch_dtype": "bfloat16", + "tp_degree": 2 + }, + "num_attention_heads": 28, + "num_hidden_layers": 28, + "num_key_value_heads": 4, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 10000, + "sliding_window": null, + "tie_word_embeddings": false, + "use_cache": true, + "use_mrope": false, + "use_sliding_window": false, + "vocab_size": 152064 +} \ No newline at end of file diff --git a/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen2/deepseek-ai/DeepSeek-R1-Distill-Qwen-7B/6da6a64ceddfb46291c2.json b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen2/deepseek-ai/DeepSeek-R1-Distill-Qwen-7B/6da6a64ceddfb46291c2.json new file mode 100644 index 0000000000000000000000000000000000000000..c7d3d1fb1ee5284808a496beccf6c3ae992a0f59 --- /dev/null +++ b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen2/deepseek-ai/DeepSeek-R1-Distill-Qwen-7B/6da6a64ceddfb46291c2.json @@ -0,0 +1,86 @@ +{ + "_entry_class": "SingleModelCacheEntry", + "_model_id": "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B", + "_task": "text-generation", + "architectures": [ + "Qwen2ForCausalLM" + ], + "attention_dropout": 0.0, + "hidden_act": "silu", + "hidden_size": 3584, + "initializer_range": 0.02, + "intermediate_size": 18944, + "layer_types": [ + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention" + ], + "max_position_embeddings": 131072, + "max_window_layers": 28, + "model_type": "qwen2", + "neuron": { + "_serialized_key": "NxDNeuronConfig", + "batch_size": 1, + "capacity_factor": null, + "checkpoint_id": "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B", + "checkpoint_revision": "916b56a44061fd5cd7d6a8fb632557ed4f724f60", + "continuous_batching": false, + "enable_bucketing": false, + "ep_degree": 1, + "fused_qkv": false, + "glu_mlp": true, + "local_ranks_size": 8, + "max_batch_size": 1, + "max_context_length": 4096, + "max_topk": 256, + "n_active_tokens": 4096, + "neuronxcc_version": "2.21.18209.0+043b1bf7", + "on_device_sampling": true, + "optimum_neuron_version": "0.4.0", + "output_logits": false, + "pp_degree": 1, + "sequence_length": 4096, + "speculation_length": 0, + "start_rank_id": 0, + "target": "trn1", + "torch_dtype": "bfloat16", + "tp_degree": 8 + }, + "num_attention_heads": 28, + "num_hidden_layers": 28, + "num_key_value_heads": 4, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 10000, + "sliding_window": null, + "tie_word_embeddings": false, + "use_cache": true, + "use_mrope": false, + "use_sliding_window": false, + "vocab_size": 152064 +} \ No newline at end of file diff --git a/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen2/deepseek-ai/DeepSeek-R1-Distill-Qwen-7B/a369790ad746f4be941f.json b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen2/deepseek-ai/DeepSeek-R1-Distill-Qwen-7B/a369790ad746f4be941f.json new file mode 100644 index 0000000000000000000000000000000000000000..e52f0e220b5564e603433d2472ba4a01c62f1edb --- /dev/null +++ b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen2/deepseek-ai/DeepSeek-R1-Distill-Qwen-7B/a369790ad746f4be941f.json @@ -0,0 +1,86 @@ +{ + "_entry_class": "SingleModelCacheEntry", + "_model_id": "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B", + "_task": "text-generation", + "architectures": [ + "Qwen2ForCausalLM" + ], + "attention_dropout": 0.0, + "hidden_act": "silu", + "hidden_size": 3584, + "initializer_range": 0.02, + "intermediate_size": 18944, + "layer_types": [ + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention" + ], + "max_position_embeddings": 131072, + "max_window_layers": 28, + "model_type": "qwen2", + "neuron": { + "_serialized_key": "NxDNeuronConfig", + "batch_size": 4, + "capacity_factor": null, + "checkpoint_id": "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B", + "checkpoint_revision": "916b56a44061fd5cd7d6a8fb632557ed4f724f60", + "continuous_batching": true, + "enable_bucketing": false, + "ep_degree": 1, + "fused_qkv": false, + "glu_mlp": true, + "local_ranks_size": 2, + "max_batch_size": 4, + "max_context_length": 4096, + "max_topk": 256, + "n_active_tokens": 4096, + "neuronxcc_version": "2.21.18209.0+043b1bf7", + "on_device_sampling": false, + "optimum_neuron_version": "0.4.0", + "output_logits": false, + "pp_degree": 1, + "sequence_length": 4096, + "speculation_length": 0, + "start_rank_id": 0, + "target": "trn1", + "torch_dtype": "bfloat16", + "tp_degree": 2 + }, + "num_attention_heads": 28, + "num_hidden_layers": 28, + "num_key_value_heads": 4, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 10000, + "sliding_window": null, + "tie_word_embeddings": false, + "use_cache": true, + "use_mrope": false, + "use_sliding_window": false, + "vocab_size": 152064 +} \ No newline at end of file diff --git a/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen2/deepseek-ai/DeepSeek-R1-Distill-Qwen-7B/c03ab0614c104d0ae1dd.json b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen2/deepseek-ai/DeepSeek-R1-Distill-Qwen-7B/c03ab0614c104d0ae1dd.json new file mode 100644 index 0000000000000000000000000000000000000000..163e26e21cbde207f2c9ea69c3e86029b7b890a2 --- /dev/null +++ b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen2/deepseek-ai/DeepSeek-R1-Distill-Qwen-7B/c03ab0614c104d0ae1dd.json @@ -0,0 +1,86 @@ +{ + "_entry_class": "SingleModelCacheEntry", + "_model_id": "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B", + "_task": "text-generation", + "architectures": [ + "Qwen2ForCausalLM" + ], + "attention_dropout": 0.0, + "hidden_act": "silu", + "hidden_size": 3584, + "initializer_range": 0.02, + "intermediate_size": 18944, + "layer_types": [ + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention" + ], + "max_position_embeddings": 131072, + "max_window_layers": 28, + "model_type": "qwen2", + "neuron": { + "_serialized_key": "NxDNeuronConfig", + "batch_size": 32, + "capacity_factor": null, + "checkpoint_id": "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B", + "checkpoint_revision": "916b56a44061fd5cd7d6a8fb632557ed4f724f60", + "continuous_batching": true, + "enable_bucketing": false, + "ep_degree": 1, + "fused_qkv": false, + "glu_mlp": true, + "local_ranks_size": 8, + "max_batch_size": 32, + "max_context_length": 4096, + "max_topk": 256, + "n_active_tokens": 4096, + "neuronxcc_version": "2.21.18209.0+043b1bf7", + "on_device_sampling": true, + "optimum_neuron_version": "0.4.0", + "output_logits": false, + "pp_degree": 1, + "sequence_length": 4096, + "speculation_length": 0, + "start_rank_id": 0, + "target": "trn1", + "torch_dtype": "bfloat16", + "tp_degree": 8 + }, + "num_attention_heads": 28, + "num_hidden_layers": 28, + "num_key_value_heads": 4, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 10000, + "sliding_window": null, + "tie_word_embeddings": false, + "use_cache": true, + "use_mrope": false, + "use_sliding_window": false, + "vocab_size": 152064 +} \ No newline at end of file diff --git a/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen2/deepseek-ai/DeepSeek-R1-Distill-Qwen-7B/f599c0c1b0d07420a759.json b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen2/deepseek-ai/DeepSeek-R1-Distill-Qwen-7B/f599c0c1b0d07420a759.json new file mode 100644 index 0000000000000000000000000000000000000000..bbc8f793b0d705f7fa295e47d6e3d1dda9433eb9 --- /dev/null +++ b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen2/deepseek-ai/DeepSeek-R1-Distill-Qwen-7B/f599c0c1b0d07420a759.json @@ -0,0 +1,86 @@ +{ + "_entry_class": "SingleModelCacheEntry", + "_model_id": "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B", + "_task": "text-generation", + "architectures": [ + "Qwen2ForCausalLM" + ], + "attention_dropout": 0.0, + "hidden_act": "silu", + "hidden_size": 3584, + "initializer_range": 0.02, + "intermediate_size": 18944, + "layer_types": [ + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention" + ], + "max_position_embeddings": 131072, + "max_window_layers": 28, + "model_type": "qwen2", + "neuron": { + "_serialized_key": "NxDNeuronConfig", + "batch_size": 8, + "capacity_factor": null, + "checkpoint_id": "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B", + "checkpoint_revision": "916b56a44061fd5cd7d6a8fb632557ed4f724f60", + "continuous_batching": true, + "enable_bucketing": false, + "ep_degree": 1, + "fused_qkv": false, + "glu_mlp": true, + "local_ranks_size": 2, + "max_batch_size": 8, + "max_context_length": 4096, + "max_topk": 256, + "n_active_tokens": 4096, + "neuronxcc_version": "2.21.18209.0+043b1bf7", + "on_device_sampling": false, + "optimum_neuron_version": "0.4.0", + "output_logits": false, + "pp_degree": 1, + "sequence_length": 4096, + "speculation_length": 0, + "start_rank_id": 0, + "target": "trn1", + "torch_dtype": "bfloat16", + "tp_degree": 2 + }, + "num_attention_heads": 28, + "num_hidden_layers": 28, + "num_key_value_heads": 4, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 10000, + "sliding_window": null, + "tie_word_embeddings": false, + "use_cache": true, + "use_mrope": false, + "use_sliding_window": false, + "vocab_size": 152064 +} \ No newline at end of file diff --git a/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen3/Qwen/Qwen3-0.6B/9cb4695f65b75935af10.json b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen3/Qwen/Qwen3-0.6B/9cb4695f65b75935af10.json new file mode 100644 index 0000000000000000000000000000000000000000..8dc313cd69109ec8d84eba7b7e2e78be33cf6678 --- /dev/null +++ b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen3/Qwen/Qwen3-0.6B/9cb4695f65b75935af10.json @@ -0,0 +1,87 @@ +{ + "_entry_class": "SingleModelCacheEntry", + "_model_id": "Qwen/Qwen3-0.6B", + "_task": "text-generation", + "architectures": [ + "Qwen3ForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 1024, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_types": [ + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention" + ], + "max_position_embeddings": 40960, + "max_window_layers": 28, + "model_type": "qwen3", + "neuron": { + "_serialized_key": "NxDNeuronConfig", + "batch_size": 1, + "capacity_factor": null, + "checkpoint_id": "Qwen/Qwen3-0.6B", + "checkpoint_revision": "c1899de289a04d12100db370d81485cdf75e47ca", + "continuous_batching": false, + "enable_bucketing": false, + "ep_degree": 1, + "fused_qkv": true, + "glu_mlp": true, + "local_ranks_size": 2, + "max_batch_size": 1, + "max_context_length": 4096, + "max_topk": 256, + "n_active_tokens": 4096, + "neuronxcc_version": "2.21.18209.0+043b1bf7", + "on_device_sampling": true, + "optimum_neuron_version": "0.4.0", + "output_logits": false, + "pp_degree": 1, + "sequence_length": 4096, + "speculation_length": 0, + "start_rank_id": 0, + "target": "trn1", + "torch_dtype": "bfloat16", + "tp_degree": 2 + }, + "num_attention_heads": 16, + "num_hidden_layers": 28, + "num_key_value_heads": 8, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 1000000, + "sliding_window": null, + "tie_word_embeddings": true, + "use_cache": true, + "use_sliding_window": false, + "vocab_size": 151936 +} \ No newline at end of file diff --git a/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen3/Qwen/Qwen3-1.7B/17609b4e638ec72b48fc.json b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen3/Qwen/Qwen3-1.7B/17609b4e638ec72b48fc.json new file mode 100644 index 0000000000000000000000000000000000000000..af77927bfd6c7c09d3992cf899a917f7d22e18e2 --- /dev/null +++ b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen3/Qwen/Qwen3-1.7B/17609b4e638ec72b48fc.json @@ -0,0 +1,87 @@ +{ + "_entry_class": "SingleModelCacheEntry", + "_model_id": "Qwen/Qwen3-1.7B", + "_task": "text-generation", + "architectures": [ + "Qwen3ForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 2048, + "initializer_range": 0.02, + "intermediate_size": 6144, + "layer_types": [ + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention" + ], + "max_position_embeddings": 40960, + "max_window_layers": 28, + "model_type": "qwen3", + "neuron": { + "_serialized_key": "NxDNeuronConfig", + "batch_size": 4, + "capacity_factor": null, + "checkpoint_id": "Qwen/Qwen3-1.7B", + "checkpoint_revision": "70d244cc86ccca08cf5af4e1e306ecf908b1ad5e", + "continuous_batching": true, + "enable_bucketing": false, + "ep_degree": 1, + "fused_qkv": true, + "glu_mlp": true, + "local_ranks_size": 2, + "max_batch_size": 4, + "max_context_length": 4096, + "max_topk": 256, + "n_active_tokens": 4096, + "neuronxcc_version": "2.21.18209.0+043b1bf7", + "on_device_sampling": false, + "optimum_neuron_version": "0.4.0", + "output_logits": false, + "pp_degree": 1, + "sequence_length": 4096, + "speculation_length": 0, + "start_rank_id": 0, + "target": "trn1", + "torch_dtype": "bfloat16", + "tp_degree": 2 + }, + "num_attention_heads": 16, + "num_hidden_layers": 28, + "num_key_value_heads": 8, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 1000000, + "sliding_window": null, + "tie_word_embeddings": true, + "use_cache": true, + "use_sliding_window": false, + "vocab_size": 151936 +} \ No newline at end of file diff --git a/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen3/Qwen/Qwen3-1.7B/e3288048cb22fcf1ff30.json b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen3/Qwen/Qwen3-1.7B/e3288048cb22fcf1ff30.json new file mode 100644 index 0000000000000000000000000000000000000000..d23807c13c913c3133f5d39c55bfc1e1e92fdea1 --- /dev/null +++ b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen3/Qwen/Qwen3-1.7B/e3288048cb22fcf1ff30.json @@ -0,0 +1,87 @@ +{ + "_entry_class": "SingleModelCacheEntry", + "_model_id": "Qwen/Qwen3-1.7B", + "_task": "text-generation", + "architectures": [ + "Qwen3ForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 2048, + "initializer_range": 0.02, + "intermediate_size": 6144, + "layer_types": [ + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention" + ], + "max_position_embeddings": 40960, + "max_window_layers": 28, + "model_type": "qwen3", + "neuron": { + "_serialized_key": "NxDNeuronConfig", + "batch_size": 1, + "capacity_factor": null, + "checkpoint_id": "Qwen/Qwen3-1.7B", + "checkpoint_revision": "70d244cc86ccca08cf5af4e1e306ecf908b1ad5e", + "continuous_batching": false, + "enable_bucketing": false, + "ep_degree": 1, + "fused_qkv": true, + "glu_mlp": true, + "local_ranks_size": 2, + "max_batch_size": 1, + "max_context_length": 4096, + "max_topk": 256, + "n_active_tokens": 4096, + "neuronxcc_version": "2.21.18209.0+043b1bf7", + "on_device_sampling": true, + "optimum_neuron_version": "0.4.0", + "output_logits": false, + "pp_degree": 1, + "sequence_length": 4096, + "speculation_length": 0, + "start_rank_id": 0, + "target": "trn1", + "torch_dtype": "bfloat16", + "tp_degree": 2 + }, + "num_attention_heads": 16, + "num_hidden_layers": 28, + "num_key_value_heads": 8, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 1000000, + "sliding_window": null, + "tie_word_embeddings": true, + "use_cache": true, + "use_sliding_window": false, + "vocab_size": 151936 +} \ No newline at end of file diff --git a/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen3/Qwen/Qwen3-14B/0e78320275316eedde3d.json b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen3/Qwen/Qwen3-14B/0e78320275316eedde3d.json new file mode 100644 index 0000000000000000000000000000000000000000..41efb12c90a8727fbf8877130df09aa54b84cf8c --- /dev/null +++ b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen3/Qwen/Qwen3-14B/0e78320275316eedde3d.json @@ -0,0 +1,99 @@ +{ + "_entry_class": "SingleModelCacheEntry", + "_model_id": "Qwen/Qwen3-14B", + "_task": "text-generation", + "architectures": [ + "Qwen3ForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 5120, + "initializer_range": 0.02, + "intermediate_size": 17408, + "layer_types": [ + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention" + ], + "max_position_embeddings": 40960, + "max_window_layers": 40, + "model_type": "qwen3", + "neuron": { + "_serialized_key": "NxDNeuronConfig", + "batch_size": 1, + "capacity_factor": null, + "checkpoint_id": "Qwen/Qwen3-14B", + "checkpoint_revision": "40c069824f4251a91eefaf281ebe4c544efd3e18", + "continuous_batching": false, + "enable_bucketing": false, + "ep_degree": 1, + "fused_qkv": true, + "glu_mlp": true, + "local_ranks_size": 8, + "max_batch_size": 1, + "max_context_length": 4096, + "max_topk": 256, + "n_active_tokens": 4096, + "neuronxcc_version": "2.21.18209.0+043b1bf7", + "on_device_sampling": true, + "optimum_neuron_version": "0.4.0", + "output_logits": false, + "pp_degree": 1, + "sequence_length": 4096, + "speculation_length": 0, + "start_rank_id": 0, + "target": "trn1", + "torch_dtype": "bfloat16", + "tp_degree": 8 + }, + "num_attention_heads": 40, + "num_hidden_layers": 40, + "num_key_value_heads": 8, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 1000000, + "sliding_window": null, + "tie_word_embeddings": false, + "use_cache": true, + "use_sliding_window": false, + "vocab_size": 151936 +} \ No newline at end of file diff --git a/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen3/Qwen/Qwen3-14B/92b2b64b6f8c31d96d6b.json b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen3/Qwen/Qwen3-14B/92b2b64b6f8c31d96d6b.json new file mode 100644 index 0000000000000000000000000000000000000000..c247d4dd6a12a985edecfda95a020310fe27908c --- /dev/null +++ b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen3/Qwen/Qwen3-14B/92b2b64b6f8c31d96d6b.json @@ -0,0 +1,99 @@ +{ + "_entry_class": "SingleModelCacheEntry", + "_model_id": "Qwen/Qwen3-14B", + "_task": "text-generation", + "architectures": [ + "Qwen3ForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 5120, + "initializer_range": 0.02, + "intermediate_size": 17408, + "layer_types": [ + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention" + ], + "max_position_embeddings": 40960, + "max_window_layers": 40, + "model_type": "qwen3", + "neuron": { + "_serialized_key": "NxDNeuronConfig", + "batch_size": 16, + "capacity_factor": null, + "checkpoint_id": "Qwen/Qwen3-14B", + "checkpoint_revision": "40c069824f4251a91eefaf281ebe4c544efd3e18", + "continuous_batching": true, + "enable_bucketing": false, + "ep_degree": 1, + "fused_qkv": true, + "glu_mlp": true, + "local_ranks_size": 8, + "max_batch_size": 16, + "max_context_length": 4096, + "max_topk": 256, + "n_active_tokens": 4096, + "neuronxcc_version": "2.21.18209.0+043b1bf7", + "on_device_sampling": true, + "optimum_neuron_version": "0.4.0", + "output_logits": false, + "pp_degree": 1, + "sequence_length": 4096, + "speculation_length": 0, + "start_rank_id": 0, + "target": "trn1", + "torch_dtype": "bfloat16", + "tp_degree": 8 + }, + "num_attention_heads": 40, + "num_hidden_layers": 40, + "num_key_value_heads": 8, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 1000000, + "sliding_window": null, + "tie_word_embeddings": false, + "use_cache": true, + "use_sliding_window": false, + "vocab_size": 151936 +} \ No newline at end of file diff --git a/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen3/Qwen/Qwen3-32B/541630bae1f8b14b6d79.json b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen3/Qwen/Qwen3-32B/541630bae1f8b14b6d79.json new file mode 100644 index 0000000000000000000000000000000000000000..7eec167fca523316bf875589b953278da7489813 --- /dev/null +++ b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen3/Qwen/Qwen3-32B/541630bae1f8b14b6d79.json @@ -0,0 +1,123 @@ +{ + "_entry_class": "SingleModelCacheEntry", + "_model_id": "Qwen/Qwen3-32B", + "_task": "text-generation", + "architectures": [ + "Qwen3ForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 5120, + "initializer_range": 0.02, + "intermediate_size": 25600, + "layer_types": [ + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention" + ], + "max_position_embeddings": 40960, + "max_window_layers": 64, + "model_type": "qwen3", + "neuron": { + "_serialized_key": "NxDNeuronConfig", + "batch_size": 1, + "capacity_factor": null, + "checkpoint_id": "Qwen/Qwen3-32B", + "checkpoint_revision": "9216db5781bf21249d130ec9da846c4624c16137", + "continuous_batching": false, + "enable_bucketing": false, + "ep_degree": 1, + "fused_qkv": true, + "glu_mlp": true, + "local_ranks_size": 8, + "max_batch_size": 1, + "max_context_length": 4096, + "max_topk": 256, + "n_active_tokens": 4096, + "neuronxcc_version": "2.21.18209.0+043b1bf7", + "on_device_sampling": true, + "optimum_neuron_version": "0.4.0", + "output_logits": false, + "pp_degree": 1, + "sequence_length": 4096, + "speculation_length": 0, + "start_rank_id": 0, + "target": "trn1", + "torch_dtype": "bfloat16", + "tp_degree": 8 + }, + "num_attention_heads": 64, + "num_hidden_layers": 64, + "num_key_value_heads": 8, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 1000000, + "sliding_window": null, + "tie_word_embeddings": false, + "use_cache": true, + "use_sliding_window": false, + "vocab_size": 151936 +} \ No newline at end of file diff --git a/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen3/Qwen/Qwen3-32B/83a2759427291f576035.json b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen3/Qwen/Qwen3-32B/83a2759427291f576035.json new file mode 100644 index 0000000000000000000000000000000000000000..150d444b9b17f1ae523236bad387b83bc53b1f50 --- /dev/null +++ b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen3/Qwen/Qwen3-32B/83a2759427291f576035.json @@ -0,0 +1,123 @@ +{ + "_entry_class": "SingleModelCacheEntry", + "_model_id": "Qwen/Qwen3-32B", + "_task": "text-generation", + "architectures": [ + "Qwen3ForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 5120, + "initializer_range": 0.02, + "intermediate_size": 25600, + "layer_types": [ + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention" + ], + "max_position_embeddings": 40960, + "max_window_layers": 64, + "model_type": "qwen3", + "neuron": { + "_serialized_key": "NxDNeuronConfig", + "batch_size": 16, + "capacity_factor": null, + "checkpoint_id": "Qwen/Qwen3-32B", + "checkpoint_revision": "9216db5781bf21249d130ec9da846c4624c16137", + "continuous_batching": true, + "enable_bucketing": false, + "ep_degree": 1, + "fused_qkv": true, + "glu_mlp": true, + "local_ranks_size": 8, + "max_batch_size": 16, + "max_context_length": 4096, + "max_topk": 256, + "n_active_tokens": 4096, + "neuronxcc_version": "2.21.18209.0+043b1bf7", + "on_device_sampling": true, + "optimum_neuron_version": "0.4.0", + "output_logits": false, + "pp_degree": 1, + "sequence_length": 4096, + "speculation_length": 0, + "start_rank_id": 0, + "target": "trn1", + "torch_dtype": "bfloat16", + "tp_degree": 8 + }, + "num_attention_heads": 64, + "num_hidden_layers": 64, + "num_key_value_heads": 8, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 1000000, + "sliding_window": null, + "tie_word_embeddings": false, + "use_cache": true, + "use_sliding_window": false, + "vocab_size": 151936 +} \ No newline at end of file diff --git a/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen3/Qwen/Qwen3-4B/30357cf185cc1e3e5729.json b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen3/Qwen/Qwen3-4B/30357cf185cc1e3e5729.json new file mode 100644 index 0000000000000000000000000000000000000000..568de36f8b640795c0c2660691968e0ebab368ca --- /dev/null +++ b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen3/Qwen/Qwen3-4B/30357cf185cc1e3e5729.json @@ -0,0 +1,95 @@ +{ + "_entry_class": "SingleModelCacheEntry", + "_model_id": "Qwen/Qwen3-4B", + "_task": "text-generation", + "architectures": [ + "Qwen3ForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 2560, + "initializer_range": 0.02, + "intermediate_size": 9728, + "layer_types": [ + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention" + ], + "max_position_embeddings": 40960, + "max_window_layers": 36, + "model_type": "qwen3", + "neuron": { + "_serialized_key": "NxDNeuronConfig", + "batch_size": 4, + "capacity_factor": null, + "checkpoint_id": "Qwen/Qwen3-4B", + "checkpoint_revision": "1cfa9a7208912126459214e8b04321603b3df60c", + "continuous_batching": true, + "enable_bucketing": false, + "ep_degree": 1, + "fused_qkv": true, + "glu_mlp": true, + "local_ranks_size": 2, + "max_batch_size": 4, + "max_context_length": 4096, + "max_topk": 256, + "n_active_tokens": 4096, + "neuronxcc_version": "2.21.18209.0+043b1bf7", + "on_device_sampling": false, + "optimum_neuron_version": "0.4.0", + "output_logits": false, + "pp_degree": 1, + "sequence_length": 4096, + "speculation_length": 0, + "start_rank_id": 0, + "target": "trn1", + "torch_dtype": "bfloat16", + "tp_degree": 2 + }, + "num_attention_heads": 32, + "num_hidden_layers": 36, + "num_key_value_heads": 8, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 1000000, + "sliding_window": null, + "tie_word_embeddings": true, + "use_cache": true, + "use_sliding_window": false, + "vocab_size": 151936 +} \ No newline at end of file diff --git a/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen3/Qwen/Qwen3-4B/f5834a23951a6c432322.json b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen3/Qwen/Qwen3-4B/f5834a23951a6c432322.json new file mode 100644 index 0000000000000000000000000000000000000000..a100549a8ccbfb025705d19074f50bbf493eec1d --- /dev/null +++ b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen3/Qwen/Qwen3-4B/f5834a23951a6c432322.json @@ -0,0 +1,95 @@ +{ + "_entry_class": "SingleModelCacheEntry", + "_model_id": "Qwen/Qwen3-4B", + "_task": "text-generation", + "architectures": [ + "Qwen3ForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 2560, + "initializer_range": 0.02, + "intermediate_size": 9728, + "layer_types": [ + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention" + ], + "max_position_embeddings": 40960, + "max_window_layers": 36, + "model_type": "qwen3", + "neuron": { + "_serialized_key": "NxDNeuronConfig", + "batch_size": 1, + "capacity_factor": null, + "checkpoint_id": "Qwen/Qwen3-4B", + "checkpoint_revision": "1cfa9a7208912126459214e8b04321603b3df60c", + "continuous_batching": false, + "enable_bucketing": false, + "ep_degree": 1, + "fused_qkv": true, + "glu_mlp": true, + "local_ranks_size": 2, + "max_batch_size": 1, + "max_context_length": 4096, + "max_topk": 256, + "n_active_tokens": 4096, + "neuronxcc_version": "2.21.18209.0+043b1bf7", + "on_device_sampling": true, + "optimum_neuron_version": "0.4.0", + "output_logits": false, + "pp_degree": 1, + "sequence_length": 4096, + "speculation_length": 0, + "start_rank_id": 0, + "target": "trn1", + "torch_dtype": "bfloat16", + "tp_degree": 2 + }, + "num_attention_heads": 32, + "num_hidden_layers": 36, + "num_key_value_heads": 8, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 1000000, + "sliding_window": null, + "tie_word_embeddings": true, + "use_cache": true, + "use_sliding_window": false, + "vocab_size": 151936 +} \ No newline at end of file diff --git a/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen3/Qwen/Qwen3-8B/2ed332069721e69332b5.json b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen3/Qwen/Qwen3-8B/2ed332069721e69332b5.json new file mode 100644 index 0000000000000000000000000000000000000000..81adff383ae1df100a67b08a60816005af45bd62 --- /dev/null +++ b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen3/Qwen/Qwen3-8B/2ed332069721e69332b5.json @@ -0,0 +1,95 @@ +{ + "_entry_class": "SingleModelCacheEntry", + "_model_id": "Qwen/Qwen3-8B", + "_task": "text-generation", + "architectures": [ + "Qwen3ForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 4096, + "initializer_range": 0.02, + "intermediate_size": 12288, + "layer_types": [ + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention" + ], + "max_position_embeddings": 40960, + "max_window_layers": 36, + "model_type": "qwen3", + "neuron": { + "_serialized_key": "NxDNeuronConfig", + "batch_size": 4, + "capacity_factor": null, + "checkpoint_id": "Qwen/Qwen3-8B", + "checkpoint_revision": "b968826d9c46dd6066d109eabc6255188de91218", + "continuous_batching": true, + "enable_bucketing": false, + "ep_degree": 1, + "fused_qkv": true, + "glu_mlp": true, + "local_ranks_size": 2, + "max_batch_size": 4, + "max_context_length": 4096, + "max_topk": 256, + "n_active_tokens": 4096, + "neuronxcc_version": "2.21.18209.0+043b1bf7", + "on_device_sampling": false, + "optimum_neuron_version": "0.4.0", + "output_logits": false, + "pp_degree": 1, + "sequence_length": 4096, + "speculation_length": 0, + "start_rank_id": 0, + "target": "trn1", + "torch_dtype": "bfloat16", + "tp_degree": 2 + }, + "num_attention_heads": 32, + "num_hidden_layers": 36, + "num_key_value_heads": 8, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 1000000, + "sliding_window": null, + "tie_word_embeddings": false, + "use_cache": true, + "use_sliding_window": false, + "vocab_size": 151936 +} \ No newline at end of file diff --git a/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen3/Qwen/Qwen3-8B/6553031514f6d37fb552.json b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen3/Qwen/Qwen3-8B/6553031514f6d37fb552.json new file mode 100644 index 0000000000000000000000000000000000000000..4592b2b328a72cc98103062f7738eb5a9da96a5f --- /dev/null +++ b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen3/Qwen/Qwen3-8B/6553031514f6d37fb552.json @@ -0,0 +1,95 @@ +{ + "_entry_class": "SingleModelCacheEntry", + "_model_id": "Qwen/Qwen3-8B", + "_task": "text-generation", + "architectures": [ + "Qwen3ForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 4096, + "initializer_range": 0.02, + "intermediate_size": 12288, + "layer_types": [ + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention" + ], + "max_position_embeddings": 40960, + "max_window_layers": 36, + "model_type": "qwen3", + "neuron": { + "_serialized_key": "NxDNeuronConfig", + "batch_size": 1, + "capacity_factor": null, + "checkpoint_id": "Qwen/Qwen3-8B", + "checkpoint_revision": "b968826d9c46dd6066d109eabc6255188de91218", + "continuous_batching": false, + "enable_bucketing": false, + "ep_degree": 1, + "fused_qkv": true, + "glu_mlp": true, + "local_ranks_size": 8, + "max_batch_size": 1, + "max_context_length": 4096, + "max_topk": 256, + "n_active_tokens": 4096, + "neuronxcc_version": "2.21.18209.0+043b1bf7", + "on_device_sampling": true, + "optimum_neuron_version": "0.4.0", + "output_logits": false, + "pp_degree": 1, + "sequence_length": 4096, + "speculation_length": 0, + "start_rank_id": 0, + "target": "trn1", + "torch_dtype": "bfloat16", + "tp_degree": 8 + }, + "num_attention_heads": 32, + "num_hidden_layers": 36, + "num_key_value_heads": 8, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 1000000, + "sliding_window": null, + "tie_word_embeddings": false, + "use_cache": true, + "use_sliding_window": false, + "vocab_size": 151936 +} \ No newline at end of file diff --git a/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen3/Qwen/Qwen3-8B/bab670b614844101ae82.json b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen3/Qwen/Qwen3-8B/bab670b614844101ae82.json new file mode 100644 index 0000000000000000000000000000000000000000..a0a0c6fb06502d8765b4a5755602a6637c11c6f0 --- /dev/null +++ b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen3/Qwen/Qwen3-8B/bab670b614844101ae82.json @@ -0,0 +1,95 @@ +{ + "_entry_class": "SingleModelCacheEntry", + "_model_id": "Qwen/Qwen3-8B", + "_task": "text-generation", + "architectures": [ + "Qwen3ForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 4096, + "initializer_range": 0.02, + "intermediate_size": 12288, + "layer_types": [ + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention" + ], + "max_position_embeddings": 40960, + "max_window_layers": 36, + "model_type": "qwen3", + "neuron": { + "_serialized_key": "NxDNeuronConfig", + "batch_size": 32, + "capacity_factor": null, + "checkpoint_id": "Qwen/Qwen3-8B", + "checkpoint_revision": "b968826d9c46dd6066d109eabc6255188de91218", + "continuous_batching": true, + "enable_bucketing": false, + "ep_degree": 1, + "fused_qkv": true, + "glu_mlp": true, + "local_ranks_size": 8, + "max_batch_size": 32, + "max_context_length": 4096, + "max_topk": 256, + "n_active_tokens": 4096, + "neuronxcc_version": "2.21.18209.0+043b1bf7", + "on_device_sampling": true, + "optimum_neuron_version": "0.4.0", + "output_logits": false, + "pp_degree": 1, + "sequence_length": 4096, + "speculation_length": 0, + "start_rank_id": 0, + "target": "trn1", + "torch_dtype": "bfloat16", + "tp_degree": 8 + }, + "num_attention_heads": 32, + "num_hidden_layers": 36, + "num_key_value_heads": 8, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 1000000, + "sliding_window": null, + "tie_word_embeddings": false, + "use_cache": true, + "use_sliding_window": false, + "vocab_size": 151936 +} \ No newline at end of file diff --git a/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen3/Qwen/Qwen3-8B/d7608dc7ce9ca58d8b1b.json b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen3/Qwen/Qwen3-8B/d7608dc7ce9ca58d8b1b.json new file mode 100644 index 0000000000000000000000000000000000000000..97bc62125c86016bf4a387bbb3b8beea63d072a9 --- /dev/null +++ b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen3/Qwen/Qwen3-8B/d7608dc7ce9ca58d8b1b.json @@ -0,0 +1,95 @@ +{ + "_entry_class": "SingleModelCacheEntry", + "_model_id": "Qwen/Qwen3-8B", + "_task": "text-generation", + "architectures": [ + "Qwen3ForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 4096, + "initializer_range": 0.02, + "intermediate_size": 12288, + "layer_types": [ + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention" + ], + "max_position_embeddings": 40960, + "max_window_layers": 36, + "model_type": "qwen3", + "neuron": { + "_serialized_key": "NxDNeuronConfig", + "batch_size": 1, + "capacity_factor": null, + "checkpoint_id": "Qwen/Qwen3-8B", + "checkpoint_revision": "b968826d9c46dd6066d109eabc6255188de91218", + "continuous_batching": false, + "enable_bucketing": false, + "ep_degree": 1, + "fused_qkv": true, + "glu_mlp": true, + "local_ranks_size": 2, + "max_batch_size": 1, + "max_context_length": 4096, + "max_topk": 256, + "n_active_tokens": 4096, + "neuronxcc_version": "2.21.18209.0+043b1bf7", + "on_device_sampling": true, + "optimum_neuron_version": "0.4.0", + "output_logits": false, + "pp_degree": 1, + "sequence_length": 4096, + "speculation_length": 0, + "start_rank_id": 0, + "target": "trn1", + "torch_dtype": "bfloat16", + "tp_degree": 2 + }, + "num_attention_heads": 32, + "num_hidden_layers": 36, + "num_key_value_heads": 8, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 1000000, + "sliding_window": null, + "tie_word_embeddings": false, + "use_cache": true, + "use_sliding_window": false, + "vocab_size": 151936 +} \ No newline at end of file diff --git a/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen3/Qwen/Qwen3-8B/d8a1d923e11ecee6ad54.json b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen3/Qwen/Qwen3-8B/d8a1d923e11ecee6ad54.json new file mode 100644 index 0000000000000000000000000000000000000000..2475814edde7ec44c318a24a62d894c71d208ca3 --- /dev/null +++ b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen3/Qwen/Qwen3-8B/d8a1d923e11ecee6ad54.json @@ -0,0 +1,95 @@ +{ + "_entry_class": "SingleModelCacheEntry", + "_model_id": "Qwen/Qwen3-8B", + "_task": "text-generation", + "architectures": [ + "Qwen3ForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 4096, + "initializer_range": 0.02, + "intermediate_size": 12288, + "layer_types": [ + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention" + ], + "max_position_embeddings": 40960, + "max_window_layers": 36, + "model_type": "qwen3", + "neuron": { + "_serialized_key": "NxDNeuronConfig", + "batch_size": 8, + "capacity_factor": null, + "checkpoint_id": "Qwen/Qwen3-8B", + "checkpoint_revision": "b968826d9c46dd6066d109eabc6255188de91218", + "continuous_batching": true, + "enable_bucketing": false, + "ep_degree": 1, + "fused_qkv": true, + "glu_mlp": true, + "local_ranks_size": 2, + "max_batch_size": 8, + "max_context_length": 4096, + "max_topk": 256, + "n_active_tokens": 4096, + "neuronxcc_version": "2.21.18209.0+043b1bf7", + "on_device_sampling": false, + "optimum_neuron_version": "0.4.0", + "output_logits": false, + "pp_degree": 1, + "sequence_length": 4096, + "speculation_length": 0, + "start_rank_id": 0, + "target": "trn1", + "torch_dtype": "bfloat16", + "tp_degree": 2 + }, + "num_attention_heads": 32, + "num_hidden_layers": 36, + "num_key_value_heads": 8, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 1000000, + "sliding_window": null, + "tie_word_embeddings": false, + "use_cache": true, + "use_sliding_window": false, + "vocab_size": 151936 +} \ No newline at end of file diff --git a/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen3_moe/Qwen/Qwen3-30B-A3B-Instruct-2507/045cdae3aba27431eb02.json b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen3_moe/Qwen/Qwen3-30B-A3B-Instruct-2507/045cdae3aba27431eb02.json new file mode 100644 index 0000000000000000000000000000000000000000..507d12823a28c30a7dbb7189ebc31e89818f702b --- /dev/null +++ b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen3_moe/Qwen/Qwen3-30B-A3B-Instruct-2507/045cdae3aba27431eb02.json @@ -0,0 +1,65 @@ +{ + "_entry_class": "SingleModelCacheEntry", + "_model_id": "Qwen/Qwen3-30B-A3B-Instruct-2507", + "_task": "text-generation", + "architectures": [ + "Qwen3MoeForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "decoder_sparse_step": 1, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 2048, + "initializer_range": 0.02, + "intermediate_size": 6144, + "max_position_embeddings": 262144, + "max_window_layers": 48, + "mlp_only_layers": [], + "model_type": "qwen3_moe", + "moe_intermediate_size": 768, + "neuron": { + "_serialized_key": "NxDNeuronConfig", + "batch_size": 8, + "capacity_factor": null, + "checkpoint_id": "Qwen/Qwen3-30B-A3B-Instruct-2507", + "checkpoint_revision": "0d7cf23991f47feeb3a57ecb4c9cee8ea4a17bfe", + "continuous_batching": true, + "enable_bucketing": false, + "ep_degree": 1, + "fused_qkv": false, + "glu_mlp": true, + "local_ranks_size": 16, + "max_batch_size": 8, + "max_context_length": 4096, + "max_topk": 256, + "n_active_tokens": 4096, + "neuronxcc_version": "2.21.18209.0+043b1bf7", + "on_device_sampling": true, + "optimum_neuron_version": "0.4.0", + "output_logits": false, + "pp_degree": 1, + "sequence_length": 4096, + "speculation_length": 0, + "start_rank_id": 0, + "target": "trn1", + "torch_dtype": "bfloat16", + "tp_degree": 16 + }, + "norm_topk_prob": true, + "num_attention_heads": 32, + "num_experts": 128, + "num_experts_per_tok": 8, + "num_hidden_layers": 48, + "num_key_value_heads": 4, + "output_router_logits": false, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 10000000, + "router_aux_loss_coef": 0.001, + "sliding_window": null, + "tie_word_embeddings": false, + "use_cache": true, + "use_sliding_window": false, + "vocab_size": 151936 +} \ No newline at end of file diff --git a/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen3_moe/Qwen/Qwen3-30B-A3B-Instruct-2507/3876c0cd4f09900753ef.json b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen3_moe/Qwen/Qwen3-30B-A3B-Instruct-2507/3876c0cd4f09900753ef.json new file mode 100644 index 0000000000000000000000000000000000000000..e8071581be73348ceef143f79885e4460c9e2bc8 --- /dev/null +++ b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen3_moe/Qwen/Qwen3-30B-A3B-Instruct-2507/3876c0cd4f09900753ef.json @@ -0,0 +1,65 @@ +{ + "_entry_class": "SingleModelCacheEntry", + "_model_id": "Qwen/Qwen3-30B-A3B-Instruct-2507", + "_task": "text-generation", + "architectures": [ + "Qwen3MoeForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "decoder_sparse_step": 1, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 2048, + "initializer_range": 0.02, + "intermediate_size": 6144, + "max_position_embeddings": 262144, + "max_window_layers": 48, + "mlp_only_layers": [], + "model_type": "qwen3_moe", + "moe_intermediate_size": 768, + "neuron": { + "_serialized_key": "NxDNeuronConfig", + "batch_size": 8, + "capacity_factor": null, + "checkpoint_id": "Qwen/Qwen3-30B-A3B-Instruct-2507", + "checkpoint_revision": "0d7cf23991f47feeb3a57ecb4c9cee8ea4a17bfe", + "continuous_batching": true, + "enable_bucketing": false, + "ep_degree": 1, + "fused_qkv": false, + "glu_mlp": true, + "local_ranks_size": 32, + "max_batch_size": 8, + "max_context_length": 4096, + "max_topk": 256, + "n_active_tokens": 4096, + "neuronxcc_version": "2.21.18209.0+043b1bf7", + "on_device_sampling": true, + "optimum_neuron_version": "0.4.0", + "output_logits": false, + "pp_degree": 1, + "sequence_length": 4096, + "speculation_length": 0, + "start_rank_id": 0, + "target": "trn1", + "torch_dtype": "bfloat16", + "tp_degree": 32 + }, + "norm_topk_prob": true, + "num_attention_heads": 32, + "num_experts": 128, + "num_experts_per_tok": 8, + "num_hidden_layers": 48, + "num_key_value_heads": 4, + "output_router_logits": false, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 10000000, + "router_aux_loss_coef": 0.001, + "sliding_window": null, + "tie_word_embeddings": false, + "use_cache": true, + "use_sliding_window": false, + "vocab_size": 151936 +} \ No newline at end of file diff --git a/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen3_moe/Qwen/Qwen3-30B-A3B-Instruct-2507/5d2586eb779e07e4065b.json b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen3_moe/Qwen/Qwen3-30B-A3B-Instruct-2507/5d2586eb779e07e4065b.json new file mode 100644 index 0000000000000000000000000000000000000000..141ff66ddceb876e3df35bdc8880d592e548a896 --- /dev/null +++ b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen3_moe/Qwen/Qwen3-30B-A3B-Instruct-2507/5d2586eb779e07e4065b.json @@ -0,0 +1,65 @@ +{ + "_entry_class": "SingleModelCacheEntry", + "_model_id": "Qwen/Qwen3-30B-A3B-Instruct-2507", + "_task": "text-generation", + "architectures": [ + "Qwen3MoeForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "decoder_sparse_step": 1, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 2048, + "initializer_range": 0.02, + "intermediate_size": 6144, + "max_position_embeddings": 262144, + "max_window_layers": 48, + "mlp_only_layers": [], + "model_type": "qwen3_moe", + "moe_intermediate_size": 768, + "neuron": { + "_serialized_key": "NxDNeuronConfig", + "batch_size": 1, + "capacity_factor": null, + "checkpoint_id": "Qwen/Qwen3-30B-A3B-Instruct-2507", + "checkpoint_revision": "0d7cf23991f47feeb3a57ecb4c9cee8ea4a17bfe", + "continuous_batching": false, + "enable_bucketing": false, + "ep_degree": 1, + "fused_qkv": false, + "glu_mlp": true, + "local_ranks_size": 8, + "max_batch_size": 1, + "max_context_length": 4096, + "max_topk": 256, + "n_active_tokens": 4096, + "neuronxcc_version": "2.21.18209.0+043b1bf7", + "on_device_sampling": true, + "optimum_neuron_version": "0.4.0", + "output_logits": false, + "pp_degree": 1, + "sequence_length": 4096, + "speculation_length": 0, + "start_rank_id": 0, + "target": "trn1", + "torch_dtype": "bfloat16", + "tp_degree": 8 + }, + "norm_topk_prob": true, + "num_attention_heads": 32, + "num_experts": 128, + "num_experts_per_tok": 8, + "num_hidden_layers": 48, + "num_key_value_heads": 4, + "output_router_logits": false, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 10000000, + "router_aux_loss_coef": 0.001, + "sliding_window": null, + "tie_word_embeddings": false, + "use_cache": true, + "use_sliding_window": false, + "vocab_size": 151936 +} \ No newline at end of file diff --git a/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen3_moe/Qwen/Qwen3-30B-A3B-Instruct-2507/b75f86d49dd04a575d01.json b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen3_moe/Qwen/Qwen3-30B-A3B-Instruct-2507/b75f86d49dd04a575d01.json new file mode 100644 index 0000000000000000000000000000000000000000..3a95cde862cc4af4566c69079550888c07088f1e --- /dev/null +++ b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen3_moe/Qwen/Qwen3-30B-A3B-Instruct-2507/b75f86d49dd04a575d01.json @@ -0,0 +1,65 @@ +{ + "_entry_class": "SingleModelCacheEntry", + "_model_id": "Qwen/Qwen3-30B-A3B-Instruct-2507", + "_task": "text-generation", + "architectures": [ + "Qwen3MoeForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "decoder_sparse_step": 1, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 2048, + "initializer_range": 0.02, + "intermediate_size": 6144, + "max_position_embeddings": 262144, + "max_window_layers": 48, + "mlp_only_layers": [], + "model_type": "qwen3_moe", + "moe_intermediate_size": 768, + "neuron": { + "_serialized_key": "NxDNeuronConfig", + "batch_size": 4, + "capacity_factor": null, + "checkpoint_id": "Qwen/Qwen3-30B-A3B-Instruct-2507", + "checkpoint_revision": "0d7cf23991f47feeb3a57ecb4c9cee8ea4a17bfe", + "continuous_batching": true, + "enable_bucketing": false, + "ep_degree": 1, + "fused_qkv": false, + "glu_mlp": true, + "local_ranks_size": 8, + "max_batch_size": 4, + "max_context_length": 4096, + "max_topk": 256, + "n_active_tokens": 4096, + "neuronxcc_version": "2.21.18209.0+043b1bf7", + "on_device_sampling": true, + "optimum_neuron_version": "0.4.0", + "output_logits": false, + "pp_degree": 1, + "sequence_length": 4096, + "speculation_length": 0, + "start_rank_id": 0, + "target": "trn1", + "torch_dtype": "bfloat16", + "tp_degree": 8 + }, + "norm_topk_prob": true, + "num_attention_heads": 32, + "num_experts": 128, + "num_experts_per_tok": 8, + "num_hidden_layers": 48, + "num_key_value_heads": 4, + "output_router_logits": false, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 10000000, + "router_aux_loss_coef": 0.001, + "sliding_window": null, + "tie_word_embeddings": false, + "use_cache": true, + "use_sliding_window": false, + "vocab_size": 151936 +} \ No newline at end of file diff --git a/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/smollm3/HuggingFaceTB/SmolLM3-3B/21cf9a98824cab307a5d.json b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/smollm3/HuggingFaceTB/SmolLM3-3B/21cf9a98824cab307a5d.json new file mode 100644 index 0000000000000000000000000000000000000000..dc264bd832360d6dd0475e69591fdc8be682395c --- /dev/null +++ b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/smollm3/HuggingFaceTB/SmolLM3-3B/21cf9a98824cab307a5d.json @@ -0,0 +1,134 @@ +{ + "_entry_class": "SingleModelCacheEntry", + "_model_id": "HuggingFaceTB/SmolLM3-3B", + "_task": "text-generation", + "architectures": [ + "SmolLM3ForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "hidden_act": "silu", + "hidden_size": 2048, + "initializer_range": 0.02, + "intermediate_size": 11008, + "layer_types": [ + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention" + ], + "max_position_embeddings": 65536, + "max_window_layers": 28, + "mlp_bias": false, + "model_type": "smollm3", + "neuron": { + "_serialized_key": "NxDNeuronConfig", + "batch_size": 16, + "capacity_factor": null, + "checkpoint_id": "HuggingFaceTB/SmolLM3-3B", + "checkpoint_revision": "a07cc9a04f16550a088caea529712d1d335b0ac1", + "continuous_batching": true, + "enable_bucketing": false, + "ep_degree": 1, + "fused_qkv": true, + "glu_mlp": true, + "local_ranks_size": 2, + "max_batch_size": 16, + "max_context_length": 4096, + "max_topk": 256, + "n_active_tokens": 4096, + "neuronxcc_version": "2.21.18209.0+043b1bf7", + "on_device_sampling": true, + "optimum_neuron_version": "0.4.0", + "output_logits": false, + "pp_degree": 1, + "sequence_length": 4096, + "speculation_length": 0, + "start_rank_id": 0, + "target": "trn1", + "torch_dtype": "bfloat16", + "tp_degree": 2 + }, + "no_rope_layer_interval": 4, + "no_rope_layers": [ + 1, + 1, + 1, + 0, + 1, + 1, + 1, + 0, + 1, + 1, + 1, + 0, + 1, + 1, + 1, + 0, + 1, + 1, + 1, + 0, + 1, + 1, + 1, + 0, + 1, + 1, + 1, + 0, + 1, + 1, + 1, + 0, + 1, + 1, + 1, + 0 + ], + "num_attention_heads": 16, + "num_hidden_layers": 36, + "num_key_value_heads": 4, + "pretraining_tp": 2, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 5000000.0, + "sliding_window": null, + "use_cache": false, + "use_sliding_window": false, + "vocab_size": 128256 +} \ No newline at end of file diff --git a/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/smollm3/HuggingFaceTB/SmolLM3-3B/2e1b1ec806aecfa37cf0.json b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/smollm3/HuggingFaceTB/SmolLM3-3B/2e1b1ec806aecfa37cf0.json new file mode 100644 index 0000000000000000000000000000000000000000..a2a2810d880a0e21c57374302b0dcd2dff34c7ad --- /dev/null +++ b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/smollm3/HuggingFaceTB/SmolLM3-3B/2e1b1ec806aecfa37cf0.json @@ -0,0 +1,134 @@ +{ + "_entry_class": "SingleModelCacheEntry", + "_model_id": "HuggingFaceTB/SmolLM3-3B", + "_task": "text-generation", + "architectures": [ + "SmolLM3ForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "hidden_act": "silu", + "hidden_size": 2048, + "initializer_range": 0.02, + "intermediate_size": 11008, + "layer_types": [ + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention" + ], + "max_position_embeddings": 65536, + "max_window_layers": 28, + "mlp_bias": false, + "model_type": "smollm3", + "neuron": { + "_serialized_key": "NxDNeuronConfig", + "batch_size": 1, + "capacity_factor": null, + "checkpoint_id": "HuggingFaceTB/SmolLM3-3B", + "checkpoint_revision": "a07cc9a04f16550a088caea529712d1d335b0ac1", + "continuous_batching": false, + "enable_bucketing": false, + "ep_degree": 1, + "fused_qkv": true, + "glu_mlp": true, + "local_ranks_size": 2, + "max_batch_size": 1, + "max_context_length": 4096, + "max_topk": 256, + "n_active_tokens": 4096, + "neuronxcc_version": "2.21.18209.0+043b1bf7", + "on_device_sampling": true, + "optimum_neuron_version": "0.4.0", + "output_logits": false, + "pp_degree": 1, + "sequence_length": 4096, + "speculation_length": 0, + "start_rank_id": 0, + "target": "trn1", + "torch_dtype": "bfloat16", + "tp_degree": 2 + }, + "no_rope_layer_interval": 4, + "no_rope_layers": [ + 1, + 1, + 1, + 0, + 1, + 1, + 1, + 0, + 1, + 1, + 1, + 0, + 1, + 1, + 1, + 0, + 1, + 1, + 1, + 0, + 1, + 1, + 1, + 0, + 1, + 1, + 1, + 0, + 1, + 1, + 1, + 0, + 1, + 1, + 1, + 0 + ], + "num_attention_heads": 16, + "num_hidden_layers": 36, + "num_key_value_heads": 4, + "pretraining_tp": 2, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 5000000.0, + "sliding_window": null, + "use_cache": false, + "use_sliding_window": false, + "vocab_size": 128256 +} \ No newline at end of file diff --git a/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.1.dev0/llama/unsloth/Llama-3.2-1B-Instruct/ee0b934f9d86b0c0ec63.json b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.1.dev0/llama/unsloth/Llama-3.2-1B-Instruct/ee0b934f9d86b0c0ec63.json new file mode 100644 index 0000000000000000000000000000000000000000..03b10083c6d8ecf2037c1ebd43029d8d990a46f4 --- /dev/null +++ b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.1.dev0/llama/unsloth/Llama-3.2-1B-Instruct/ee0b934f9d86b0c0ec63.json @@ -0,0 +1,63 @@ +{ + "_entry_class": "SingleModelCacheEntry", + "_model_id": "unsloth/Llama-3.2-1B-Instruct", + "_task": "text-generation", + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "head_dim": 64, + "hidden_act": "silu", + "hidden_size": 2048, + "initializer_range": 0.02, + "intermediate_size": 8192, + "max_position_embeddings": 131072, + "mlp_bias": false, + "model_type": "llama", + "neuron": { + "_serialized_key": "NxDNeuronConfig", + "batch_size": 4, + "capacity_factor": null, + "checkpoint_id": "unsloth/Llama-3.2-1B-Instruct", + "checkpoint_revision": "5a8abab4a5d6f164389b1079fb721cfab8d7126c", + "continuous_batching": true, + "enable_bucketing": false, + "ep_degree": 1, + "fused_qkv": true, + "glu_mlp": true, + "local_ranks_size": 2, + "max_batch_size": 4, + "max_context_length": 4096, + "max_topk": 256, + "n_active_tokens": 4096, + "neuronxcc_version": "2.21.18209.0+043b1bf7", + "on_device_sampling": true, + "optimum_neuron_version": "0.4.1.dev0", + "output_logits": false, + "pp_degree": 1, + "sequence_length": 4096, + "speculation_length": 0, + "start_rank_id": 0, + "target": "trn1", + "torch_dtype": "bfloat16", + "tp_degree": 2 + }, + "num_attention_heads": 32, + "num_hidden_layers": 16, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 32.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3" + }, + "rope_theta": 500000.0, + "tie_word_embeddings": true, + "unsloth_fixed": true, + "use_cache": true, + "vocab_size": 128256 +} \ No newline at end of file diff --git a/neuronxcc-2.21.18209.0+043b1bf7/MODULE_04def5b319953baacddd+a9d440f5/compile_flags.json b/neuronxcc-2.21.18209.0+043b1bf7/MODULE_04def5b319953baacddd+a9d440f5/compile_flags.json new file mode 100644 index 0000000000000000000000000000000000000000..34675115220079309bfad2d45680c478c378a3c0 --- /dev/null +++ b/neuronxcc-2.21.18209.0+043b1bf7/MODULE_04def5b319953baacddd+a9d440f5/compile_flags.json @@ -0,0 +1 @@ +["--target=trn1", "--auto-cast=none", "--model-type=transformer", "--tensorizer-options=--enable-ccop-compute-overlap --cc-pipeline-tiling-factor=2 --vectorize-strided-dma ", "-O2", "--lnc=1", "--logfile=/tmp/nxd_model/token_generation_model/_tp0_bk0/log-neuron-cc.txt", "--enable-internal-neff-wrapper"] \ No newline at end of file diff --git a/neuronxcc-2.21.18209.0+043b1bf7/MODULE_04def5b319953baacddd+a9d440f5/model.done b/neuronxcc-2.21.18209.0+043b1bf7/MODULE_04def5b319953baacddd+a9d440f5/model.done new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/neuronxcc-2.21.18209.0+043b1bf7/MODULE_04def5b319953baacddd+a9d440f5/model.hlo_module.pb b/neuronxcc-2.21.18209.0+043b1bf7/MODULE_04def5b319953baacddd+a9d440f5/model.hlo_module.pb new file mode 100644 index 0000000000000000000000000000000000000000..9c5a330b9f7fdba6bb4c837889e33f6c9e6384d0 --- /dev/null +++ b/neuronxcc-2.21.18209.0+043b1bf7/MODULE_04def5b319953baacddd+a9d440f5/model.hlo_module.pb @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2c2920e5fde77a80d2d66c9ba5addf46e53d032f533c7097cba34e31243f84ed +size 588406 diff --git a/neuronxcc-2.21.18209.0+043b1bf7/MODULE_04def5b319953baacddd+a9d440f5/model.neff b/neuronxcc-2.21.18209.0+043b1bf7/MODULE_04def5b319953baacddd+a9d440f5/model.neff new file mode 100644 index 0000000000000000000000000000000000000000..89f8d97d359d63ebd5917c0100016bc9789a9aae --- /dev/null +++ b/neuronxcc-2.21.18209.0+043b1bf7/MODULE_04def5b319953baacddd+a9d440f5/model.neff @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9306abbd1c28ffa1ebe3d952ac122ce49faf6a2a037575b24b0a818cb7d441e3 +size 1926144 diff --git a/neuronxcc-2.21.18209.0+043b1bf7/MODULE_04def5b319953baacddd+a9d440f5/wrapped_neff.hlo b/neuronxcc-2.21.18209.0+043b1bf7/MODULE_04def5b319953baacddd+a9d440f5/wrapped_neff.hlo new file mode 100644 index 0000000000000000000000000000000000000000..eaf6b0a379d86fbca5273154dc6c60ef6e31a1f8 --- /dev/null +++ b/neuronxcc-2.21.18209.0+043b1bf7/MODULE_04def5b319953baacddd+a9d440f5/wrapped_neff.hlo @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d6a9c49a56aacb28723d72d324e309cb2277db5b847d041e290b5301149a0d39 +size 2082478 diff --git a/neuronxcc-2.21.18209.0+043b1bf7/MODULE_084b0ca25510be3eae0e+ed72d204/compile_flags.json b/neuronxcc-2.21.18209.0+043b1bf7/MODULE_084b0ca25510be3eae0e+ed72d204/compile_flags.json new file mode 100644 index 0000000000000000000000000000000000000000..1569eac18fc7f34b3fe66166f2d7a4a59dbc5aa4 --- /dev/null +++ b/neuronxcc-2.21.18209.0+043b1bf7/MODULE_084b0ca25510be3eae0e+ed72d204/compile_flags.json @@ -0,0 +1 @@ +["--target=trn1", "--auto-cast=none", "--model-type=transformer", "--tensorizer-options=--enable-ccop-compute-overlap --cc-pipeline-tiling-factor=2 --vectorize-strided-dma ", "-O2", "--lnc=1", "--logfile=/tmp/nxd_model/context_encoding_model/_tp0_bk0/log-neuron-cc.txt"] \ No newline at end of file diff --git a/neuronxcc-2.21.18209.0+043b1bf7/MODULE_084b0ca25510be3eae0e+ed72d204/model.hlo_module.pb b/neuronxcc-2.21.18209.0+043b1bf7/MODULE_084b0ca25510be3eae0e+ed72d204/model.hlo_module.pb new file mode 100644 index 0000000000000000000000000000000000000000..f108a70705a4b906dcc80c8f6ce11d019f846a84 --- /dev/null +++ b/neuronxcc-2.21.18209.0+043b1bf7/MODULE_084b0ca25510be3eae0e+ed72d204/model.hlo_module.pb @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cbe936df2dd3e7c0552c88652dd566f94ae8eafb06b7049095032f9603622a0e +size 106099506 diff --git a/neuronxcc-2.21.18209.0+043b1bf7/MODULE_084b0ca25510be3eae0e+ed72d204/model.log b/neuronxcc-2.21.18209.0+043b1bf7/MODULE_084b0ca25510be3eae0e+ed72d204/model.log new file mode 100644 index 0000000000000000000000000000000000000000..3e5bcaadf078c8ee7c882c0facd807ebc0d70c80 --- /dev/null +++ b/neuronxcc-2.21.18209.0+043b1bf7/MODULE_084b0ca25510be3eae0e+ed72d204/model.log @@ -0,0 +1,3 @@ +Failed compilation with ['neuronx-cc', 'compile', '--framework=XLA', '/tmp/nxd_model/context_encoding_model/_tp0_bk0/model.MODULE_084b0ca25510be3eae0e+ed72d204.hlo_module.pb', '--output', '/tmp/nxd_model/context_encoding_model/_tp0_bk0/model.MODULE_084b0ca25510be3eae0e+ed72d204.neff', '--target=trn1', '--auto-cast=none', '--model-type=transformer', '--tensorizer-options=--enable-ccop-compute-overlap --cc-pipeline-tiling-factor=2 --vectorize-strided-dma ', '-O2', '--lnc=1', '--logfile=/tmp/nxd_model/context_encoding_model/_tp0_bk0/log-neuron-cc.txt', '--verbose=35']: [XCG815] Estimated peak HBM usage (19.849GB) exceeds 16GB. Neff might be unable to load on chip. If you believe this estimation to be inaccurate, you can disable the check using: `--internal-backend-options=' --disable-hbm-usage-check '` - Please open a support ticket at https://github.com/aws-neuron/aws-neuron-sdk/issues/new. You may also be able to obtain more information using the 'XLA_IR_DEBUG' and 'XLA_HLO_DEBUG' environment variables. +2025-10-13T11:49:46Z Non-signal exit. Backend exited with code 1 and stderr: [XCG815] Estimated peak HBM usage (19.849GB) exceeds 16GB. Neff might be unable to load on chip. If you believe this estimation to be inaccurate, you can disable the check using: `--internal-backend-options=' --disable-hbm-usage-check '` - Please open a support ticket at https://github.com/aws-neuron/aws-neuron-sdk/issues/new. You may also be able to obtain more information using the 'XLA_IR_DEBUG' and 'XLA_HLO_DEBUG' environment variables. + diff --git a/neuronxcc-2.21.18209.0+043b1bf7/MODULE_33fe5ce4a5a998514a39+a9d440f5/compile_flags.json b/neuronxcc-2.21.18209.0+043b1bf7/MODULE_33fe5ce4a5a998514a39+a9d440f5/compile_flags.json new file mode 100644 index 0000000000000000000000000000000000000000..34675115220079309bfad2d45680c478c378a3c0 --- /dev/null +++ b/neuronxcc-2.21.18209.0+043b1bf7/MODULE_33fe5ce4a5a998514a39+a9d440f5/compile_flags.json @@ -0,0 +1 @@ +["--target=trn1", "--auto-cast=none", "--model-type=transformer", "--tensorizer-options=--enable-ccop-compute-overlap --cc-pipeline-tiling-factor=2 --vectorize-strided-dma ", "-O2", "--lnc=1", "--logfile=/tmp/nxd_model/token_generation_model/_tp0_bk0/log-neuron-cc.txt", "--enable-internal-neff-wrapper"] \ No newline at end of file diff --git a/neuronxcc-2.21.18209.0+043b1bf7/MODULE_33fe5ce4a5a998514a39+a9d440f5/model.hlo_module.pb b/neuronxcc-2.21.18209.0+043b1bf7/MODULE_33fe5ce4a5a998514a39+a9d440f5/model.hlo_module.pb new file mode 100644 index 0000000000000000000000000000000000000000..a4a7f3ed36c7049e6fbe1ef8ce77d04757a2bc49 --- /dev/null +++ b/neuronxcc-2.21.18209.0+043b1bf7/MODULE_33fe5ce4a5a998514a39+a9d440f5/model.hlo_module.pb @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2bd7c29b2997482a3d5447c9d6db21b6fea798133aafea1111aafe85fc80af51 +size 2081711 diff --git a/neuronxcc-2.21.18209.0+043b1bf7/MODULE_63b08aa574a103e133be+a9d440f5/compile_flags.json b/neuronxcc-2.21.18209.0+043b1bf7/MODULE_63b08aa574a103e133be+a9d440f5/compile_flags.json new file mode 100644 index 0000000000000000000000000000000000000000..34675115220079309bfad2d45680c478c378a3c0 --- /dev/null +++ b/neuronxcc-2.21.18209.0+043b1bf7/MODULE_63b08aa574a103e133be+a9d440f5/compile_flags.json @@ -0,0 +1 @@ +["--target=trn1", "--auto-cast=none", "--model-type=transformer", "--tensorizer-options=--enable-ccop-compute-overlap --cc-pipeline-tiling-factor=2 --vectorize-strided-dma ", "-O2", "--lnc=1", "--logfile=/tmp/nxd_model/token_generation_model/_tp0_bk0/log-neuron-cc.txt", "--enable-internal-neff-wrapper"] \ No newline at end of file diff --git a/neuronxcc-2.21.18209.0+043b1bf7/MODULE_63b08aa574a103e133be+a9d440f5/model.done b/neuronxcc-2.21.18209.0+043b1bf7/MODULE_63b08aa574a103e133be+a9d440f5/model.done new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/neuronxcc-2.21.18209.0+043b1bf7/MODULE_63b08aa574a103e133be+a9d440f5/model.hlo_module.pb b/neuronxcc-2.21.18209.0+043b1bf7/MODULE_63b08aa574a103e133be+a9d440f5/model.hlo_module.pb new file mode 100644 index 0000000000000000000000000000000000000000..086d64ac71fc9bd6f8402fc5005fb1c2246c66a2 --- /dev/null +++ b/neuronxcc-2.21.18209.0+043b1bf7/MODULE_63b08aa574a103e133be+a9d440f5/model.hlo_module.pb @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:21db4a0343706c21e048588561690b2061c787f1e4456d5abdd455064df96102 +size 106127270 diff --git a/neuronxcc-2.21.18209.0+043b1bf7/MODULE_63b08aa574a103e133be+a9d440f5/model.neff b/neuronxcc-2.21.18209.0+043b1bf7/MODULE_63b08aa574a103e133be+a9d440f5/model.neff new file mode 100644 index 0000000000000000000000000000000000000000..7a87bfa1185ad326455009b6d3fa090e3018e43d --- /dev/null +++ b/neuronxcc-2.21.18209.0+043b1bf7/MODULE_63b08aa574a103e133be+a9d440f5/model.neff @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8525b10b7b1e2ee02a362c164a364ea9ac01d1f09c3b8a1862168f36a95ff0c2 +size 15197184 diff --git a/neuronxcc-2.21.18209.0+043b1bf7/MODULE_63b08aa574a103e133be+a9d440f5/wrapped_neff.hlo b/neuronxcc-2.21.18209.0+043b1bf7/MODULE_63b08aa574a103e133be+a9d440f5/wrapped_neff.hlo new file mode 100644 index 0000000000000000000000000000000000000000..15ee279ebae57d9cb04aee1c3e397507d5efe8ed --- /dev/null +++ b/neuronxcc-2.21.18209.0+043b1bf7/MODULE_63b08aa574a103e133be+a9d440f5/wrapped_neff.hlo @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d5658d3aa5ed4dd10106028e20617951b8c4515086735196b94908b3059e5752 +size 15506153 diff --git a/neuronxcc-2.21.18209.0+043b1bf7/MODULE_7585b7b81ecc283af772+a9d440f5/compile_flags.json b/neuronxcc-2.21.18209.0+043b1bf7/MODULE_7585b7b81ecc283af772+a9d440f5/compile_flags.json new file mode 100644 index 0000000000000000000000000000000000000000..34675115220079309bfad2d45680c478c378a3c0 --- /dev/null +++ b/neuronxcc-2.21.18209.0+043b1bf7/MODULE_7585b7b81ecc283af772+a9d440f5/compile_flags.json @@ -0,0 +1 @@ +["--target=trn1", "--auto-cast=none", "--model-type=transformer", "--tensorizer-options=--enable-ccop-compute-overlap --cc-pipeline-tiling-factor=2 --vectorize-strided-dma ", "-O2", "--lnc=1", "--logfile=/tmp/nxd_model/token_generation_model/_tp0_bk0/log-neuron-cc.txt", "--enable-internal-neff-wrapper"] \ No newline at end of file diff --git a/neuronxcc-2.21.18209.0+043b1bf7/MODULE_7585b7b81ecc283af772+a9d440f5/model.done b/neuronxcc-2.21.18209.0+043b1bf7/MODULE_7585b7b81ecc283af772+a9d440f5/model.done new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/neuronxcc-2.21.18209.0+043b1bf7/MODULE_7585b7b81ecc283af772+a9d440f5/model.hlo_module.pb b/neuronxcc-2.21.18209.0+043b1bf7/MODULE_7585b7b81ecc283af772+a9d440f5/model.hlo_module.pb new file mode 100644 index 0000000000000000000000000000000000000000..7258febe2b95409f1e5a100f8f12882ea7c5fe66 --- /dev/null +++ b/neuronxcc-2.21.18209.0+043b1bf7/MODULE_7585b7b81ecc283af772+a9d440f5/model.hlo_module.pb @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:32e98bb691e366aa22d2d9dd2fba85706bd818c47344c0db74a983fa34b92209 +size 103993280 diff --git a/neuronxcc-2.21.18209.0+043b1bf7/MODULE_7585b7b81ecc283af772+a9d440f5/model.neff b/neuronxcc-2.21.18209.0+043b1bf7/MODULE_7585b7b81ecc283af772+a9d440f5/model.neff new file mode 100644 index 0000000000000000000000000000000000000000..7763be4727c6c0f4757ab456536ce9374dd5476e --- /dev/null +++ b/neuronxcc-2.21.18209.0+043b1bf7/MODULE_7585b7b81ecc283af772+a9d440f5/model.neff @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7b99d837a49291f3f272568a63bffef06ad4b129790ae2cbf99c22ef528ad46f +size 9882624 diff --git a/neuronxcc-2.21.18209.0+043b1bf7/MODULE_7585b7b81ecc283af772+a9d440f5/wrapped_neff.hlo b/neuronxcc-2.21.18209.0+043b1bf7/MODULE_7585b7b81ecc283af772+a9d440f5/wrapped_neff.hlo new file mode 100644 index 0000000000000000000000000000000000000000..f9a9d7364fc6d1f7f2c1fdcfd980a73b0168076a --- /dev/null +++ b/neuronxcc-2.21.18209.0+043b1bf7/MODULE_7585b7b81ecc283af772+a9d440f5/wrapped_neff.hlo @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cb901e10625a3eb094190d4f0ba055841675a2167f630e11531a72db0a567d4a +size 10191593 diff --git a/neuronxcc-2.21.18209.0+043b1bf7/MODULE_a89678b39464c33c1815+ed72d204/compile_flags.json b/neuronxcc-2.21.18209.0+043b1bf7/MODULE_a89678b39464c33c1815+ed72d204/compile_flags.json new file mode 100644 index 0000000000000000000000000000000000000000..1569eac18fc7f34b3fe66166f2d7a4a59dbc5aa4 --- /dev/null +++ b/neuronxcc-2.21.18209.0+043b1bf7/MODULE_a89678b39464c33c1815+ed72d204/compile_flags.json @@ -0,0 +1 @@ +["--target=trn1", "--auto-cast=none", "--model-type=transformer", "--tensorizer-options=--enable-ccop-compute-overlap --cc-pipeline-tiling-factor=2 --vectorize-strided-dma ", "-O2", "--lnc=1", "--logfile=/tmp/nxd_model/context_encoding_model/_tp0_bk0/log-neuron-cc.txt"] \ No newline at end of file diff --git a/neuronxcc-2.21.18209.0+043b1bf7/MODULE_a89678b39464c33c1815+ed72d204/model.done b/neuronxcc-2.21.18209.0+043b1bf7/MODULE_a89678b39464c33c1815+ed72d204/model.done new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/neuronxcc-2.21.18209.0+043b1bf7/MODULE_a89678b39464c33c1815+ed72d204/model.hlo_module.pb b/neuronxcc-2.21.18209.0+043b1bf7/MODULE_a89678b39464c33c1815+ed72d204/model.hlo_module.pb new file mode 100644 index 0000000000000000000000000000000000000000..2d4531f98068d356b3631962f7282c3614f31d1f --- /dev/null +++ b/neuronxcc-2.21.18209.0+043b1bf7/MODULE_a89678b39464c33c1815+ed72d204/model.hlo_module.pb @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0c64a590225fcaa4a7f92ee879a63534714d921b0b7c21d8c0b215eaa21d7970 +size 739558 diff --git a/neuronxcc-2.21.18209.0+043b1bf7/MODULE_a89678b39464c33c1815+ed72d204/model.neff b/neuronxcc-2.21.18209.0+043b1bf7/MODULE_a89678b39464c33c1815+ed72d204/model.neff new file mode 100644 index 0000000000000000000000000000000000000000..152b4c885f15b08b3d745815316c535a4c9c0c9f --- /dev/null +++ b/neuronxcc-2.21.18209.0+043b1bf7/MODULE_a89678b39464c33c1815+ed72d204/model.neff @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:205be41902a46553702af4e8be043b2df8f16d5813f7f260a1386541828ca3e6 +size 26133504 diff --git a/neuronxcc-2.21.18209.0+043b1bf7/MODULE_bae931052fc7117dae12+a9d440f5/compile_flags.json b/neuronxcc-2.21.18209.0+043b1bf7/MODULE_bae931052fc7117dae12+a9d440f5/compile_flags.json new file mode 100644 index 0000000000000000000000000000000000000000..34675115220079309bfad2d45680c478c378a3c0 --- /dev/null +++ b/neuronxcc-2.21.18209.0+043b1bf7/MODULE_bae931052fc7117dae12+a9d440f5/compile_flags.json @@ -0,0 +1 @@ +["--target=trn1", "--auto-cast=none", "--model-type=transformer", "--tensorizer-options=--enable-ccop-compute-overlap --cc-pipeline-tiling-factor=2 --vectorize-strided-dma ", "-O2", "--lnc=1", "--logfile=/tmp/nxd_model/token_generation_model/_tp0_bk0/log-neuron-cc.txt", "--enable-internal-neff-wrapper"] \ No newline at end of file diff --git a/neuronxcc-2.21.18209.0+043b1bf7/MODULE_bae931052fc7117dae12+a9d440f5/model.done b/neuronxcc-2.21.18209.0+043b1bf7/MODULE_bae931052fc7117dae12+a9d440f5/model.done new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/neuronxcc-2.21.18209.0+043b1bf7/MODULE_bae931052fc7117dae12+a9d440f5/model.hlo_module.pb b/neuronxcc-2.21.18209.0+043b1bf7/MODULE_bae931052fc7117dae12+a9d440f5/model.hlo_module.pb new file mode 100644 index 0000000000000000000000000000000000000000..aa1d0c2aa8ffa044f2e2136cef2b71949c5a2064 --- /dev/null +++ b/neuronxcc-2.21.18209.0+043b1bf7/MODULE_bae931052fc7117dae12+a9d440f5/model.hlo_module.pb @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fb344c672a010536aa058ddd9a19cd7de40cb29e0b785ac541c8dd2fc630085f +size 102926602 diff --git a/neuronxcc-2.21.18209.0+043b1bf7/MODULE_bae931052fc7117dae12+a9d440f5/model.neff b/neuronxcc-2.21.18209.0+043b1bf7/MODULE_bae931052fc7117dae12+a9d440f5/model.neff new file mode 100644 index 0000000000000000000000000000000000000000..7586e4162d68dcf2172e36d759f33e377873e481 --- /dev/null +++ b/neuronxcc-2.21.18209.0+043b1bf7/MODULE_bae931052fc7117dae12+a9d440f5/model.neff @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b6e713335f2db8daa35894784cef45b53887ac954d0d59885127453500bdca3 +size 7128064 diff --git a/neuronxcc-2.21.18209.0+043b1bf7/MODULE_bae931052fc7117dae12+a9d440f5/wrapped_neff.hlo b/neuronxcc-2.21.18209.0+043b1bf7/MODULE_bae931052fc7117dae12+a9d440f5/wrapped_neff.hlo new file mode 100644 index 0000000000000000000000000000000000000000..a25f973073865b55997494336e0a2ca3d543cade --- /dev/null +++ b/neuronxcc-2.21.18209.0+043b1bf7/MODULE_bae931052fc7117dae12+a9d440f5/wrapped_neff.hlo @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:905da3ec1ebfaeb665f2806414ad6bf492f62b129781f762265293a80f8951d1 +size 7437033 diff --git a/neuronxcc-2.21.18209.0+043b1bf7/MODULE_fdf838eca836e25ae600+ed72d204/compile_flags.json b/neuronxcc-2.21.18209.0+043b1bf7/MODULE_fdf838eca836e25ae600+ed72d204/compile_flags.json new file mode 100644 index 0000000000000000000000000000000000000000..1569eac18fc7f34b3fe66166f2d7a4a59dbc5aa4 --- /dev/null +++ b/neuronxcc-2.21.18209.0+043b1bf7/MODULE_fdf838eca836e25ae600+ed72d204/compile_flags.json @@ -0,0 +1 @@ +["--target=trn1", "--auto-cast=none", "--model-type=transformer", "--tensorizer-options=--enable-ccop-compute-overlap --cc-pipeline-tiling-factor=2 --vectorize-strided-dma ", "-O2", "--lnc=1", "--logfile=/tmp/nxd_model/context_encoding_model/_tp0_bk0/log-neuron-cc.txt"] \ No newline at end of file diff --git a/neuronxcc-2.21.18209.0+043b1bf7/MODULE_fdf838eca836e25ae600+ed72d204/model.hlo_module.pb b/neuronxcc-2.21.18209.0+043b1bf7/MODULE_fdf838eca836e25ae600+ed72d204/model.hlo_module.pb new file mode 100644 index 0000000000000000000000000000000000000000..6d952a9b582eae277f6e42617728d738d45b193c --- /dev/null +++ b/neuronxcc-2.21.18209.0+043b1bf7/MODULE_fdf838eca836e25ae600+ed72d204/model.hlo_module.pb @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:114e077a57d5c1bd85c3f7cdbbe8c5da34c6e1875c9db46e4109da6a55d7f464 +size 106099506 diff --git a/neuronxcc-2.21.18209.0+043b1bf7/MODULE_fdf838eca836e25ae600+ed72d204/model.log b/neuronxcc-2.21.18209.0+043b1bf7/MODULE_fdf838eca836e25ae600+ed72d204/model.log new file mode 100644 index 0000000000000000000000000000000000000000..4d52380e33260d97b2898c44e6c936763647e0d2 --- /dev/null +++ b/neuronxcc-2.21.18209.0+043b1bf7/MODULE_fdf838eca836e25ae600+ed72d204/model.log @@ -0,0 +1,3 @@ +Failed compilation with ['neuronx-cc', 'compile', '--framework=XLA', '/tmp/nxd_model/context_encoding_model/_tp0_bk0/model.MODULE_fdf838eca836e25ae600+ed72d204.hlo_module.pb', '--output', '/tmp/nxd_model/context_encoding_model/_tp0_bk0/model.MODULE_fdf838eca836e25ae600+ed72d204.neff', '--target=trn1', '--auto-cast=none', '--model-type=transformer', '--tensorizer-options=--enable-ccop-compute-overlap --cc-pipeline-tiling-factor=2 --vectorize-strided-dma ', '-O2', '--lnc=1', '--logfile=/tmp/nxd_model/context_encoding_model/_tp0_bk0/log-neuron-cc.txt', '--verbose=35']: [XCG815] Estimated peak HBM usage (19.099GB) exceeds 16GB. Neff might be unable to load on chip. If you believe this estimation to be inaccurate, you can disable the check using: `--internal-backend-options=' --disable-hbm-usage-check '` - Please open a support ticket at https://github.com/aws-neuron/aws-neuron-sdk/issues/new. You may also be able to obtain more information using the 'XLA_IR_DEBUG' and 'XLA_HLO_DEBUG' environment variables. +2025-10-13T11:32:53Z Non-signal exit. Backend exited with code 1 and stderr: [XCG815] Estimated peak HBM usage (19.099GB) exceeds 16GB. Neff might be unable to load on chip. If you believe this estimation to be inaccurate, you can disable the check using: `--internal-backend-options=' --disable-hbm-usage-check '` - Please open a support ticket at https://github.com/aws-neuron/aws-neuron-sdk/issues/new. You may also be able to obtain more information using the 'XLA_IR_DEBUG' and 'XLA_HLO_DEBUG' environment variables. + diff --git a/neuronxcc-2.21.18209.0+043b1bf7/MODULE_ff564f7b9f2edb395631+ed72d204/compile_flags.json b/neuronxcc-2.21.18209.0+043b1bf7/MODULE_ff564f7b9f2edb395631+ed72d204/compile_flags.json new file mode 100644 index 0000000000000000000000000000000000000000..1569eac18fc7f34b3fe66166f2d7a4a59dbc5aa4 --- /dev/null +++ b/neuronxcc-2.21.18209.0+043b1bf7/MODULE_ff564f7b9f2edb395631+ed72d204/compile_flags.json @@ -0,0 +1 @@ +["--target=trn1", "--auto-cast=none", "--model-type=transformer", "--tensorizer-options=--enable-ccop-compute-overlap --cc-pipeline-tiling-factor=2 --vectorize-strided-dma ", "-O2", "--lnc=1", "--logfile=/tmp/nxd_model/context_encoding_model/_tp0_bk0/log-neuron-cc.txt"] \ No newline at end of file diff --git a/neuronxcc-2.21.18209.0+043b1bf7/MODULE_ff564f7b9f2edb395631+ed72d204/model.hlo_module.pb b/neuronxcc-2.21.18209.0+043b1bf7/MODULE_ff564f7b9f2edb395631+ed72d204/model.hlo_module.pb new file mode 100644 index 0000000000000000000000000000000000000000..a819f6ad8f42ba16fc7ef7eaad2f4b0694353034 --- /dev/null +++ b/neuronxcc-2.21.18209.0+043b1bf7/MODULE_ff564f7b9f2edb395631+ed72d204/model.hlo_module.pb @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cb1740c916e0af0675009dd8dfb70af5f8acc5e17bfd57ed1c96a30c750c671a +size 106099506 diff --git a/neuronxcc-2.21.18209.0+043b1bf7/MODULE_ff564f7b9f2edb395631+ed72d204/model.log b/neuronxcc-2.21.18209.0+043b1bf7/MODULE_ff564f7b9f2edb395631+ed72d204/model.log new file mode 100644 index 0000000000000000000000000000000000000000..fb6909878ac92c31f16cf89f79ed0f4a8ba5563b --- /dev/null +++ b/neuronxcc-2.21.18209.0+043b1bf7/MODULE_ff564f7b9f2edb395631+ed72d204/model.log @@ -0,0 +1,3 @@ +Failed compilation with ['neuronx-cc', 'compile', '--framework=XLA', '/tmp/nxd_model/context_encoding_model/_tp0_bk0/model.MODULE_ff564f7b9f2edb395631+ed72d204.hlo_module.pb', '--output', '/tmp/nxd_model/context_encoding_model/_tp0_bk0/model.MODULE_ff564f7b9f2edb395631+ed72d204.neff', '--target=trn1', '--auto-cast=none', '--model-type=transformer', '--tensorizer-options=--enable-ccop-compute-overlap --cc-pipeline-tiling-factor=2 --vectorize-strided-dma ', '-O2', '--lnc=1', '--logfile=/tmp/nxd_model/context_encoding_model/_tp0_bk0/log-neuron-cc.txt', '--verbose=35']: [XCG815] Estimated peak HBM usage (21.348GB) exceeds 16GB. Neff might be unable to load on chip. If you believe this estimation to be inaccurate, you can disable the check using: `--internal-backend-options=' --disable-hbm-usage-check '` - Please open a support ticket at https://github.com/aws-neuron/aws-neuron-sdk/issues/new. You may also be able to obtain more information using the 'XLA_IR_DEBUG' and 'XLA_HLO_DEBUG' environment variables. +2025-10-13T12:10:29Z Non-signal exit. Backend exited with code 1 and stderr: [XCG815] Estimated peak HBM usage (21.348GB) exceeds 16GB. Neff might be unable to load on chip. If you believe this estimation to be inaccurate, you can disable the check using: `--internal-backend-options=' --disable-hbm-usage-check '` - Please open a support ticket at https://github.com/aws-neuron/aws-neuron-sdk/issues/new. You may also be able to obtain more information using the 'XLA_IR_DEBUG' and 'XLA_HLO_DEBUG' environment variables. +