diff --git a/.gitattributes b/.gitattributes
index b608da52cc01e1d5e396ff911cfc2f139e5d17be..37c424a39a90e613f2739522e2a4821347a4dae2 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -5262,3 +5262,12 @@ neuronxcc-2.21.18209.0+043b1bf7/MODULE_f9260d832dabcf299e0e+877608f3/model.neff
 neuronxcc-2.21.18209.0+043b1bf7/MODULE_1b5caf61147adc2d934e+747527b0/model.neff filter=lfs diff=lfs merge=lfs -text
 neuronxcc-2.21.18209.0+043b1bf7/MODULE_1b5caf61147adc2d934e+747527b0/wrapped_neff.hlo filter=lfs diff=lfs merge=lfs -text
 neuronxcc-2.21.18209.0+043b1bf7/MODULE_58bfab3ea35f7cda10d3+877608f3/model.neff filter=lfs diff=lfs merge=lfs -text
+neuronxcc-2.21.18209.0+043b1bf7/MODULE_04def5b319953baacddd+a9d440f5/model.neff filter=lfs diff=lfs merge=lfs -text
+neuronxcc-2.21.18209.0+043b1bf7/MODULE_04def5b319953baacddd+a9d440f5/wrapped_neff.hlo filter=lfs diff=lfs merge=lfs -text
+neuronxcc-2.21.18209.0+043b1bf7/MODULE_63b08aa574a103e133be+a9d440f5/model.neff filter=lfs diff=lfs merge=lfs -text
+neuronxcc-2.21.18209.0+043b1bf7/MODULE_63b08aa574a103e133be+a9d440f5/wrapped_neff.hlo filter=lfs diff=lfs merge=lfs -text
+neuronxcc-2.21.18209.0+043b1bf7/MODULE_7585b7b81ecc283af772+a9d440f5/model.neff filter=lfs diff=lfs merge=lfs -text
+neuronxcc-2.21.18209.0+043b1bf7/MODULE_7585b7b81ecc283af772+a9d440f5/wrapped_neff.hlo filter=lfs diff=lfs merge=lfs -text
+neuronxcc-2.21.18209.0+043b1bf7/MODULE_a89678b39464c33c1815+ed72d204/model.neff filter=lfs diff=lfs merge=lfs -text
+neuronxcc-2.21.18209.0+043b1bf7/MODULE_bae931052fc7117dae12+a9d440f5/model.neff filter=lfs diff=lfs merge=lfs -text
+neuronxcc-2.21.18209.0+043b1bf7/MODULE_bae931052fc7117dae12+a9d440f5/wrapped_neff.hlo filter=lfs diff=lfs merge=lfs -text
diff --git a/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/granite/ibm-granite/granite-3.1-2b-instruct/1002e526666aa6d374df.json b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/granite/ibm-granite/granite-3.1-2b-instruct/1002e526666aa6d374df.json
new file mode 100644
index 0000000000000000000000000000000000000000..e63e9bd705eac6f2e74a5a43a22b2b53c8a04c02
--- /dev/null
+++ b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/granite/ibm-granite/granite-3.1-2b-instruct/1002e526666aa6d374df.json
@@ -0,0 +1,58 @@
+{
+  "_entry_class": "SingleModelCacheEntry",
+  "_model_id": "ibm-granite/granite-3.1-2b-instruct",
+  "_task": "text-generation",
+  "architectures": [
+    "GraniteForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.1,
+  "attention_multiplier": 0.015625,
+  "embedding_multiplier": 12.0,
+  "hidden_act": "silu",
+  "hidden_size": 2048,
+  "initializer_range": 0.02,
+  "intermediate_size": 8192,
+  "logits_scaling": 8.0,
+  "max_position_embeddings": 131072,
+  "mlp_bias": false,
+  "model_type": "granite",
+  "neuron": {
+    "_serialized_key": "NxDNeuronConfig",
+    "batch_size": 8,
+    "capacity_factor": null,
+    "checkpoint_id": "ibm-granite/granite-3.1-2b-instruct",
+    "checkpoint_revision": "bbc2aed595bd38bd770263dc3ab831db9794441d",
+    "continuous_batching": true,
+    "enable_bucketing": false,
+    "ep_degree": 1,
+    "fused_qkv": true,
+    "glu_mlp": true,
+    "local_ranks_size": 2,
+    "max_batch_size": 8,
+    "max_context_length": 4096,
+    "max_topk": 256,
+    "n_active_tokens": 4096,
+    "neuronxcc_version": "2.21.18209.0+043b1bf7",
+    "on_device_sampling": true,
+    "optimum_neuron_version": "0.4.0",
+    "output_logits": false,
+    "pp_degree": 1,
+    "sequence_length": 4096,
+    "speculation_length": 0,
+    "start_rank_id": 0,
+    "target": "trn1",
+    "torch_dtype": "bfloat16",
+    "tp_degree": 2
+  },
+  "num_attention_heads": 32,
+  "num_hidden_layers": 40,
+  "num_key_value_heads": 8,
+  "residual_multiplier": 0.22,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": null,
+  "rope_theta": 5000000.0,
+  "tie_word_embeddings": true,
+  "use_cache": true,
+  "vocab_size": 49155
+}
\ No newline at end of file
diff --git a/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/granite/ibm-granite/granite-3.1-2b-instruct/415a488f8e9bfd810f69.json b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/granite/ibm-granite/granite-3.1-2b-instruct/415a488f8e9bfd810f69.json
new file mode 100644
index 0000000000000000000000000000000000000000..51229028c94e1067132f8b5e97340014092ebcdd
--- /dev/null
+++ b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/granite/ibm-granite/granite-3.1-2b-instruct/415a488f8e9bfd810f69.json
@@ -0,0 +1,58 @@
+{
+  "_entry_class": "SingleModelCacheEntry",
+  "_model_id": "ibm-granite/granite-3.1-2b-instruct",
+  "_task": "text-generation",
+  "architectures": [
+    "GraniteForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.1,
+  "attention_multiplier": 0.015625,
+  "embedding_multiplier": 12.0,
+  "hidden_act": "silu",
+  "hidden_size": 2048,
+  "initializer_range": 0.02,
+  "intermediate_size": 8192,
+  "logits_scaling": 8.0,
+  "max_position_embeddings": 131072,
+  "mlp_bias": false,
+  "model_type": "granite",
+  "neuron": {
+    "_serialized_key": "NxDNeuronConfig",
+    "batch_size": 1,
+    "capacity_factor": null,
+    "checkpoint_id": "ibm-granite/granite-3.1-2b-instruct",
+    "checkpoint_revision": "bbc2aed595bd38bd770263dc3ab831db9794441d",
+    "continuous_batching": false,
+    "enable_bucketing": false,
+    "ep_degree": 1,
+    "fused_qkv": true,
+    "glu_mlp": true,
+    "local_ranks_size": 8,
+    "max_batch_size": 1,
+    "max_context_length": 4096,
+    "max_topk": 256,
+    "n_active_tokens": 4096,
+    "neuronxcc_version": "2.21.18209.0+043b1bf7",
+    "on_device_sampling": true,
+    "optimum_neuron_version": "0.4.0",
+    "output_logits": false,
+    "pp_degree": 1,
+    "sequence_length": 4096,
+    "speculation_length": 0,
+    "start_rank_id": 0,
+    "target": "trn1",
+    "torch_dtype": "bfloat16",
+    "tp_degree": 8
+  },
+  "num_attention_heads": 32,
+  "num_hidden_layers": 40,
+  "num_key_value_heads": 8,
+  "residual_multiplier": 0.22,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": null,
+  "rope_theta": 5000000.0,
+  "tie_word_embeddings": true,
+  "use_cache": true,
+  "vocab_size": 49155
+}
\ No newline at end of file
diff --git a/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/granite/ibm-granite/granite-3.1-2b-instruct/83949fddd59377cbb674.json b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/granite/ibm-granite/granite-3.1-2b-instruct/83949fddd59377cbb674.json
new file mode 100644
index 0000000000000000000000000000000000000000..bb3c96798046b480d0d16970d5b8ab3708ecede9
--- /dev/null
+++ b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/granite/ibm-granite/granite-3.1-2b-instruct/83949fddd59377cbb674.json
@@ -0,0 +1,58 @@
+{
+  "_entry_class": "SingleModelCacheEntry",
+  "_model_id": "ibm-granite/granite-3.1-2b-instruct",
+  "_task": "text-generation",
+  "architectures": [
+    "GraniteForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.1,
+  "attention_multiplier": 0.015625,
+  "embedding_multiplier": 12.0,
+  "hidden_act": "silu",
+  "hidden_size": 2048,
+  "initializer_range": 0.02,
+  "intermediate_size": 8192,
+  "logits_scaling": 8.0,
+  "max_position_embeddings": 131072,
+  "mlp_bias": false,
+  "model_type": "granite",
+  "neuron": {
+    "_serialized_key": "NxDNeuronConfig",
+    "batch_size": 1,
+    "capacity_factor": null,
+    "checkpoint_id": "ibm-granite/granite-3.1-2b-instruct",
+    "checkpoint_revision": "bbc2aed595bd38bd770263dc3ab831db9794441d",
+    "continuous_batching": false,
+    "enable_bucketing": false,
+    "ep_degree": 1,
+    "fused_qkv": true,
+    "glu_mlp": true,
+    "local_ranks_size": 2,
+    "max_batch_size": 1,
+    "max_context_length": 4096,
+    "max_topk": 256,
+    "n_active_tokens": 4096,
+    "neuronxcc_version": "2.21.18209.0+043b1bf7",
+    "on_device_sampling": true,
+    "optimum_neuron_version": "0.4.0",
+    "output_logits": false,
+    "pp_degree": 1,
+    "sequence_length": 4096,
+    "speculation_length": 0,
+    "start_rank_id": 0,
+    "target": "trn1",
+    "torch_dtype": "bfloat16",
+    "tp_degree": 2
+  },
+  "num_attention_heads": 32,
+  "num_hidden_layers": 40,
+  "num_key_value_heads": 8,
+  "residual_multiplier": 0.22,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": null,
+  "rope_theta": 5000000.0,
+  "tie_word_embeddings": true,
+  "use_cache": true,
+  "vocab_size": 49155
+}
\ No newline at end of file
diff --git a/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/granite/ibm-granite/granite-3.1-2b-instruct/a4e730448c44c446c2c5.json b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/granite/ibm-granite/granite-3.1-2b-instruct/a4e730448c44c446c2c5.json
new file mode 100644
index 0000000000000000000000000000000000000000..6082f47736e645aeefc73034f7f80f64b57e84d5
--- /dev/null
+++ b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/granite/ibm-granite/granite-3.1-2b-instruct/a4e730448c44c446c2c5.json
@@ -0,0 +1,58 @@
+{
+  "_entry_class": "SingleModelCacheEntry",
+  "_model_id": "ibm-granite/granite-3.1-2b-instruct",
+  "_task": "text-generation",
+  "architectures": [
+    "GraniteForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.1,
+  "attention_multiplier": 0.015625,
+  "embedding_multiplier": 12.0,
+  "hidden_act": "silu",
+  "hidden_size": 2048,
+  "initializer_range": 0.02,
+  "intermediate_size": 8192,
+  "logits_scaling": 8.0,
+  "max_position_embeddings": 131072,
+  "mlp_bias": false,
+  "model_type": "granite",
+  "neuron": {
+    "_serialized_key": "NxDNeuronConfig",
+    "batch_size": 32,
+    "capacity_factor": null,
+    "checkpoint_id": "ibm-granite/granite-3.1-2b-instruct",
+    "checkpoint_revision": "bbc2aed595bd38bd770263dc3ab831db9794441d",
+    "continuous_batching": true,
+    "enable_bucketing": false,
+    "ep_degree": 1,
+    "fused_qkv": true,
+    "glu_mlp": true,
+    "local_ranks_size": 8,
+    "max_batch_size": 32,
+    "max_context_length": 4096,
+    "max_topk": 256,
+    "n_active_tokens": 4096,
+    "neuronxcc_version": "2.21.18209.0+043b1bf7",
+    "on_device_sampling": true,
+    "optimum_neuron_version": "0.4.0",
+    "output_logits": false,
+    "pp_degree": 1,
+    "sequence_length": 4096,
+    "speculation_length": 0,
+    "start_rank_id": 0,
+    "target": "trn1",
+    "torch_dtype": "bfloat16",
+    "tp_degree": 8
+  },
+  "num_attention_heads": 32,
+  "num_hidden_layers": 40,
+  "num_key_value_heads": 8,
+  "residual_multiplier": 0.22,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": null,
+  "rope_theta": 5000000.0,
+  "tie_word_embeddings": true,
+  "use_cache": true,
+  "vocab_size": 49155
+}
\ No newline at end of file
diff --git a/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/granite/ibm-granite/granite-3.1-8b-instruct/08eeed134fa4e527271c.json b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/granite/ibm-granite/granite-3.1-8b-instruct/08eeed134fa4e527271c.json
new file mode 100644
index 0000000000000000000000000000000000000000..9252ef159c857644e6bd528d516e7d241d47b9fd
--- /dev/null
+++ b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/granite/ibm-granite/granite-3.1-8b-instruct/08eeed134fa4e527271c.json
@@ -0,0 +1,58 @@
+{
+  "_entry_class": "SingleModelCacheEntry",
+  "_model_id": "ibm-granite/granite-3.1-8b-instruct",
+  "_task": "text-generation",
+  "architectures": [
+    "GraniteForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.1,
+  "attention_multiplier": 0.0078125,
+  "embedding_multiplier": 12.0,
+  "hidden_act": "silu",
+  "hidden_size": 4096,
+  "initializer_range": 0.02,
+  "intermediate_size": 12800,
+  "logits_scaling": 16.0,
+  "max_position_embeddings": 131072,
+  "mlp_bias": false,
+  "model_type": "granite",
+  "neuron": {
+    "_serialized_key": "NxDNeuronConfig",
+    "batch_size": 4,
+    "capacity_factor": null,
+    "checkpoint_id": "ibm-granite/granite-3.1-8b-instruct",
+    "checkpoint_revision": "4009206d5fc95d2e65a7b7633e159d6e97e25d35",
+    "continuous_batching": true,
+    "enable_bucketing": false,
+    "ep_degree": 1,
+    "fused_qkv": true,
+    "glu_mlp": true,
+    "local_ranks_size": 2,
+    "max_batch_size": 4,
+    "max_context_length": 4096,
+    "max_topk": 256,
+    "n_active_tokens": 4096,
+    "neuronxcc_version": "2.21.18209.0+043b1bf7",
+    "on_device_sampling": true,
+    "optimum_neuron_version": "0.4.0",
+    "output_logits": false,
+    "pp_degree": 1,
+    "sequence_length": 4096,
+    "speculation_length": 0,
+    "start_rank_id": 0,
+    "target": "trn1",
+    "torch_dtype": "bfloat16",
+    "tp_degree": 2
+  },
+  "num_attention_heads": 32,
+  "num_hidden_layers": 40,
+  "num_key_value_heads": 8,
+  "residual_multiplier": 0.22,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": null,
+  "rope_theta": 10000000.0,
+  "tie_word_embeddings": true,
+  "use_cache": true,
+  "vocab_size": 49155
+}
\ No newline at end of file
diff --git a/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/granite/ibm-granite/granite-3.1-8b-instruct/1fe296005f1eff947583.json b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/granite/ibm-granite/granite-3.1-8b-instruct/1fe296005f1eff947583.json
new file mode 100644
index 0000000000000000000000000000000000000000..cd7840b6318be9018f4855f071ec5c9d72eff8b9
--- /dev/null
+++ b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/granite/ibm-granite/granite-3.1-8b-instruct/1fe296005f1eff947583.json
@@ -0,0 +1,58 @@
+{
+  "_entry_class": "SingleModelCacheEntry",
+  "_model_id": "ibm-granite/granite-3.1-8b-instruct",
+  "_task": "text-generation",
+  "architectures": [
+    "GraniteForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.1,
+  "attention_multiplier": 0.0078125,
+  "embedding_multiplier": 12.0,
+  "hidden_act": "silu",
+  "hidden_size": 4096,
+  "initializer_range": 0.02,
+  "intermediate_size": 12800,
+  "logits_scaling": 16.0,
+  "max_position_embeddings": 131072,
+  "mlp_bias": false,
+  "model_type": "granite",
+  "neuron": {
+    "_serialized_key": "NxDNeuronConfig",
+    "batch_size": 1,
+    "capacity_factor": null,
+    "checkpoint_id": "ibm-granite/granite-3.1-8b-instruct",
+    "checkpoint_revision": "4009206d5fc95d2e65a7b7633e159d6e97e25d35",
+    "continuous_batching": false,
+    "enable_bucketing": false,
+    "ep_degree": 1,
+    "fused_qkv": true,
+    "glu_mlp": true,
+    "local_ranks_size": 2,
+    "max_batch_size": 1,
+    "max_context_length": 4096,
+    "max_topk": 256,
+    "n_active_tokens": 4096,
+    "neuronxcc_version": "2.21.18209.0+043b1bf7",
+    "on_device_sampling": true,
+    "optimum_neuron_version": "0.4.0",
+    "output_logits": false,
+    "pp_degree": 1,
+    "sequence_length": 4096,
+    "speculation_length": 0,
+    "start_rank_id": 0,
+    "target": "trn1",
+    "torch_dtype": "bfloat16",
+    "tp_degree": 2
+  },
+  "num_attention_heads": 32,
+  "num_hidden_layers": 40,
+  "num_key_value_heads": 8,
+  "residual_multiplier": 0.22,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": null,
+  "rope_theta": 10000000.0,
+  "tie_word_embeddings": true,
+  "use_cache": true,
+  "vocab_size": 49155
+}
\ No newline at end of file
diff --git a/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/granite/ibm-granite/granite-3.1-8b-instruct/30dc5285b1aae437b520.json b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/granite/ibm-granite/granite-3.1-8b-instruct/30dc5285b1aae437b520.json
new file mode 100644
index 0000000000000000000000000000000000000000..62c747e4b0f8ed8a2f2ae5c1114631e855b1d2c3
--- /dev/null
+++ b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/granite/ibm-granite/granite-3.1-8b-instruct/30dc5285b1aae437b520.json
@@ -0,0 +1,58 @@
+{
+  "_entry_class": "SingleModelCacheEntry",
+  "_model_id": "ibm-granite/granite-3.1-8b-instruct",
+  "_task": "text-generation",
+  "architectures": [
+    "GraniteForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.1,
+  "attention_multiplier": 0.0078125,
+  "embedding_multiplier": 12.0,
+  "hidden_act": "silu",
+  "hidden_size": 4096,
+  "initializer_range": 0.02,
+  "intermediate_size": 12800,
+  "logits_scaling": 16.0,
+  "max_position_embeddings": 131072,
+  "mlp_bias": false,
+  "model_type": "granite",
+  "neuron": {
+    "_serialized_key": "NxDNeuronConfig",
+    "batch_size": 8,
+    "capacity_factor": null,
+    "checkpoint_id": "ibm-granite/granite-3.1-8b-instruct",
+    "checkpoint_revision": "4009206d5fc95d2e65a7b7633e159d6e97e25d35",
+    "continuous_batching": true,
+    "enable_bucketing": false,
+    "ep_degree": 1,
+    "fused_qkv": true,
+    "glu_mlp": true,
+    "local_ranks_size": 2,
+    "max_batch_size": 8,
+    "max_context_length": 4096,
+    "max_topk": 256,
+    "n_active_tokens": 4096,
+    "neuronxcc_version": "2.21.18209.0+043b1bf7",
+    "on_device_sampling": true,
+    "optimum_neuron_version": "0.4.0",
+    "output_logits": false,
+    "pp_degree": 1,
+    "sequence_length": 4096,
+    "speculation_length": 0,
+    "start_rank_id": 0,
+    "target": "trn1",
+    "torch_dtype": "bfloat16",
+    "tp_degree": 2
+  },
+  "num_attention_heads": 32,
+  "num_hidden_layers": 40,
+  "num_key_value_heads": 8,
+  "residual_multiplier": 0.22,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": null,
+  "rope_theta": 10000000.0,
+  "tie_word_embeddings": true,
+  "use_cache": true,
+  "vocab_size": 49155
+}
\ No newline at end of file
diff --git a/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/granite/ibm-granite/granite-3.1-8b-instruct/64c646616b24b2a8d43c.json b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/granite/ibm-granite/granite-3.1-8b-instruct/64c646616b24b2a8d43c.json
new file mode 100644
index 0000000000000000000000000000000000000000..fc40aa125eba130ec493ca7dc05a4d27cb2e7f77
--- /dev/null
+++ b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/granite/ibm-granite/granite-3.1-8b-instruct/64c646616b24b2a8d43c.json
@@ -0,0 +1,58 @@
+{
+  "_entry_class": "SingleModelCacheEntry",
+  "_model_id": "ibm-granite/granite-3.1-8b-instruct",
+  "_task": "text-generation",
+  "architectures": [
+    "GraniteForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.1,
+  "attention_multiplier": 0.0078125,
+  "embedding_multiplier": 12.0,
+  "hidden_act": "silu",
+  "hidden_size": 4096,
+  "initializer_range": 0.02,
+  "intermediate_size": 12800,
+  "logits_scaling": 16.0,
+  "max_position_embeddings": 131072,
+  "mlp_bias": false,
+  "model_type": "granite",
+  "neuron": {
+    "_serialized_key": "NxDNeuronConfig",
+    "batch_size": 1,
+    "capacity_factor": null,
+    "checkpoint_id": "ibm-granite/granite-3.1-8b-instruct",
+    "checkpoint_revision": "4009206d5fc95d2e65a7b7633e159d6e97e25d35",
+    "continuous_batching": false,
+    "enable_bucketing": false,
+    "ep_degree": 1,
+    "fused_qkv": true,
+    "glu_mlp": true,
+    "local_ranks_size": 8,
+    "max_batch_size": 1,
+    "max_context_length": 4096,
+    "max_topk": 256,
+    "n_active_tokens": 4096,
+    "neuronxcc_version": "2.21.18209.0+043b1bf7",
+    "on_device_sampling": true,
+    "optimum_neuron_version": "0.4.0",
+    "output_logits": false,
+    "pp_degree": 1,
+    "sequence_length": 4096,
+    "speculation_length": 0,
+    "start_rank_id": 0,
+    "target": "trn1",
+    "torch_dtype": "bfloat16",
+    "tp_degree": 8
+  },
+  "num_attention_heads": 32,
+  "num_hidden_layers": 40,
+  "num_key_value_heads": 8,
+  "residual_multiplier": 0.22,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": null,
+  "rope_theta": 10000000.0,
+  "tie_word_embeddings": true,
+  "use_cache": true,
+  "vocab_size": 49155
+}
\ No newline at end of file
diff --git a/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/granite/ibm-granite/granite-3.1-8b-instruct/9aed265427cc6cb86d4b.json b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/granite/ibm-granite/granite-3.1-8b-instruct/9aed265427cc6cb86d4b.json
new file mode 100644
index 0000000000000000000000000000000000000000..ea4dcbcf1d8a6f8eaf35bb0116d8d367d55323e4
--- /dev/null
+++ b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/granite/ibm-granite/granite-3.1-8b-instruct/9aed265427cc6cb86d4b.json
@@ -0,0 +1,58 @@
+{
+  "_entry_class": "SingleModelCacheEntry",
+  "_model_id": "ibm-granite/granite-3.1-8b-instruct",
+  "_task": "text-generation",
+  "architectures": [
+    "GraniteForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.1,
+  "attention_multiplier": 0.0078125,
+  "embedding_multiplier": 12.0,
+  "hidden_act": "silu",
+  "hidden_size": 4096,
+  "initializer_range": 0.02,
+  "intermediate_size": 12800,
+  "logits_scaling": 16.0,
+  "max_position_embeddings": 131072,
+  "mlp_bias": false,
+  "model_type": "granite",
+  "neuron": {
+    "_serialized_key": "NxDNeuronConfig",
+    "batch_size": 32,
+    "capacity_factor": null,
+    "checkpoint_id": "ibm-granite/granite-3.1-8b-instruct",
+    "checkpoint_revision": "4009206d5fc95d2e65a7b7633e159d6e97e25d35",
+    "continuous_batching": true,
+    "enable_bucketing": false,
+    "ep_degree": 1,
+    "fused_qkv": true,
+    "glu_mlp": true,
+    "local_ranks_size": 8,
+    "max_batch_size": 32,
+    "max_context_length": 4096,
+    "max_topk": 256,
+    "n_active_tokens": 4096,
+    "neuronxcc_version": "2.21.18209.0+043b1bf7",
+    "on_device_sampling": true,
+    "optimum_neuron_version": "0.4.0",
+    "output_logits": false,
+    "pp_degree": 1,
+    "sequence_length": 4096,
+    "speculation_length": 0,
+    "start_rank_id": 0,
+    "target": "trn1",
+    "torch_dtype": "bfloat16",
+    "tp_degree": 8
+  },
+  "num_attention_heads": 32,
+  "num_hidden_layers": 40,
+  "num_key_value_heads": 8,
+  "residual_multiplier": 0.22,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": null,
+  "rope_theta": 10000000.0,
+  "tie_word_embeddings": true,
+  "use_cache": true,
+  "vocab_size": 49155
+}
\ No newline at end of file
diff --git a/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/llama/deepseek-ai/DeepSeek-R1-Distill-Llama-8B/0740ab092d02484487fb.json b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/llama/deepseek-ai/DeepSeek-R1-Distill-Llama-8B/0740ab092d02484487fb.json
new file mode 100644
index 0000000000000000000000000000000000000000..6ac0daeead181972c85e3577821adb519e09c64a
--- /dev/null
+++ b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/llama/deepseek-ai/DeepSeek-R1-Distill-Llama-8B/0740ab092d02484487fb.json
@@ -0,0 +1,62 @@
+{
+  "_entry_class": "SingleModelCacheEntry",
+  "_model_id": "deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
+  "_task": "text-generation",
+  "architectures": [
+    "LlamaForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "head_dim": 128,
+  "hidden_act": "silu",
+  "hidden_size": 4096,
+  "initializer_range": 0.02,
+  "intermediate_size": 14336,
+  "max_position_embeddings": 131072,
+  "mlp_bias": false,
+  "model_type": "llama",
+  "neuron": {
+    "_serialized_key": "NxDNeuronConfig",
+    "batch_size": 4,
+    "capacity_factor": null,
+    "checkpoint_id": "deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
+    "checkpoint_revision": "6a6f4aa4197940add57724a7707d069478df56b1",
+    "continuous_batching": true,
+    "enable_bucketing": false,
+    "ep_degree": 1,
+    "fused_qkv": true,
+    "glu_mlp": true,
+    "local_ranks_size": 2,
+    "max_batch_size": 4,
+    "max_context_length": 4096,
+    "max_topk": 256,
+    "n_active_tokens": 4096,
+    "neuronxcc_version": "2.21.18209.0+043b1bf7",
+    "on_device_sampling": true,
+    "optimum_neuron_version": "0.4.0",
+    "output_logits": false,
+    "pp_degree": 1,
+    "sequence_length": 4096,
+    "speculation_length": 0,
+    "start_rank_id": 0,
+    "target": "trn1",
+    "torch_dtype": "bfloat16",
+    "tp_degree": 2
+  },
+  "num_attention_heads": 32,
+  "num_hidden_layers": 32,
+  "num_key_value_heads": 8,
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": {
+    "factor": 8.0,
+    "high_freq_factor": 4.0,
+    "low_freq_factor": 1.0,
+    "original_max_position_embeddings": 8192,
+    "rope_type": "llama3"
+  },
+  "rope_theta": 500000.0,
+  "tie_word_embeddings": false,
+  "use_cache": true,
+  "vocab_size": 128256
+}
\ No newline at end of file
diff --git a/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/llama/deepseek-ai/DeepSeek-R1-Distill-Llama-8B/0fc67cd324a7c1a05100.json b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/llama/deepseek-ai/DeepSeek-R1-Distill-Llama-8B/0fc67cd324a7c1a05100.json
new file mode 100644
index 0000000000000000000000000000000000000000..1b1ee958323f2a039284f6539ac869df6c375c62
--- /dev/null
+++ b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/llama/deepseek-ai/DeepSeek-R1-Distill-Llama-8B/0fc67cd324a7c1a05100.json
@@ -0,0 +1,62 @@
+{
+  "_entry_class": "SingleModelCacheEntry",
+  "_model_id": "deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
+  "_task": "text-generation",
+  "architectures": [
+    "LlamaForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "head_dim": 128,
+  "hidden_act": "silu",
+  "hidden_size": 4096,
+  "initializer_range": 0.02,
+  "intermediate_size": 14336,
+  "max_position_embeddings": 131072,
+  "mlp_bias": false,
+  "model_type": "llama",
+  "neuron": {
+    "_serialized_key": "NxDNeuronConfig",
+    "batch_size": 1,
+    "capacity_factor": null,
+    "checkpoint_id": "deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
+    "checkpoint_revision": "6a6f4aa4197940add57724a7707d069478df56b1",
+    "continuous_batching": false,
+    "enable_bucketing": false,
+    "ep_degree": 1,
+    "fused_qkv": true,
+    "glu_mlp": true,
+    "local_ranks_size": 2,
+    "max_batch_size": 1,
+    "max_context_length": 4096,
+    "max_topk": 256,
+    "n_active_tokens": 4096,
+    "neuronxcc_version": "2.21.18209.0+043b1bf7",
+    "on_device_sampling": true,
+    "optimum_neuron_version": "0.4.0",
+    "output_logits": false,
+    "pp_degree": 1,
+    "sequence_length": 4096,
+    "speculation_length": 0,
+    "start_rank_id": 0,
+    "target": "trn1",
+    "torch_dtype": "bfloat16",
+    "tp_degree": 2
+  },
+  "num_attention_heads": 32,
+  "num_hidden_layers": 32,
+  "num_key_value_heads": 8,
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": {
+    "factor": 8.0,
+    "high_freq_factor": 4.0,
+    "low_freq_factor": 1.0,
+    "original_max_position_embeddings": 8192,
+    "rope_type": "llama3"
+  },
+  "rope_theta": 500000.0,
+  "tie_word_embeddings": false,
+  "use_cache": true,
+  "vocab_size": 128256
+}
\ No newline at end of file
diff --git a/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/llama/deepseek-ai/DeepSeek-R1-Distill-Llama-8B/226856c4c5cdfd69aa89.json b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/llama/deepseek-ai/DeepSeek-R1-Distill-Llama-8B/226856c4c5cdfd69aa89.json
new file mode 100644
index 0000000000000000000000000000000000000000..1c2a6b9bed2b0977515f9dbb98dc709376eeba91
--- /dev/null
+++ b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/llama/deepseek-ai/DeepSeek-R1-Distill-Llama-8B/226856c4c5cdfd69aa89.json
@@ -0,0 +1,62 @@
+{
+  "_entry_class": "SingleModelCacheEntry",
+  "_model_id": "deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
+  "_task": "text-generation",
+  "architectures": [
+    "LlamaForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "head_dim": 128,
+  "hidden_act": "silu",
+  "hidden_size": 4096,
+  "initializer_range": 0.02,
+  "intermediate_size": 14336,
+  "max_position_embeddings": 131072,
+  "mlp_bias": false,
+  "model_type": "llama",
+  "neuron": {
+    "_serialized_key": "NxDNeuronConfig",
+    "batch_size": 32,
+    "capacity_factor": null,
+    "checkpoint_id": "deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
+    "checkpoint_revision": "6a6f4aa4197940add57724a7707d069478df56b1",
+    "continuous_batching": true,
+    "enable_bucketing": false,
+    "ep_degree": 1,
+    "fused_qkv": true,
+    "glu_mlp": true,
+    "local_ranks_size": 8,
+    "max_batch_size": 32,
+    "max_context_length": 4096,
+    "max_topk": 256,
+    "n_active_tokens": 4096,
+    "neuronxcc_version": "2.21.18209.0+043b1bf7",
+    "on_device_sampling": true,
+    "optimum_neuron_version": "0.4.0",
+    "output_logits": false,
+    "pp_degree": 1,
+    "sequence_length": 4096,
+    "speculation_length": 0,
+    "start_rank_id": 0,
+    "target": "trn1",
+    "torch_dtype": "bfloat16",
+    "tp_degree": 8
+  },
+  "num_attention_heads": 32,
+  "num_hidden_layers": 32,
+  "num_key_value_heads": 8,
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": {
+    "factor": 8.0,
+    "high_freq_factor": 4.0,
+    "low_freq_factor": 1.0,
+    "original_max_position_embeddings": 8192,
+    "rope_type": "llama3"
+  },
+  "rope_theta": 500000.0,
+  "tie_word_embeddings": false,
+  "use_cache": true,
+  "vocab_size": 128256
+}
\ No newline at end of file
diff --git a/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/llama/deepseek-ai/DeepSeek-R1-Distill-Llama-8B/341a9cc68e1b4eded838.json b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/llama/deepseek-ai/DeepSeek-R1-Distill-Llama-8B/341a9cc68e1b4eded838.json
new file mode 100644
index 0000000000000000000000000000000000000000..6cd253d8be9d9f364bad8c752cb351df659e1df6
--- /dev/null
+++ b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/llama/deepseek-ai/DeepSeek-R1-Distill-Llama-8B/341a9cc68e1b4eded838.json
@@ -0,0 +1,62 @@
+{
+  "_entry_class": "SingleModelCacheEntry",
+  "_model_id": "deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
+  "_task": "text-generation",
+  "architectures": [
+    "LlamaForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "head_dim": 128,
+  "hidden_act": "silu",
+  "hidden_size": 4096,
+  "initializer_range": 0.02,
+  "intermediate_size": 14336,
+  "max_position_embeddings": 131072,
+  "mlp_bias": false,
+  "model_type": "llama",
+  "neuron": {
+    "_serialized_key": "NxDNeuronConfig",
+    "batch_size": 8,
+    "capacity_factor": null,
+    "checkpoint_id": "deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
+    "checkpoint_revision": "6a6f4aa4197940add57724a7707d069478df56b1",
+    "continuous_batching": true,
+    "enable_bucketing": false,
+    "ep_degree": 1,
+    "fused_qkv": true,
+    "glu_mlp": true,
+    "local_ranks_size": 8,
+    "max_batch_size": 8,
+    "max_context_length": 4096,
+    "max_topk": 256,
+    "n_active_tokens": 4096,
+    "neuronxcc_version": "2.21.18209.0+043b1bf7",
+    "on_device_sampling": true,
+    "optimum_neuron_version": "0.4.0",
+    "output_logits": false,
+    "pp_degree": 1,
+    "sequence_length": 4096,
+    "speculation_length": 0,
+    "start_rank_id": 0,
+    "target": "trn1",
+    "torch_dtype": "bfloat16",
+    "tp_degree": 8
+  },
+  "num_attention_heads": 32,
+  "num_hidden_layers": 32,
+  "num_key_value_heads": 8,
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": {
+    "factor": 8.0,
+    "high_freq_factor": 4.0,
+    "low_freq_factor": 1.0,
+    "original_max_position_embeddings": 8192,
+    "rope_type": "llama3"
+  },
+  "rope_theta": 500000.0,
+  "tie_word_embeddings": false,
+  "use_cache": true,
+  "vocab_size": 128256
+}
\ No newline at end of file
diff --git a/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/llama/deepseek-ai/DeepSeek-R1-Distill-Llama-8B/3ac9f00c63887961a784.json b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/llama/deepseek-ai/DeepSeek-R1-Distill-Llama-8B/3ac9f00c63887961a784.json
new file mode 100644
index 0000000000000000000000000000000000000000..8bf08519a91196d240190ee390fb591ff33a2d62
--- /dev/null
+++ b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/llama/deepseek-ai/DeepSeek-R1-Distill-Llama-8B/3ac9f00c63887961a784.json
@@ -0,0 +1,62 @@
+{
+  "_entry_class": "SingleModelCacheEntry",
+  "_model_id": "deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
+  "_task": "text-generation",
+  "architectures": [
+    "LlamaForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "head_dim": 128,
+  "hidden_act": "silu",
+  "hidden_size": 4096,
+  "initializer_range": 0.02,
+  "intermediate_size": 14336,
+  "max_position_embeddings": 131072,
+  "mlp_bias": false,
+  "model_type": "llama",
+  "neuron": {
+    "_serialized_key": "NxDNeuronConfig",
+    "batch_size": 16,
+    "capacity_factor": null,
+    "checkpoint_id": "deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
+    "checkpoint_revision": "6a6f4aa4197940add57724a7707d069478df56b1",
+    "continuous_batching": true,
+    "enable_bucketing": false,
+    "ep_degree": 1,
+    "fused_qkv": true,
+    "glu_mlp": true,
+    "local_ranks_size": 8,
+    "max_batch_size": 16,
+    "max_context_length": 4096,
+    "max_topk": 256,
+    "n_active_tokens": 4096,
+    "neuronxcc_version": "2.21.18209.0+043b1bf7",
+    "on_device_sampling": true,
+    "optimum_neuron_version": "0.4.0",
+    "output_logits": false,
+    "pp_degree": 1,
+    "sequence_length": 4096,
+    "speculation_length": 0,
+    "start_rank_id": 0,
+    "target": "trn1",
+    "torch_dtype": "bfloat16",
+    "tp_degree": 8
+  },
+  "num_attention_heads": 32,
+  "num_hidden_layers": 32,
+  "num_key_value_heads": 8,
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": {
+    "factor": 8.0,
+    "high_freq_factor": 4.0,
+    "low_freq_factor": 1.0,
+    "original_max_position_embeddings": 8192,
+    "rope_type": "llama3"
+  },
+  "rope_theta": 500000.0,
+  "tie_word_embeddings": false,
+  "use_cache": true,
+  "vocab_size": 128256
+}
\ No newline at end of file
diff --git a/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/llama/deepseek-ai/DeepSeek-R1-Distill-Llama-8B/8d70cd76e737aaa4eaa4.json b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/llama/deepseek-ai/DeepSeek-R1-Distill-Llama-8B/8d70cd76e737aaa4eaa4.json
new file mode 100644
index 0000000000000000000000000000000000000000..6189d59a41bb4bf011838e04fdddbcb67f0f4c29
--- /dev/null
+++ b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/llama/deepseek-ai/DeepSeek-R1-Distill-Llama-8B/8d70cd76e737aaa4eaa4.json
@@ -0,0 +1,62 @@
+{
+  "_entry_class": "SingleModelCacheEntry",
+  "_model_id": "deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
+  "_task": "text-generation",
+  "architectures": [
+    "LlamaForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "head_dim": 128,
+  "hidden_act": "silu",
+  "hidden_size": 4096,
+  "initializer_range": 0.02,
+  "intermediate_size": 14336,
+  "max_position_embeddings": 131072,
+  "mlp_bias": false,
+  "model_type": "llama",
+  "neuron": {
+    "_serialized_key": "NxDNeuronConfig",
+    "batch_size": 4,
+    "capacity_factor": null,
+    "checkpoint_id": "deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
+    "checkpoint_revision": "6a6f4aa4197940add57724a7707d069478df56b1",
+    "continuous_batching": true,
+    "enable_bucketing": false,
+    "ep_degree": 1,
+    "fused_qkv": true,
+    "glu_mlp": true,
+    "local_ranks_size": 8,
+    "max_batch_size": 4,
+    "max_context_length": 4096,
+    "max_topk": 256,
+    "n_active_tokens": 4096,
+    "neuronxcc_version": "2.21.18209.0+043b1bf7",
+    "on_device_sampling": true,
+    "optimum_neuron_version": "0.4.0",
+    "output_logits": false,
+    "pp_degree": 1,
+    "sequence_length": 4096,
+    "speculation_length": 0,
+    "start_rank_id": 0,
+    "target": "trn1",
+    "torch_dtype": "bfloat16",
+    "tp_degree": 8
+  },
+  "num_attention_heads": 32,
+  "num_hidden_layers": 32,
+  "num_key_value_heads": 8,
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": {
+    "factor": 8.0,
+    "high_freq_factor": 4.0,
+    "low_freq_factor": 1.0,
+    "original_max_position_embeddings": 8192,
+    "rope_type": "llama3"
+  },
+  "rope_theta": 500000.0,
+  "tie_word_embeddings": false,
+  "use_cache": true,
+  "vocab_size": 128256
+}
\ No newline at end of file
diff --git a/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/llama/deepseek-ai/DeepSeek-R1-Distill-Llama-8B/ad6ef9f317fb8e1ab4f1.json b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/llama/deepseek-ai/DeepSeek-R1-Distill-Llama-8B/ad6ef9f317fb8e1ab4f1.json
new file mode 100644
index 0000000000000000000000000000000000000000..c556fe95e4e659edb081a73aaabb34c77a7f621f
--- /dev/null
+++ b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/llama/deepseek-ai/DeepSeek-R1-Distill-Llama-8B/ad6ef9f317fb8e1ab4f1.json
@@ -0,0 +1,62 @@
+{
+  "_entry_class": "SingleModelCacheEntry",
+  "_model_id": "deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
+  "_task": "text-generation",
+  "architectures": [
+    "LlamaForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "head_dim": 128,
+  "hidden_act": "silu",
+  "hidden_size": 4096,
+  "initializer_range": 0.02,
+  "intermediate_size": 14336,
+  "max_position_embeddings": 131072,
+  "mlp_bias": false,
+  "model_type": "llama",
+  "neuron": {
+    "_serialized_key": "NxDNeuronConfig",
+    "batch_size": 8,
+    "capacity_factor": null,
+    "checkpoint_id": "deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
+    "checkpoint_revision": "6a6f4aa4197940add57724a7707d069478df56b1",
+    "continuous_batching": true,
+    "enable_bucketing": false,
+    "ep_degree": 1,
+    "fused_qkv": true,
+    "glu_mlp": true,
+    "local_ranks_size": 2,
+    "max_batch_size": 8,
+    "max_context_length": 4096,
+    "max_topk": 256,
+    "n_active_tokens": 4096,
+    "neuronxcc_version": "2.21.18209.0+043b1bf7",
+    "on_device_sampling": true,
+    "optimum_neuron_version": "0.4.0",
+    "output_logits": false,
+    "pp_degree": 1,
+    "sequence_length": 4096,
+    "speculation_length": 0,
+    "start_rank_id": 0,
+    "target": "trn1",
+    "torch_dtype": "bfloat16",
+    "tp_degree": 2
+  },
+  "num_attention_heads": 32,
+  "num_hidden_layers": 32,
+  "num_key_value_heads": 8,
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": {
+    "factor": 8.0,
+    "high_freq_factor": 4.0,
+    "low_freq_factor": 1.0,
+    "original_max_position_embeddings": 8192,
+    "rope_type": "llama3"
+  },
+  "rope_theta": 500000.0,
+  "tie_word_embeddings": false,
+  "use_cache": true,
+  "vocab_size": 128256
+}
\ No newline at end of file
diff --git a/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/llama/lmsys/vicuna-7b-v1.5/e6eb0587815d37abaf03.json b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/llama/lmsys/vicuna-7b-v1.5/e6eb0587815d37abaf03.json
new file mode 100644
index 0000000000000000000000000000000000000000..72d10bff218df8270a23e5a3359d734325722cdf
--- /dev/null
+++ b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/llama/lmsys/vicuna-7b-v1.5/e6eb0587815d37abaf03.json
@@ -0,0 +1,56 @@
+{
+  "_entry_class": "SingleModelCacheEntry",
+  "_model_id": "lmsys/vicuna-7b-v1.5",
+  "_task": "text-generation",
+  "architectures": [
+    "LlamaForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "head_dim": 128,
+  "hidden_act": "silu",
+  "hidden_size": 4096,
+  "initializer_range": 0.02,
+  "intermediate_size": 11008,
+  "max_position_embeddings": 4096,
+  "mlp_bias": false,
+  "model_type": "llama",
+  "neuron": {
+    "_serialized_key": "NxDNeuronConfig",
+    "batch_size": 4,
+    "capacity_factor": null,
+    "checkpoint_id": "lmsys/vicuna-7b-v1.5",
+    "checkpoint_revision": "3321f76e3f527bd14065daf69dad9344000a201d",
+    "continuous_batching": true,
+    "enable_bucketing": false,
+    "ep_degree": 1,
+    "fused_qkv": true,
+    "glu_mlp": true,
+    "local_ranks_size": 2,
+    "max_batch_size": 4,
+    "max_context_length": 4096,
+    "max_topk": 256,
+    "n_active_tokens": 4096,
+    "neuronxcc_version": "2.21.18209.0+043b1bf7",
+    "on_device_sampling": true,
+    "optimum_neuron_version": "0.4.0",
+    "output_logits": false,
+    "pp_degree": 1,
+    "sequence_length": 4096,
+    "speculation_length": 0,
+    "start_rank_id": 0,
+    "target": "trn1",
+    "torch_dtype": "float16",
+    "tp_degree": 2
+  },
+  "num_attention_heads": 32,
+  "num_hidden_layers": 32,
+  "num_key_value_heads": 32,
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": null,
+  "rope_theta": 10000.0,
+  "tie_word_embeddings": false,
+  "use_cache": true,
+  "vocab_size": 32000
+}
\ No newline at end of file
diff --git a/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/llama/lmsys/vicuna-7b-v1.5/eb93c62140353ba54657.json b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/llama/lmsys/vicuna-7b-v1.5/eb93c62140353ba54657.json
new file mode 100644
index 0000000000000000000000000000000000000000..95a3fe70710290772537d010c27ed46377f23620
--- /dev/null
+++ b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/llama/lmsys/vicuna-7b-v1.5/eb93c62140353ba54657.json
@@ -0,0 +1,56 @@
+{
+  "_entry_class": "SingleModelCacheEntry",
+  "_model_id": "lmsys/vicuna-7b-v1.5",
+  "_task": "text-generation",
+  "architectures": [
+    "LlamaForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "head_dim": 128,
+  "hidden_act": "silu",
+  "hidden_size": 4096,
+  "initializer_range": 0.02,
+  "intermediate_size": 11008,
+  "max_position_embeddings": 4096,
+  "mlp_bias": false,
+  "model_type": "llama",
+  "neuron": {
+    "_serialized_key": "NxDNeuronConfig",
+    "batch_size": 1,
+    "capacity_factor": null,
+    "checkpoint_id": "lmsys/vicuna-7b-v1.5",
+    "checkpoint_revision": "3321f76e3f527bd14065daf69dad9344000a201d",
+    "continuous_batching": false,
+    "enable_bucketing": false,
+    "ep_degree": 1,
+    "fused_qkv": true,
+    "glu_mlp": true,
+    "local_ranks_size": 2,
+    "max_batch_size": 1,
+    "max_context_length": 4096,
+    "max_topk": 256,
+    "n_active_tokens": 4096,
+    "neuronxcc_version": "2.21.18209.0+043b1bf7",
+    "on_device_sampling": true,
+    "optimum_neuron_version": "0.4.0",
+    "output_logits": false,
+    "pp_degree": 1,
+    "sequence_length": 4096,
+    "speculation_length": 0,
+    "start_rank_id": 0,
+    "target": "trn1",
+    "torch_dtype": "float16",
+    "tp_degree": 2
+  },
+  "num_attention_heads": 32,
+  "num_hidden_layers": 32,
+  "num_key_value_heads": 32,
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": null,
+  "rope_theta": 10000.0,
+  "tie_word_embeddings": false,
+  "use_cache": true,
+  "vocab_size": 32000
+}
\ No newline at end of file
diff --git a/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/llama/princeton-nlp/Sheared-LLaMA-1.3B/8a8971a0da11451cb8a9.json b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/llama/princeton-nlp/Sheared-LLaMA-1.3B/8a8971a0da11451cb8a9.json
new file mode 100644
index 0000000000000000000000000000000000000000..bae7ce7d084da97a632278a591992cc31fbc44e3
--- /dev/null
+++ b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/llama/princeton-nlp/Sheared-LLaMA-1.3B/8a8971a0da11451cb8a9.json
@@ -0,0 +1,56 @@
+{
+  "_entry_class": "SingleModelCacheEntry",
+  "_model_id": "princeton-nlp/Sheared-LLaMA-1.3B",
+  "_task": "text-generation",
+  "architectures": [
+    "LlamaForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "head_dim": 128,
+  "hidden_act": "silu",
+  "hidden_size": 2048,
+  "initializer_range": 0.02,
+  "intermediate_size": 5504,
+  "max_position_embeddings": 4096,
+  "mlp_bias": false,
+  "model_type": "llama",
+  "neuron": {
+    "_serialized_key": "NxDNeuronConfig",
+    "batch_size": 4,
+    "capacity_factor": null,
+    "checkpoint_id": "princeton-nlp/Sheared-LLaMA-1.3B",
+    "checkpoint_revision": "a4b76938edbf571ea7d7d9904861cbdca08809b4",
+    "continuous_batching": true,
+    "enable_bucketing": false,
+    "ep_degree": 1,
+    "fused_qkv": true,
+    "glu_mlp": true,
+    "local_ranks_size": 2,
+    "max_batch_size": 4,
+    "max_context_length": 4096,
+    "max_topk": 256,
+    "n_active_tokens": 4096,
+    "neuronxcc_version": "2.21.18209.0+043b1bf7",
+    "on_device_sampling": true,
+    "optimum_neuron_version": "0.4.0",
+    "output_logits": false,
+    "pp_degree": 1,
+    "sequence_length": 4096,
+    "speculation_length": 0,
+    "start_rank_id": 0,
+    "target": "trn1",
+    "torch_dtype": "float16",
+    "tp_degree": 2
+  },
+  "num_attention_heads": 16,
+  "num_hidden_layers": 24,
+  "num_key_value_heads": 16,
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": null,
+  "rope_theta": 10000.0,
+  "tie_word_embeddings": false,
+  "use_cache": true,
+  "vocab_size": 32000
+}
\ No newline at end of file
diff --git a/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/llama/princeton-nlp/Sheared-LLaMA-1.3B/d0e265b870b2f9fc91c5.json b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/llama/princeton-nlp/Sheared-LLaMA-1.3B/d0e265b870b2f9fc91c5.json
new file mode 100644
index 0000000000000000000000000000000000000000..d6d04b382edef94cfa85fd62cfd83987f0154381
--- /dev/null
+++ b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/llama/princeton-nlp/Sheared-LLaMA-1.3B/d0e265b870b2f9fc91c5.json
@@ -0,0 +1,56 @@
+{
+  "_entry_class": "SingleModelCacheEntry",
+  "_model_id": "princeton-nlp/Sheared-LLaMA-1.3B",
+  "_task": "text-generation",
+  "architectures": [
+    "LlamaForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "head_dim": 128,
+  "hidden_act": "silu",
+  "hidden_size": 2048,
+  "initializer_range": 0.02,
+  "intermediate_size": 5504,
+  "max_position_embeddings": 4096,
+  "mlp_bias": false,
+  "model_type": "llama",
+  "neuron": {
+    "_serialized_key": "NxDNeuronConfig",
+    "batch_size": 1,
+    "capacity_factor": null,
+    "checkpoint_id": "princeton-nlp/Sheared-LLaMA-1.3B",
+    "checkpoint_revision": "a4b76938edbf571ea7d7d9904861cbdca08809b4",
+    "continuous_batching": false,
+    "enable_bucketing": false,
+    "ep_degree": 1,
+    "fused_qkv": true,
+    "glu_mlp": true,
+    "local_ranks_size": 2,
+    "max_batch_size": 1,
+    "max_context_length": 4096,
+    "max_topk": 256,
+    "n_active_tokens": 4096,
+    "neuronxcc_version": "2.21.18209.0+043b1bf7",
+    "on_device_sampling": true,
+    "optimum_neuron_version": "0.4.0",
+    "output_logits": false,
+    "pp_degree": 1,
+    "sequence_length": 4096,
+    "speculation_length": 0,
+    "start_rank_id": 0,
+    "target": "trn1",
+    "torch_dtype": "float16",
+    "tp_degree": 2
+  },
+  "num_attention_heads": 16,
+  "num_hidden_layers": 24,
+  "num_key_value_heads": 16,
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": null,
+  "rope_theta": 10000.0,
+  "tie_word_embeddings": false,
+  "use_cache": true,
+  "vocab_size": 32000
+}
\ No newline at end of file
diff --git a/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/llama/unsloth/Llama-3.2-1B/62b8172ee838a29e1e7f.json b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/llama/unsloth/Llama-3.2-1B/62b8172ee838a29e1e7f.json
new file mode 100644
index 0000000000000000000000000000000000000000..3e26f24e6b43f0abcd0a42529a0be8ed25e1fb5c
--- /dev/null
+++ b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/llama/unsloth/Llama-3.2-1B/62b8172ee838a29e1e7f.json
@@ -0,0 +1,63 @@
+{
+  "_entry_class": "SingleModelCacheEntry",
+  "_model_id": "unsloth/Llama-3.2-1B",
+  "_task": "text-generation",
+  "architectures": [
+    "LlamaForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "head_dim": 64,
+  "hidden_act": "silu",
+  "hidden_size": 2048,
+  "initializer_range": 0.02,
+  "intermediate_size": 8192,
+  "max_position_embeddings": 131072,
+  "mlp_bias": false,
+  "model_type": "llama",
+  "neuron": {
+    "_serialized_key": "NxDNeuronConfig",
+    "batch_size": 1,
+    "capacity_factor": null,
+    "checkpoint_id": "unsloth/Llama-3.2-1B",
+    "checkpoint_revision": "9535bd9b1d1dea6acafbdc4813b728796aeb28da",
+    "continuous_batching": false,
+    "enable_bucketing": false,
+    "ep_degree": 1,
+    "fused_qkv": true,
+    "glu_mlp": true,
+    "local_ranks_size": 2,
+    "max_batch_size": 1,
+    "max_context_length": 4096,
+    "max_topk": 256,
+    "n_active_tokens": 4096,
+    "neuronxcc_version": "2.21.18209.0+043b1bf7",
+    "on_device_sampling": true,
+    "optimum_neuron_version": "0.4.0",
+    "output_logits": false,
+    "pp_degree": 1,
+    "sequence_length": 4096,
+    "speculation_length": 0,
+    "start_rank_id": 0,
+    "target": "trn1",
+    "torch_dtype": "bfloat16",
+    "tp_degree": 2
+  },
+  "num_attention_heads": 32,
+  "num_hidden_layers": 16,
+  "num_key_value_heads": 8,
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": {
+    "factor": 32.0,
+    "high_freq_factor": 4.0,
+    "low_freq_factor": 1.0,
+    "original_max_position_embeddings": 8192,
+    "rope_type": "llama3"
+  },
+  "rope_theta": 500000.0,
+  "tie_word_embeddings": true,
+  "unsloth_fixed": true,
+  "use_cache": true,
+  "vocab_size": 128256
+}
\ No newline at end of file
diff --git a/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/llama/unsloth/Llama-3.2-1B/f19511a53b988b95bb49.json b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/llama/unsloth/Llama-3.2-1B/f19511a53b988b95bb49.json
new file mode 100644
index 0000000000000000000000000000000000000000..895a806f12520114ce0d5b43bf8c751bbf07faea
--- /dev/null
+++ b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/llama/unsloth/Llama-3.2-1B/f19511a53b988b95bb49.json
@@ -0,0 +1,63 @@
+{
+  "_entry_class": "SingleModelCacheEntry",
+  "_model_id": "unsloth/Llama-3.2-1B",
+  "_task": "text-generation",
+  "architectures": [
+    "LlamaForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "head_dim": 64,
+  "hidden_act": "silu",
+  "hidden_size": 2048,
+  "initializer_range": 0.02,
+  "intermediate_size": 8192,
+  "max_position_embeddings": 131072,
+  "mlp_bias": false,
+  "model_type": "llama",
+  "neuron": {
+    "_serialized_key": "NxDNeuronConfig",
+    "batch_size": 4,
+    "capacity_factor": null,
+    "checkpoint_id": "unsloth/Llama-3.2-1B",
+    "checkpoint_revision": "9535bd9b1d1dea6acafbdc4813b728796aeb28da",
+    "continuous_batching": true,
+    "enable_bucketing": false,
+    "ep_degree": 1,
+    "fused_qkv": true,
+    "glu_mlp": true,
+    "local_ranks_size": 2,
+    "max_batch_size": 4,
+    "max_context_length": 4096,
+    "max_topk": 256,
+    "n_active_tokens": 4096,
+    "neuronxcc_version": "2.21.18209.0+043b1bf7",
+    "on_device_sampling": true,
+    "optimum_neuron_version": "0.4.0",
+    "output_logits": false,
+    "pp_degree": 1,
+    "sequence_length": 4096,
+    "speculation_length": 0,
+    "start_rank_id": 0,
+    "target": "trn1",
+    "torch_dtype": "bfloat16",
+    "tp_degree": 2
+  },
+  "num_attention_heads": 32,
+  "num_hidden_layers": 16,
+  "num_key_value_heads": 8,
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": {
+    "factor": 32.0,
+    "high_freq_factor": 4.0,
+    "low_freq_factor": 1.0,
+    "original_max_position_embeddings": 8192,
+    "rope_type": "llama3"
+  },
+  "rope_theta": 500000.0,
+  "tie_word_embeddings": true,
+  "unsloth_fixed": true,
+  "use_cache": true,
+  "vocab_size": 128256
+}
\ No newline at end of file
diff --git a/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/llama4_text/meta-llama/Llama-4-Maverick-17B-128E-Instruct/115ac93cb9174db4e67f.json b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/llama4_text/meta-llama/Llama-4-Maverick-17B-128E-Instruct/115ac93cb9174db4e67f.json
new file mode 100644
index 0000000000000000000000000000000000000000..861553713c607dfe9a8201373e72fd3ace2c7beb
--- /dev/null
+++ b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/llama4_text/meta-llama/Llama-4-Maverick-17B-128E-Instruct/115ac93cb9174db4e67f.json
@@ -0,0 +1,190 @@
+{
+  "_entry_class": "SingleModelCacheEntry",
+  "_model_id": "meta-llama/Llama-4-Maverick-17B-128E-Instruct",
+  "_task": "text-generation",
+  "attention_bias": false,
+  "attention_chunk_size": 8192,
+  "attention_dropout": 0.0,
+  "attn_scale": 0.1,
+  "attn_temperature_tuning": true,
+  "floor_scale": 8192,
+  "for_llm_compressor": false,
+  "head_dim": 128,
+  "hidden_act": "silu",
+  "hidden_size": 5120,
+  "initializer_range": 0.02,
+  "interleave_moe_layer_step": 2,
+  "intermediate_size": 8192,
+  "intermediate_size_mlp": 16384,
+  "layer_types": [
+    "chunked_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "full_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "full_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "full_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "full_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "full_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "full_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "full_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "full_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "full_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "full_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "full_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "full_attention"
+  ],
+  "max_position_embeddings": 1048576,
+  "model_type": "llama4_text",
+  "moe_layers": [
+    1,
+    3,
+    5,
+    7,
+    9,
+    11,
+    13,
+    15,
+    17,
+    19,
+    21,
+    23,
+    25,
+    27,
+    29,
+    31,
+    33,
+    35,
+    37,
+    39,
+    41,
+    43,
+    45,
+    47
+  ],
+  "neuron": {
+    "_serialized_key": "NxDNeuronConfig",
+    "batch_size": 1,
+    "capacity_factor": null,
+    "checkpoint_id": "meta-llama/Llama-4-Maverick-17B-128E-Instruct",
+    "checkpoint_revision": "73d14711bcc77c16df3470856949c3764056b617",
+    "continuous_batching": false,
+    "enable_bucketing": false,
+    "ep_degree": 1,
+    "fused_qkv": false,
+    "glu_mlp": true,
+    "local_ranks_size": 64,
+    "max_batch_size": 1,
+    "max_context_length": 4096,
+    "max_topk": 256,
+    "n_active_tokens": 4096,
+    "neuronxcc_version": "2.21.18209.0+043b1bf7",
+    "on_device_sampling": true,
+    "optimum_neuron_version": "0.4.0",
+    "output_logits": false,
+    "pp_degree": 1,
+    "sequence_length": 4096,
+    "speculation_length": 0,
+    "start_rank_id": 0,
+    "target": "trn2",
+    "torch_dtype": "bfloat16",
+    "tp_degree": 64
+  },
+  "no_rope_layers": [
+    1,
+    1,
+    1,
+    0,
+    1,
+    1,
+    1,
+    0,
+    1,
+    1,
+    1,
+    0,
+    1,
+    1,
+    1,
+    0,
+    1,
+    1,
+    1,
+    0,
+    1,
+    1,
+    1,
+    0,
+    1,
+    1,
+    1,
+    0,
+    1,
+    1,
+    1,
+    0,
+    1,
+    1,
+    1,
+    0,
+    1,
+    1,
+    1,
+    0,
+    1,
+    1,
+    1,
+    0,
+    1,
+    1,
+    1,
+    0
+  ],
+  "num_attention_heads": 40,
+  "num_experts_per_tok": 1,
+  "num_hidden_layers": 48,
+  "num_key_value_heads": 8,
+  "num_local_experts": 128,
+  "output_router_logits": false,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": null,
+  "rope_theta": 500000.0,
+  "router_aux_loss_coef": 0.001,
+  "router_jitter_noise": 0.0,
+  "tie_word_embeddings": false,
+  "use_cache": true,
+  "use_qk_norm": false,
+  "vocab_size": 202048
+}
\ No newline at end of file
diff --git a/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/llama4_text/meta-llama/Llama-4-Maverick-17B-128E-Instruct/f70dea2be77b8d1dc8ed.json b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/llama4_text/meta-llama/Llama-4-Maverick-17B-128E-Instruct/f70dea2be77b8d1dc8ed.json
new file mode 100644
index 0000000000000000000000000000000000000000..493c86b0a85b41d82554bdd577c3966417db47cd
--- /dev/null
+++ b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/llama4_text/meta-llama/Llama-4-Maverick-17B-128E-Instruct/f70dea2be77b8d1dc8ed.json
@@ -0,0 +1,190 @@
+{
+  "_entry_class": "SingleModelCacheEntry",
+  "_model_id": "meta-llama/Llama-4-Maverick-17B-128E-Instruct",
+  "_task": "text-generation",
+  "attention_bias": false,
+  "attention_chunk_size": 8192,
+  "attention_dropout": 0.0,
+  "attn_scale": 0.1,
+  "attn_temperature_tuning": true,
+  "floor_scale": 8192,
+  "for_llm_compressor": false,
+  "head_dim": 128,
+  "hidden_act": "silu",
+  "hidden_size": 5120,
+  "initializer_range": 0.02,
+  "interleave_moe_layer_step": 2,
+  "intermediate_size": 8192,
+  "intermediate_size_mlp": 16384,
+  "layer_types": [
+    "chunked_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "full_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "full_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "full_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "full_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "full_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "full_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "full_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "full_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "full_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "full_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "full_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "full_attention"
+  ],
+  "max_position_embeddings": 1048576,
+  "model_type": "llama4_text",
+  "moe_layers": [
+    1,
+    3,
+    5,
+    7,
+    9,
+    11,
+    13,
+    15,
+    17,
+    19,
+    21,
+    23,
+    25,
+    27,
+    29,
+    31,
+    33,
+    35,
+    37,
+    39,
+    41,
+    43,
+    45,
+    47
+  ],
+  "neuron": {
+    "_serialized_key": "NxDNeuronConfig",
+    "batch_size": 4,
+    "capacity_factor": null,
+    "checkpoint_id": "meta-llama/Llama-4-Maverick-17B-128E-Instruct",
+    "checkpoint_revision": "73d14711bcc77c16df3470856949c3764056b617",
+    "continuous_batching": true,
+    "enable_bucketing": false,
+    "ep_degree": 1,
+    "fused_qkv": false,
+    "glu_mlp": true,
+    "local_ranks_size": 64,
+    "max_batch_size": 4,
+    "max_context_length": 4096,
+    "max_topk": 256,
+    "n_active_tokens": 4096,
+    "neuronxcc_version": "2.21.18209.0+043b1bf7",
+    "on_device_sampling": true,
+    "optimum_neuron_version": "0.4.0",
+    "output_logits": false,
+    "pp_degree": 1,
+    "sequence_length": 4096,
+    "speculation_length": 0,
+    "start_rank_id": 0,
+    "target": "trn2",
+    "torch_dtype": "bfloat16",
+    "tp_degree": 64
+  },
+  "no_rope_layers": [
+    1,
+    1,
+    1,
+    0,
+    1,
+    1,
+    1,
+    0,
+    1,
+    1,
+    1,
+    0,
+    1,
+    1,
+    1,
+    0,
+    1,
+    1,
+    1,
+    0,
+    1,
+    1,
+    1,
+    0,
+    1,
+    1,
+    1,
+    0,
+    1,
+    1,
+    1,
+    0,
+    1,
+    1,
+    1,
+    0,
+    1,
+    1,
+    1,
+    0,
+    1,
+    1,
+    1,
+    0,
+    1,
+    1,
+    1,
+    0
+  ],
+  "num_attention_heads": 40,
+  "num_experts_per_tok": 1,
+  "num_hidden_layers": 48,
+  "num_key_value_heads": 8,
+  "num_local_experts": 128,
+  "output_router_logits": false,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": null,
+  "rope_theta": 500000.0,
+  "router_aux_loss_coef": 0.001,
+  "router_jitter_noise": 0.0,
+  "tie_word_embeddings": false,
+  "use_cache": true,
+  "use_qk_norm": false,
+  "vocab_size": 202048
+}
\ No newline at end of file
diff --git a/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/llama4_text/meta-llama/Llama-4-Scout-17B-16E-Instruct/07656ae2a159358e76ff.json b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/llama4_text/meta-llama/Llama-4-Scout-17B-16E-Instruct/07656ae2a159358e76ff.json
new file mode 100644
index 0000000000000000000000000000000000000000..a5a95cdc17c08377cacdfba42de86303d5bbcf71
--- /dev/null
+++ b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/llama4_text/meta-llama/Llama-4-Scout-17B-16E-Instruct/07656ae2a159358e76ff.json
@@ -0,0 +1,220 @@
+{
+  "_entry_class": "SingleModelCacheEntry",
+  "_model_id": "meta-llama/Llama-4-Scout-17B-16E-Instruct",
+  "_task": "text-generation",
+  "attention_bias": false,
+  "attention_chunk_size": 8192,
+  "attention_dropout": 0.0,
+  "attn_scale": 0.1,
+  "attn_temperature_tuning": true,
+  "floor_scale": 8192,
+  "for_llm_compressor": false,
+  "head_dim": 128,
+  "hidden_act": "silu",
+  "hidden_size": 5120,
+  "initializer_range": 0.02,
+  "interleave_moe_layer_step": 1,
+  "intermediate_size": 8192,
+  "intermediate_size_mlp": 16384,
+  "layer_types": [
+    "chunked_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "full_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "full_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "full_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "full_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "full_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "full_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "full_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "full_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "full_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "full_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "full_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "full_attention"
+  ],
+  "max_position_embeddings": 10485760,
+  "model_type": "llama4_text",
+  "moe_layers": [
+    0,
+    1,
+    2,
+    3,
+    4,
+    5,
+    6,
+    7,
+    8,
+    9,
+    10,
+    11,
+    12,
+    13,
+    14,
+    15,
+    16,
+    17,
+    18,
+    19,
+    20,
+    21,
+    22,
+    23,
+    24,
+    25,
+    26,
+    27,
+    28,
+    29,
+    30,
+    31,
+    32,
+    33,
+    34,
+    35,
+    36,
+    37,
+    38,
+    39,
+    40,
+    41,
+    42,
+    43,
+    44,
+    45,
+    46,
+    47
+  ],
+  "neuron": {
+    "_serialized_key": "NxDNeuronConfig",
+    "batch_size": 4,
+    "capacity_factor": null,
+    "checkpoint_id": "meta-llama/Llama-4-Scout-17B-16E-Instruct",
+    "checkpoint_revision": "92f3b1597a195b523d8d9e5700e57e4fbb8f20d3",
+    "continuous_batching": true,
+    "enable_bucketing": false,
+    "ep_degree": 1,
+    "fused_qkv": false,
+    "glu_mlp": true,
+    "local_ranks_size": 32,
+    "max_batch_size": 4,
+    "max_context_length": 4096,
+    "max_topk": 256,
+    "n_active_tokens": 4096,
+    "neuronxcc_version": "2.21.18209.0+043b1bf7",
+    "on_device_sampling": true,
+    "optimum_neuron_version": "0.4.0",
+    "output_logits": false,
+    "pp_degree": 1,
+    "sequence_length": 4096,
+    "speculation_length": 0,
+    "start_rank_id": 0,
+    "target": "trn1",
+    "torch_dtype": "bfloat16",
+    "tp_degree": 32
+  },
+  "no_rope_layers": [
+    1,
+    1,
+    1,
+    0,
+    1,
+    1,
+    1,
+    0,
+    1,
+    1,
+    1,
+    0,
+    1,
+    1,
+    1,
+    0,
+    1,
+    1,
+    1,
+    0,
+    1,
+    1,
+    1,
+    0,
+    1,
+    1,
+    1,
+    0,
+    1,
+    1,
+    1,
+    0,
+    1,
+    1,
+    1,
+    0,
+    1,
+    1,
+    1,
+    0,
+    1,
+    1,
+    1,
+    0,
+    1,
+    1,
+    1,
+    0
+  ],
+  "num_attention_heads": 40,
+  "num_experts_per_tok": 1,
+  "num_hidden_layers": 48,
+  "num_key_value_heads": 8,
+  "num_local_experts": 16,
+  "output_router_logits": false,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": {
+    "factor": 16.0,
+    "high_freq_factor": 1.0,
+    "low_freq_factor": 1.0,
+    "original_max_position_embeddings": 8192,
+    "rope_type": "llama3"
+  },
+  "rope_theta": 500000.0,
+  "router_aux_loss_coef": 0.001,
+  "router_jitter_noise": 0.0,
+  "tie_word_embeddings": false,
+  "use_cache": true,
+  "use_qk_norm": true,
+  "vocab_size": 202048
+}
\ No newline at end of file
diff --git a/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/llama4_text/meta-llama/Llama-4-Scout-17B-16E-Instruct/0b9d19926bec30ac4419.json b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/llama4_text/meta-llama/Llama-4-Scout-17B-16E-Instruct/0b9d19926bec30ac4419.json
new file mode 100644
index 0000000000000000000000000000000000000000..81ae6148b26eef1b5cc187dde3d1cf5dd50b9365
--- /dev/null
+++ b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/llama4_text/meta-llama/Llama-4-Scout-17B-16E-Instruct/0b9d19926bec30ac4419.json
@@ -0,0 +1,220 @@
+{
+  "_entry_class": "SingleModelCacheEntry",
+  "_model_id": "meta-llama/Llama-4-Scout-17B-16E-Instruct",
+  "_task": "text-generation",
+  "attention_bias": false,
+  "attention_chunk_size": 8192,
+  "attention_dropout": 0.0,
+  "attn_scale": 0.1,
+  "attn_temperature_tuning": true,
+  "floor_scale": 8192,
+  "for_llm_compressor": false,
+  "head_dim": 128,
+  "hidden_act": "silu",
+  "hidden_size": 5120,
+  "initializer_range": 0.02,
+  "interleave_moe_layer_step": 1,
+  "intermediate_size": 8192,
+  "intermediate_size_mlp": 16384,
+  "layer_types": [
+    "chunked_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "full_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "full_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "full_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "full_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "full_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "full_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "full_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "full_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "full_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "full_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "full_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "full_attention"
+  ],
+  "max_position_embeddings": 10485760,
+  "model_type": "llama4_text",
+  "moe_layers": [
+    0,
+    1,
+    2,
+    3,
+    4,
+    5,
+    6,
+    7,
+    8,
+    9,
+    10,
+    11,
+    12,
+    13,
+    14,
+    15,
+    16,
+    17,
+    18,
+    19,
+    20,
+    21,
+    22,
+    23,
+    24,
+    25,
+    26,
+    27,
+    28,
+    29,
+    30,
+    31,
+    32,
+    33,
+    34,
+    35,
+    36,
+    37,
+    38,
+    39,
+    40,
+    41,
+    42,
+    43,
+    44,
+    45,
+    46,
+    47
+  ],
+  "neuron": {
+    "_serialized_key": "NxDNeuronConfig",
+    "batch_size": 1,
+    "capacity_factor": null,
+    "checkpoint_id": "meta-llama/Llama-4-Scout-17B-16E-Instruct",
+    "checkpoint_revision": "92f3b1597a195b523d8d9e5700e57e4fbb8f20d3",
+    "continuous_batching": false,
+    "enable_bucketing": false,
+    "ep_degree": 1,
+    "fused_qkv": false,
+    "glu_mlp": true,
+    "local_ranks_size": 32,
+    "max_batch_size": 1,
+    "max_context_length": 4096,
+    "max_topk": 256,
+    "n_active_tokens": 4096,
+    "neuronxcc_version": "2.21.18209.0+043b1bf7",
+    "on_device_sampling": true,
+    "optimum_neuron_version": "0.4.0",
+    "output_logits": false,
+    "pp_degree": 1,
+    "sequence_length": 4096,
+    "speculation_length": 0,
+    "start_rank_id": 0,
+    "target": "trn1",
+    "torch_dtype": "bfloat16",
+    "tp_degree": 32
+  },
+  "no_rope_layers": [
+    1,
+    1,
+    1,
+    0,
+    1,
+    1,
+    1,
+    0,
+    1,
+    1,
+    1,
+    0,
+    1,
+    1,
+    1,
+    0,
+    1,
+    1,
+    1,
+    0,
+    1,
+    1,
+    1,
+    0,
+    1,
+    1,
+    1,
+    0,
+    1,
+    1,
+    1,
+    0,
+    1,
+    1,
+    1,
+    0,
+    1,
+    1,
+    1,
+    0,
+    1,
+    1,
+    1,
+    0,
+    1,
+    1,
+    1,
+    0
+  ],
+  "num_attention_heads": 40,
+  "num_experts_per_tok": 1,
+  "num_hidden_layers": 48,
+  "num_key_value_heads": 8,
+  "num_local_experts": 16,
+  "output_router_logits": false,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": {
+    "factor": 16.0,
+    "high_freq_factor": 1.0,
+    "low_freq_factor": 1.0,
+    "original_max_position_embeddings": 8192,
+    "rope_type": "llama3"
+  },
+  "rope_theta": 500000.0,
+  "router_aux_loss_coef": 0.001,
+  "router_jitter_noise": 0.0,
+  "tie_word_embeddings": false,
+  "use_cache": true,
+  "use_qk_norm": true,
+  "vocab_size": 202048
+}
\ No newline at end of file
diff --git a/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/llama4_text/meta-llama/Llama-4-Scout-17B-16E-Instruct/236b23417ad1c79fbb5f.json b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/llama4_text/meta-llama/Llama-4-Scout-17B-16E-Instruct/236b23417ad1c79fbb5f.json
new file mode 100644
index 0000000000000000000000000000000000000000..760ef03fab37a08b8b4e9581c03feedb905f3291
--- /dev/null
+++ b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/llama4_text/meta-llama/Llama-4-Scout-17B-16E-Instruct/236b23417ad1c79fbb5f.json
@@ -0,0 +1,220 @@
+{
+  "_entry_class": "SingleModelCacheEntry",
+  "_model_id": "meta-llama/Llama-4-Scout-17B-16E-Instruct",
+  "_task": "text-generation",
+  "attention_bias": false,
+  "attention_chunk_size": 8192,
+  "attention_dropout": 0.0,
+  "attn_scale": 0.1,
+  "attn_temperature_tuning": true,
+  "floor_scale": 8192,
+  "for_llm_compressor": false,
+  "head_dim": 128,
+  "hidden_act": "silu",
+  "hidden_size": 5120,
+  "initializer_range": 0.02,
+  "interleave_moe_layer_step": 1,
+  "intermediate_size": 8192,
+  "intermediate_size_mlp": 16384,
+  "layer_types": [
+    "chunked_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "full_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "full_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "full_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "full_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "full_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "full_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "full_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "full_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "full_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "full_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "full_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "full_attention"
+  ],
+  "max_position_embeddings": 10485760,
+  "model_type": "llama4_text",
+  "moe_layers": [
+    0,
+    1,
+    2,
+    3,
+    4,
+    5,
+    6,
+    7,
+    8,
+    9,
+    10,
+    11,
+    12,
+    13,
+    14,
+    15,
+    16,
+    17,
+    18,
+    19,
+    20,
+    21,
+    22,
+    23,
+    24,
+    25,
+    26,
+    27,
+    28,
+    29,
+    30,
+    31,
+    32,
+    33,
+    34,
+    35,
+    36,
+    37,
+    38,
+    39,
+    40,
+    41,
+    42,
+    43,
+    44,
+    45,
+    46,
+    47
+  ],
+  "neuron": {
+    "_serialized_key": "NxDNeuronConfig",
+    "batch_size": 8,
+    "capacity_factor": null,
+    "checkpoint_id": "meta-llama/Llama-4-Scout-17B-16E-Instruct",
+    "checkpoint_revision": "92f3b1597a195b523d8d9e5700e57e4fbb8f20d3",
+    "continuous_batching": true,
+    "enable_bucketing": false,
+    "ep_degree": 1,
+    "fused_qkv": false,
+    "glu_mlp": true,
+    "local_ranks_size": 32,
+    "max_batch_size": 8,
+    "max_context_length": 4096,
+    "max_topk": 256,
+    "n_active_tokens": 4096,
+    "neuronxcc_version": "2.21.18209.0+043b1bf7",
+    "on_device_sampling": true,
+    "optimum_neuron_version": "0.4.0",
+    "output_logits": false,
+    "pp_degree": 1,
+    "sequence_length": 4096,
+    "speculation_length": 0,
+    "start_rank_id": 0,
+    "target": "trn2",
+    "torch_dtype": "bfloat16",
+    "tp_degree": 32
+  },
+  "no_rope_layers": [
+    1,
+    1,
+    1,
+    0,
+    1,
+    1,
+    1,
+    0,
+    1,
+    1,
+    1,
+    0,
+    1,
+    1,
+    1,
+    0,
+    1,
+    1,
+    1,
+    0,
+    1,
+    1,
+    1,
+    0,
+    1,
+    1,
+    1,
+    0,
+    1,
+    1,
+    1,
+    0,
+    1,
+    1,
+    1,
+    0,
+    1,
+    1,
+    1,
+    0,
+    1,
+    1,
+    1,
+    0,
+    1,
+    1,
+    1,
+    0
+  ],
+  "num_attention_heads": 40,
+  "num_experts_per_tok": 1,
+  "num_hidden_layers": 48,
+  "num_key_value_heads": 8,
+  "num_local_experts": 16,
+  "output_router_logits": false,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": {
+    "factor": 16.0,
+    "high_freq_factor": 1.0,
+    "low_freq_factor": 1.0,
+    "original_max_position_embeddings": 8192,
+    "rope_type": "llama3"
+  },
+  "rope_theta": 500000.0,
+  "router_aux_loss_coef": 0.001,
+  "router_jitter_noise": 0.0,
+  "tie_word_embeddings": false,
+  "use_cache": true,
+  "use_qk_norm": true,
+  "vocab_size": 202048
+}
\ No newline at end of file
diff --git a/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/llama4_text/meta-llama/Llama-4-Scout-17B-16E-Instruct/2a96ee4639be3796f16b.json b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/llama4_text/meta-llama/Llama-4-Scout-17B-16E-Instruct/2a96ee4639be3796f16b.json
new file mode 100644
index 0000000000000000000000000000000000000000..f4319a8fb3b08f7b48387c3a06c0824f6238db3b
--- /dev/null
+++ b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/llama4_text/meta-llama/Llama-4-Scout-17B-16E-Instruct/2a96ee4639be3796f16b.json
@@ -0,0 +1,220 @@
+{
+  "_entry_class": "SingleModelCacheEntry",
+  "_model_id": "meta-llama/Llama-4-Scout-17B-16E-Instruct",
+  "_task": "text-generation",
+  "attention_bias": false,
+  "attention_chunk_size": 8192,
+  "attention_dropout": 0.0,
+  "attn_scale": 0.1,
+  "attn_temperature_tuning": true,
+  "floor_scale": 8192,
+  "for_llm_compressor": false,
+  "head_dim": 128,
+  "hidden_act": "silu",
+  "hidden_size": 5120,
+  "initializer_range": 0.02,
+  "interleave_moe_layer_step": 1,
+  "intermediate_size": 8192,
+  "intermediate_size_mlp": 16384,
+  "layer_types": [
+    "chunked_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "full_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "full_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "full_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "full_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "full_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "full_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "full_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "full_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "full_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "full_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "full_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "full_attention"
+  ],
+  "max_position_embeddings": 10485760,
+  "model_type": "llama4_text",
+  "moe_layers": [
+    0,
+    1,
+    2,
+    3,
+    4,
+    5,
+    6,
+    7,
+    8,
+    9,
+    10,
+    11,
+    12,
+    13,
+    14,
+    15,
+    16,
+    17,
+    18,
+    19,
+    20,
+    21,
+    22,
+    23,
+    24,
+    25,
+    26,
+    27,
+    28,
+    29,
+    30,
+    31,
+    32,
+    33,
+    34,
+    35,
+    36,
+    37,
+    38,
+    39,
+    40,
+    41,
+    42,
+    43,
+    44,
+    45,
+    46,
+    47
+  ],
+  "neuron": {
+    "_serialized_key": "NxDNeuronConfig",
+    "batch_size": 16,
+    "capacity_factor": null,
+    "checkpoint_id": "meta-llama/Llama-4-Scout-17B-16E-Instruct",
+    "checkpoint_revision": "92f3b1597a195b523d8d9e5700e57e4fbb8f20d3",
+    "continuous_batching": true,
+    "enable_bucketing": false,
+    "ep_degree": 1,
+    "fused_qkv": false,
+    "glu_mlp": true,
+    "local_ranks_size": 32,
+    "max_batch_size": 16,
+    "max_context_length": 4096,
+    "max_topk": 256,
+    "n_active_tokens": 4096,
+    "neuronxcc_version": "2.21.18209.0+043b1bf7",
+    "on_device_sampling": true,
+    "optimum_neuron_version": "0.4.0",
+    "output_logits": false,
+    "pp_degree": 1,
+    "sequence_length": 4096,
+    "speculation_length": 0,
+    "start_rank_id": 0,
+    "target": "trn2",
+    "torch_dtype": "bfloat16",
+    "tp_degree": 32
+  },
+  "no_rope_layers": [
+    1,
+    1,
+    1,
+    0,
+    1,
+    1,
+    1,
+    0,
+    1,
+    1,
+    1,
+    0,
+    1,
+    1,
+    1,
+    0,
+    1,
+    1,
+    1,
+    0,
+    1,
+    1,
+    1,
+    0,
+    1,
+    1,
+    1,
+    0,
+    1,
+    1,
+    1,
+    0,
+    1,
+    1,
+    1,
+    0,
+    1,
+    1,
+    1,
+    0,
+    1,
+    1,
+    1,
+    0,
+    1,
+    1,
+    1,
+    0
+  ],
+  "num_attention_heads": 40,
+  "num_experts_per_tok": 1,
+  "num_hidden_layers": 48,
+  "num_key_value_heads": 8,
+  "num_local_experts": 16,
+  "output_router_logits": false,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": {
+    "factor": 16.0,
+    "high_freq_factor": 1.0,
+    "low_freq_factor": 1.0,
+    "original_max_position_embeddings": 8192,
+    "rope_type": "llama3"
+  },
+  "rope_theta": 500000.0,
+  "router_aux_loss_coef": 0.001,
+  "router_jitter_noise": 0.0,
+  "tie_word_embeddings": false,
+  "use_cache": true,
+  "use_qk_norm": true,
+  "vocab_size": 202048
+}
\ No newline at end of file
diff --git a/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/llama4_text/meta-llama/Llama-4-Scout-17B-16E-Instruct/621024dbf42a03b7babc.json b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/llama4_text/meta-llama/Llama-4-Scout-17B-16E-Instruct/621024dbf42a03b7babc.json
new file mode 100644
index 0000000000000000000000000000000000000000..9bf1357634971428503b4ebbcf53eb99d7686320
--- /dev/null
+++ b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/llama4_text/meta-llama/Llama-4-Scout-17B-16E-Instruct/621024dbf42a03b7babc.json
@@ -0,0 +1,220 @@
+{
+  "_entry_class": "SingleModelCacheEntry",
+  "_model_id": "meta-llama/Llama-4-Scout-17B-16E-Instruct",
+  "_task": "text-generation",
+  "attention_bias": false,
+  "attention_chunk_size": 8192,
+  "attention_dropout": 0.0,
+  "attn_scale": 0.1,
+  "attn_temperature_tuning": true,
+  "floor_scale": 8192,
+  "for_llm_compressor": false,
+  "head_dim": 128,
+  "hidden_act": "silu",
+  "hidden_size": 5120,
+  "initializer_range": 0.02,
+  "interleave_moe_layer_step": 1,
+  "intermediate_size": 8192,
+  "intermediate_size_mlp": 16384,
+  "layer_types": [
+    "chunked_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "full_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "full_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "full_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "full_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "full_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "full_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "full_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "full_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "full_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "full_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "full_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "full_attention"
+  ],
+  "max_position_embeddings": 10485760,
+  "model_type": "llama4_text",
+  "moe_layers": [
+    0,
+    1,
+    2,
+    3,
+    4,
+    5,
+    6,
+    7,
+    8,
+    9,
+    10,
+    11,
+    12,
+    13,
+    14,
+    15,
+    16,
+    17,
+    18,
+    19,
+    20,
+    21,
+    22,
+    23,
+    24,
+    25,
+    26,
+    27,
+    28,
+    29,
+    30,
+    31,
+    32,
+    33,
+    34,
+    35,
+    36,
+    37,
+    38,
+    39,
+    40,
+    41,
+    42,
+    43,
+    44,
+    45,
+    46,
+    47
+  ],
+  "neuron": {
+    "_serialized_key": "NxDNeuronConfig",
+    "batch_size": 4,
+    "capacity_factor": null,
+    "checkpoint_id": "meta-llama/Llama-4-Scout-17B-16E-Instruct",
+    "checkpoint_revision": "92f3b1597a195b523d8d9e5700e57e4fbb8f20d3",
+    "continuous_batching": true,
+    "enable_bucketing": false,
+    "ep_degree": 1,
+    "fused_qkv": false,
+    "glu_mlp": true,
+    "local_ranks_size": 32,
+    "max_batch_size": 4,
+    "max_context_length": 4096,
+    "max_topk": 256,
+    "n_active_tokens": 4096,
+    "neuronxcc_version": "2.21.18209.0+043b1bf7",
+    "on_device_sampling": true,
+    "optimum_neuron_version": "0.4.0",
+    "output_logits": false,
+    "pp_degree": 1,
+    "sequence_length": 4096,
+    "speculation_length": 0,
+    "start_rank_id": 0,
+    "target": "trn2",
+    "torch_dtype": "bfloat16",
+    "tp_degree": 32
+  },
+  "no_rope_layers": [
+    1,
+    1,
+    1,
+    0,
+    1,
+    1,
+    1,
+    0,
+    1,
+    1,
+    1,
+    0,
+    1,
+    1,
+    1,
+    0,
+    1,
+    1,
+    1,
+    0,
+    1,
+    1,
+    1,
+    0,
+    1,
+    1,
+    1,
+    0,
+    1,
+    1,
+    1,
+    0,
+    1,
+    1,
+    1,
+    0,
+    1,
+    1,
+    1,
+    0,
+    1,
+    1,
+    1,
+    0,
+    1,
+    1,
+    1,
+    0
+  ],
+  "num_attention_heads": 40,
+  "num_experts_per_tok": 1,
+  "num_hidden_layers": 48,
+  "num_key_value_heads": 8,
+  "num_local_experts": 16,
+  "output_router_logits": false,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": {
+    "factor": 16.0,
+    "high_freq_factor": 1.0,
+    "low_freq_factor": 1.0,
+    "original_max_position_embeddings": 8192,
+    "rope_type": "llama3"
+  },
+  "rope_theta": 500000.0,
+  "router_aux_loss_coef": 0.001,
+  "router_jitter_noise": 0.0,
+  "tie_word_embeddings": false,
+  "use_cache": true,
+  "use_qk_norm": true,
+  "vocab_size": 202048
+}
\ No newline at end of file
diff --git a/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/llama4_text/meta-llama/Llama-4-Scout-17B-16E-Instruct/923fcd4cf259579b5e4a.json b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/llama4_text/meta-llama/Llama-4-Scout-17B-16E-Instruct/923fcd4cf259579b5e4a.json
new file mode 100644
index 0000000000000000000000000000000000000000..8e5b96845d7e5ac136db6516082ed7ef15b0404c
--- /dev/null
+++ b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/llama4_text/meta-llama/Llama-4-Scout-17B-16E-Instruct/923fcd4cf259579b5e4a.json
@@ -0,0 +1,220 @@
+{
+  "_entry_class": "SingleModelCacheEntry",
+  "_model_id": "meta-llama/Llama-4-Scout-17B-16E-Instruct",
+  "_task": "text-generation",
+  "attention_bias": false,
+  "attention_chunk_size": 8192,
+  "attention_dropout": 0.0,
+  "attn_scale": 0.1,
+  "attn_temperature_tuning": true,
+  "floor_scale": 8192,
+  "for_llm_compressor": false,
+  "head_dim": 128,
+  "hidden_act": "silu",
+  "hidden_size": 5120,
+  "initializer_range": 0.02,
+  "interleave_moe_layer_step": 1,
+  "intermediate_size": 8192,
+  "intermediate_size_mlp": 16384,
+  "layer_types": [
+    "chunked_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "full_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "full_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "full_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "full_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "full_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "full_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "full_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "full_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "full_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "full_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "full_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "full_attention"
+  ],
+  "max_position_embeddings": 10485760,
+  "model_type": "llama4_text",
+  "moe_layers": [
+    0,
+    1,
+    2,
+    3,
+    4,
+    5,
+    6,
+    7,
+    8,
+    9,
+    10,
+    11,
+    12,
+    13,
+    14,
+    15,
+    16,
+    17,
+    18,
+    19,
+    20,
+    21,
+    22,
+    23,
+    24,
+    25,
+    26,
+    27,
+    28,
+    29,
+    30,
+    31,
+    32,
+    33,
+    34,
+    35,
+    36,
+    37,
+    38,
+    39,
+    40,
+    41,
+    42,
+    43,
+    44,
+    45,
+    46,
+    47
+  ],
+  "neuron": {
+    "_serialized_key": "NxDNeuronConfig",
+    "batch_size": 1,
+    "capacity_factor": null,
+    "checkpoint_id": "meta-llama/Llama-4-Scout-17B-16E-Instruct",
+    "checkpoint_revision": "92f3b1597a195b523d8d9e5700e57e4fbb8f20d3",
+    "continuous_batching": false,
+    "enable_bucketing": false,
+    "ep_degree": 1,
+    "fused_qkv": false,
+    "glu_mlp": true,
+    "local_ranks_size": 32,
+    "max_batch_size": 1,
+    "max_context_length": 4096,
+    "max_topk": 256,
+    "n_active_tokens": 4096,
+    "neuronxcc_version": "2.21.18209.0+043b1bf7",
+    "on_device_sampling": true,
+    "optimum_neuron_version": "0.4.0",
+    "output_logits": false,
+    "pp_degree": 1,
+    "sequence_length": 4096,
+    "speculation_length": 0,
+    "start_rank_id": 0,
+    "target": "trn2",
+    "torch_dtype": "bfloat16",
+    "tp_degree": 32
+  },
+  "no_rope_layers": [
+    1,
+    1,
+    1,
+    0,
+    1,
+    1,
+    1,
+    0,
+    1,
+    1,
+    1,
+    0,
+    1,
+    1,
+    1,
+    0,
+    1,
+    1,
+    1,
+    0,
+    1,
+    1,
+    1,
+    0,
+    1,
+    1,
+    1,
+    0,
+    1,
+    1,
+    1,
+    0,
+    1,
+    1,
+    1,
+    0,
+    1,
+    1,
+    1,
+    0,
+    1,
+    1,
+    1,
+    0,
+    1,
+    1,
+    1,
+    0
+  ],
+  "num_attention_heads": 40,
+  "num_experts_per_tok": 1,
+  "num_hidden_layers": 48,
+  "num_key_value_heads": 8,
+  "num_local_experts": 16,
+  "output_router_logits": false,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": {
+    "factor": 16.0,
+    "high_freq_factor": 1.0,
+    "low_freq_factor": 1.0,
+    "original_max_position_embeddings": 8192,
+    "rope_type": "llama3"
+  },
+  "rope_theta": 500000.0,
+  "router_aux_loss_coef": 0.001,
+  "router_jitter_noise": 0.0,
+  "tie_word_embeddings": false,
+  "use_cache": true,
+  "use_qk_norm": true,
+  "vocab_size": 202048
+}
\ No newline at end of file
diff --git a/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/llama4_text/meta-llama/Llama-4-Scout-17B-16E-Instruct/a90ff1e995579ec8deee.json b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/llama4_text/meta-llama/Llama-4-Scout-17B-16E-Instruct/a90ff1e995579ec8deee.json
new file mode 100644
index 0000000000000000000000000000000000000000..d91c1bf4ab1fd4cc8c76fb1db3d82ccff976cc56
--- /dev/null
+++ b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/llama4_text/meta-llama/Llama-4-Scout-17B-16E-Instruct/a90ff1e995579ec8deee.json
@@ -0,0 +1,220 @@
+{
+  "_entry_class": "SingleModelCacheEntry",
+  "_model_id": "meta-llama/Llama-4-Scout-17B-16E-Instruct",
+  "_task": "text-generation",
+  "attention_bias": false,
+  "attention_chunk_size": 8192,
+  "attention_dropout": 0.0,
+  "attn_scale": 0.1,
+  "attn_temperature_tuning": true,
+  "floor_scale": 8192,
+  "for_llm_compressor": false,
+  "head_dim": 128,
+  "hidden_act": "silu",
+  "hidden_size": 5120,
+  "initializer_range": 0.02,
+  "interleave_moe_layer_step": 1,
+  "intermediate_size": 8192,
+  "intermediate_size_mlp": 16384,
+  "layer_types": [
+    "chunked_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "full_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "full_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "full_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "full_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "full_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "full_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "full_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "full_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "full_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "full_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "full_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "full_attention"
+  ],
+  "max_position_embeddings": 10485760,
+  "model_type": "llama4_text",
+  "moe_layers": [
+    0,
+    1,
+    2,
+    3,
+    4,
+    5,
+    6,
+    7,
+    8,
+    9,
+    10,
+    11,
+    12,
+    13,
+    14,
+    15,
+    16,
+    17,
+    18,
+    19,
+    20,
+    21,
+    22,
+    23,
+    24,
+    25,
+    26,
+    27,
+    28,
+    29,
+    30,
+    31,
+    32,
+    33,
+    34,
+    35,
+    36,
+    37,
+    38,
+    39,
+    40,
+    41,
+    42,
+    43,
+    44,
+    45,
+    46,
+    47
+  ],
+  "neuron": {
+    "_serialized_key": "NxDNeuronConfig",
+    "batch_size": 4,
+    "capacity_factor": null,
+    "checkpoint_id": "meta-llama/Llama-4-Scout-17B-16E-Instruct",
+    "checkpoint_revision": "92f3b1597a195b523d8d9e5700e57e4fbb8f20d3",
+    "continuous_batching": true,
+    "enable_bucketing": false,
+    "ep_degree": 1,
+    "fused_qkv": false,
+    "glu_mlp": true,
+    "local_ranks_size": 16,
+    "max_batch_size": 4,
+    "max_context_length": 4096,
+    "max_topk": 256,
+    "n_active_tokens": 4096,
+    "neuronxcc_version": "2.21.18209.0+043b1bf7",
+    "on_device_sampling": true,
+    "optimum_neuron_version": "0.4.0",
+    "output_logits": false,
+    "pp_degree": 1,
+    "sequence_length": 4096,
+    "speculation_length": 0,
+    "start_rank_id": 0,
+    "target": "trn1",
+    "torch_dtype": "bfloat16",
+    "tp_degree": 16
+  },
+  "no_rope_layers": [
+    1,
+    1,
+    1,
+    0,
+    1,
+    1,
+    1,
+    0,
+    1,
+    1,
+    1,
+    0,
+    1,
+    1,
+    1,
+    0,
+    1,
+    1,
+    1,
+    0,
+    1,
+    1,
+    1,
+    0,
+    1,
+    1,
+    1,
+    0,
+    1,
+    1,
+    1,
+    0,
+    1,
+    1,
+    1,
+    0,
+    1,
+    1,
+    1,
+    0,
+    1,
+    1,
+    1,
+    0,
+    1,
+    1,
+    1,
+    0
+  ],
+  "num_attention_heads": 40,
+  "num_experts_per_tok": 1,
+  "num_hidden_layers": 48,
+  "num_key_value_heads": 8,
+  "num_local_experts": 16,
+  "output_router_logits": false,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": {
+    "factor": 16.0,
+    "high_freq_factor": 1.0,
+    "low_freq_factor": 1.0,
+    "original_max_position_embeddings": 8192,
+    "rope_type": "llama3"
+  },
+  "rope_theta": 500000.0,
+  "router_aux_loss_coef": 0.001,
+  "router_jitter_noise": 0.0,
+  "tie_word_embeddings": false,
+  "use_cache": true,
+  "use_qk_norm": true,
+  "vocab_size": 202048
+}
\ No newline at end of file
diff --git a/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/llama4_text/meta-llama/Llama-4-Scout-17B-16E-Instruct/c3450e1affaca20e05e3.json b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/llama4_text/meta-llama/Llama-4-Scout-17B-16E-Instruct/c3450e1affaca20e05e3.json
new file mode 100644
index 0000000000000000000000000000000000000000..9365e0aefa067bb8348641e15a55b9a868bfc4e3
--- /dev/null
+++ b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/llama4_text/meta-llama/Llama-4-Scout-17B-16E-Instruct/c3450e1affaca20e05e3.json
@@ -0,0 +1,220 @@
+{
+  "_entry_class": "SingleModelCacheEntry",
+  "_model_id": "meta-llama/Llama-4-Scout-17B-16E-Instruct",
+  "_task": "text-generation",
+  "attention_bias": false,
+  "attention_chunk_size": 8192,
+  "attention_dropout": 0.0,
+  "attn_scale": 0.1,
+  "attn_temperature_tuning": true,
+  "floor_scale": 8192,
+  "for_llm_compressor": false,
+  "head_dim": 128,
+  "hidden_act": "silu",
+  "hidden_size": 5120,
+  "initializer_range": 0.02,
+  "interleave_moe_layer_step": 1,
+  "intermediate_size": 8192,
+  "intermediate_size_mlp": 16384,
+  "layer_types": [
+    "chunked_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "full_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "full_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "full_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "full_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "full_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "full_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "full_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "full_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "full_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "full_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "full_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "full_attention"
+  ],
+  "max_position_embeddings": 10485760,
+  "model_type": "llama4_text",
+  "moe_layers": [
+    0,
+    1,
+    2,
+    3,
+    4,
+    5,
+    6,
+    7,
+    8,
+    9,
+    10,
+    11,
+    12,
+    13,
+    14,
+    15,
+    16,
+    17,
+    18,
+    19,
+    20,
+    21,
+    22,
+    23,
+    24,
+    25,
+    26,
+    27,
+    28,
+    29,
+    30,
+    31,
+    32,
+    33,
+    34,
+    35,
+    36,
+    37,
+    38,
+    39,
+    40,
+    41,
+    42,
+    43,
+    44,
+    45,
+    46,
+    47
+  ],
+  "neuron": {
+    "_serialized_key": "NxDNeuronConfig",
+    "batch_size": 1,
+    "capacity_factor": null,
+    "checkpoint_id": "meta-llama/Llama-4-Scout-17B-16E-Instruct",
+    "checkpoint_revision": "92f3b1597a195b523d8d9e5700e57e4fbb8f20d3",
+    "continuous_batching": false,
+    "enable_bucketing": false,
+    "ep_degree": 1,
+    "fused_qkv": false,
+    "glu_mlp": true,
+    "local_ranks_size": 16,
+    "max_batch_size": 1,
+    "max_context_length": 4096,
+    "max_topk": 256,
+    "n_active_tokens": 4096,
+    "neuronxcc_version": "2.21.18209.0+043b1bf7",
+    "on_device_sampling": true,
+    "optimum_neuron_version": "0.4.0",
+    "output_logits": false,
+    "pp_degree": 1,
+    "sequence_length": 4096,
+    "speculation_length": 0,
+    "start_rank_id": 0,
+    "target": "trn1",
+    "torch_dtype": "bfloat16",
+    "tp_degree": 16
+  },
+  "no_rope_layers": [
+    1,
+    1,
+    1,
+    0,
+    1,
+    1,
+    1,
+    0,
+    1,
+    1,
+    1,
+    0,
+    1,
+    1,
+    1,
+    0,
+    1,
+    1,
+    1,
+    0,
+    1,
+    1,
+    1,
+    0,
+    1,
+    1,
+    1,
+    0,
+    1,
+    1,
+    1,
+    0,
+    1,
+    1,
+    1,
+    0,
+    1,
+    1,
+    1,
+    0,
+    1,
+    1,
+    1,
+    0,
+    1,
+    1,
+    1,
+    0
+  ],
+  "num_attention_heads": 40,
+  "num_experts_per_tok": 1,
+  "num_hidden_layers": 48,
+  "num_key_value_heads": 8,
+  "num_local_experts": 16,
+  "output_router_logits": false,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": {
+    "factor": 16.0,
+    "high_freq_factor": 1.0,
+    "low_freq_factor": 1.0,
+    "original_max_position_embeddings": 8192,
+    "rope_type": "llama3"
+  },
+  "rope_theta": 500000.0,
+  "router_aux_loss_coef": 0.001,
+  "router_jitter_noise": 0.0,
+  "tie_word_embeddings": false,
+  "use_cache": true,
+  "use_qk_norm": true,
+  "vocab_size": 202048
+}
\ No newline at end of file
diff --git a/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/llama4_text/meta-llama/Llama-4-Scout-17B-16E-Instruct/d33cefe3ad2c77e0544b.json b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/llama4_text/meta-llama/Llama-4-Scout-17B-16E-Instruct/d33cefe3ad2c77e0544b.json
new file mode 100644
index 0000000000000000000000000000000000000000..2a6d74f8be469a5504c206bf2fb6304acdf23743
--- /dev/null
+++ b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/llama4_text/meta-llama/Llama-4-Scout-17B-16E-Instruct/d33cefe3ad2c77e0544b.json
@@ -0,0 +1,220 @@
+{
+  "_entry_class": "SingleModelCacheEntry",
+  "_model_id": "meta-llama/Llama-4-Scout-17B-16E-Instruct",
+  "_task": "text-generation",
+  "attention_bias": false,
+  "attention_chunk_size": 8192,
+  "attention_dropout": 0.0,
+  "attn_scale": 0.1,
+  "attn_temperature_tuning": true,
+  "floor_scale": 8192,
+  "for_llm_compressor": false,
+  "head_dim": 128,
+  "hidden_act": "silu",
+  "hidden_size": 5120,
+  "initializer_range": 0.02,
+  "interleave_moe_layer_step": 1,
+  "intermediate_size": 8192,
+  "intermediate_size_mlp": 16384,
+  "layer_types": [
+    "chunked_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "full_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "full_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "full_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "full_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "full_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "full_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "full_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "full_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "full_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "full_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "full_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "full_attention"
+  ],
+  "max_position_embeddings": 10485760,
+  "model_type": "llama4_text",
+  "moe_layers": [
+    0,
+    1,
+    2,
+    3,
+    4,
+    5,
+    6,
+    7,
+    8,
+    9,
+    10,
+    11,
+    12,
+    13,
+    14,
+    15,
+    16,
+    17,
+    18,
+    19,
+    20,
+    21,
+    22,
+    23,
+    24,
+    25,
+    26,
+    27,
+    28,
+    29,
+    30,
+    31,
+    32,
+    33,
+    34,
+    35,
+    36,
+    37,
+    38,
+    39,
+    40,
+    41,
+    42,
+    43,
+    44,
+    45,
+    46,
+    47
+  ],
+  "neuron": {
+    "_serialized_key": "NxDNeuronConfig",
+    "batch_size": 32,
+    "capacity_factor": null,
+    "checkpoint_id": "meta-llama/Llama-4-Scout-17B-16E-Instruct",
+    "checkpoint_revision": "92f3b1597a195b523d8d9e5700e57e4fbb8f20d3",
+    "continuous_batching": true,
+    "enable_bucketing": false,
+    "ep_degree": 1,
+    "fused_qkv": false,
+    "glu_mlp": true,
+    "local_ranks_size": 32,
+    "max_batch_size": 32,
+    "max_context_length": 4096,
+    "max_topk": 256,
+    "n_active_tokens": 4096,
+    "neuronxcc_version": "2.21.18209.0+043b1bf7",
+    "on_device_sampling": true,
+    "optimum_neuron_version": "0.4.0",
+    "output_logits": false,
+    "pp_degree": 1,
+    "sequence_length": 4096,
+    "speculation_length": 0,
+    "start_rank_id": 0,
+    "target": "trn2",
+    "torch_dtype": "bfloat16",
+    "tp_degree": 32
+  },
+  "no_rope_layers": [
+    1,
+    1,
+    1,
+    0,
+    1,
+    1,
+    1,
+    0,
+    1,
+    1,
+    1,
+    0,
+    1,
+    1,
+    1,
+    0,
+    1,
+    1,
+    1,
+    0,
+    1,
+    1,
+    1,
+    0,
+    1,
+    1,
+    1,
+    0,
+    1,
+    1,
+    1,
+    0,
+    1,
+    1,
+    1,
+    0,
+    1,
+    1,
+    1,
+    0,
+    1,
+    1,
+    1,
+    0,
+    1,
+    1,
+    1,
+    0
+  ],
+  "num_attention_heads": 40,
+  "num_experts_per_tok": 1,
+  "num_hidden_layers": 48,
+  "num_key_value_heads": 8,
+  "num_local_experts": 16,
+  "output_router_logits": false,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": {
+    "factor": 16.0,
+    "high_freq_factor": 1.0,
+    "low_freq_factor": 1.0,
+    "original_max_position_embeddings": 8192,
+    "rope_type": "llama3"
+  },
+  "rope_theta": 500000.0,
+  "router_aux_loss_coef": 0.001,
+  "router_jitter_noise": 0.0,
+  "tie_word_embeddings": false,
+  "use_cache": true,
+  "use_qk_norm": true,
+  "vocab_size": 202048
+}
\ No newline at end of file
diff --git a/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/llama4_text/meta-llama/Llama-4-Scout-17B-16E-Instruct/f71e619c760aaf9e2888.json b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/llama4_text/meta-llama/Llama-4-Scout-17B-16E-Instruct/f71e619c760aaf9e2888.json
new file mode 100644
index 0000000000000000000000000000000000000000..dd3b35e83cfcc95cf76fe811bdd930a6de3b7915
--- /dev/null
+++ b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/llama4_text/meta-llama/Llama-4-Scout-17B-16E-Instruct/f71e619c760aaf9e2888.json
@@ -0,0 +1,220 @@
+{
+  "_entry_class": "SingleModelCacheEntry",
+  "_model_id": "meta-llama/Llama-4-Scout-17B-16E-Instruct",
+  "_task": "text-generation",
+  "attention_bias": false,
+  "attention_chunk_size": 8192,
+  "attention_dropout": 0.0,
+  "attn_scale": 0.1,
+  "attn_temperature_tuning": true,
+  "floor_scale": 8192,
+  "for_llm_compressor": false,
+  "head_dim": 128,
+  "hidden_act": "silu",
+  "hidden_size": 5120,
+  "initializer_range": 0.02,
+  "interleave_moe_layer_step": 1,
+  "intermediate_size": 8192,
+  "intermediate_size_mlp": 16384,
+  "layer_types": [
+    "chunked_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "full_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "full_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "full_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "full_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "full_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "full_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "full_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "full_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "full_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "full_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "full_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "chunked_attention",
+    "full_attention"
+  ],
+  "max_position_embeddings": 10485760,
+  "model_type": "llama4_text",
+  "moe_layers": [
+    0,
+    1,
+    2,
+    3,
+    4,
+    5,
+    6,
+    7,
+    8,
+    9,
+    10,
+    11,
+    12,
+    13,
+    14,
+    15,
+    16,
+    17,
+    18,
+    19,
+    20,
+    21,
+    22,
+    23,
+    24,
+    25,
+    26,
+    27,
+    28,
+    29,
+    30,
+    31,
+    32,
+    33,
+    34,
+    35,
+    36,
+    37,
+    38,
+    39,
+    40,
+    41,
+    42,
+    43,
+    44,
+    45,
+    46,
+    47
+  ],
+  "neuron": {
+    "_serialized_key": "NxDNeuronConfig",
+    "batch_size": 8,
+    "capacity_factor": null,
+    "checkpoint_id": "meta-llama/Llama-4-Scout-17B-16E-Instruct",
+    "checkpoint_revision": "92f3b1597a195b523d8d9e5700e57e4fbb8f20d3",
+    "continuous_batching": true,
+    "enable_bucketing": false,
+    "ep_degree": 1,
+    "fused_qkv": false,
+    "glu_mlp": true,
+    "local_ranks_size": 32,
+    "max_batch_size": 8,
+    "max_context_length": 4096,
+    "max_topk": 256,
+    "n_active_tokens": 4096,
+    "neuronxcc_version": "2.21.18209.0+043b1bf7",
+    "on_device_sampling": true,
+    "optimum_neuron_version": "0.4.0",
+    "output_logits": false,
+    "pp_degree": 1,
+    "sequence_length": 4096,
+    "speculation_length": 0,
+    "start_rank_id": 0,
+    "target": "trn1",
+    "torch_dtype": "bfloat16",
+    "tp_degree": 32
+  },
+  "no_rope_layers": [
+    1,
+    1,
+    1,
+    0,
+    1,
+    1,
+    1,
+    0,
+    1,
+    1,
+    1,
+    0,
+    1,
+    1,
+    1,
+    0,
+    1,
+    1,
+    1,
+    0,
+    1,
+    1,
+    1,
+    0,
+    1,
+    1,
+    1,
+    0,
+    1,
+    1,
+    1,
+    0,
+    1,
+    1,
+    1,
+    0,
+    1,
+    1,
+    1,
+    0,
+    1,
+    1,
+    1,
+    0,
+    1,
+    1,
+    1,
+    0
+  ],
+  "num_attention_heads": 40,
+  "num_experts_per_tok": 1,
+  "num_hidden_layers": 48,
+  "num_key_value_heads": 8,
+  "num_local_experts": 16,
+  "output_router_logits": false,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": {
+    "factor": 16.0,
+    "high_freq_factor": 1.0,
+    "low_freq_factor": 1.0,
+    "original_max_position_embeddings": 8192,
+    "rope_type": "llama3"
+  },
+  "rope_theta": 500000.0,
+  "router_aux_loss_coef": 0.001,
+  "router_jitter_noise": 0.0,
+  "tie_word_embeddings": false,
+  "use_cache": true,
+  "use_qk_norm": true,
+  "vocab_size": 202048
+}
\ No newline at end of file
diff --git a/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/phi3/microsoft/Phi-3-mini-4k-instruct/3558b5ac7259b6bcc01a.json b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/phi3/microsoft/Phi-3-mini-4k-instruct/3558b5ac7259b6bcc01a.json
new file mode 100644
index 0000000000000000000000000000000000000000..8cd403157e7e8ccb064801a9273ab0138381edf8
--- /dev/null
+++ b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/phi3/microsoft/Phi-3-mini-4k-instruct/3558b5ac7259b6bcc01a.json
@@ -0,0 +1,62 @@
+{
+  "_entry_class": "SingleModelCacheEntry",
+  "_model_id": "microsoft/Phi-3-mini-4k-instruct",
+  "_task": "text-generation",
+  "architectures": [
+    "Phi3ForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "auto_map": {
+    "AutoConfig": "configuration_phi3.Phi3Config",
+    "AutoModelForCausalLM": "modeling_phi3.Phi3ForCausalLM"
+  },
+  "embd_pdrop": 0.0,
+  "hidden_act": "silu",
+  "hidden_size": 3072,
+  "initializer_range": 0.02,
+  "intermediate_size": 8192,
+  "max_position_embeddings": 4096,
+  "model_type": "phi3",
+  "neuron": {
+    "_serialized_key": "NxDNeuronConfig",
+    "batch_size": 1,
+    "capacity_factor": null,
+    "checkpoint_id": "microsoft/Phi-3-mini-4k-instruct",
+    "checkpoint_revision": "0a67737cc96d2554230f90338b163bc6380a2a85",
+    "continuous_batching": false,
+    "enable_bucketing": false,
+    "ep_degree": 1,
+    "fused_qkv": true,
+    "glu_mlp": true,
+    "local_ranks_size": 2,
+    "max_batch_size": 1,
+    "max_context_length": 4096,
+    "max_topk": 256,
+    "n_active_tokens": 4096,
+    "neuronxcc_version": "2.21.18209.0+043b1bf7",
+    "on_device_sampling": true,
+    "optimum_neuron_version": "0.4.0",
+    "output_logits": false,
+    "pp_degree": 1,
+    "sequence_length": 4096,
+    "speculation_length": 0,
+    "start_rank_id": 0,
+    "target": "trn1",
+    "torch_dtype": "bfloat16",
+    "tp_degree": 2
+  },
+  "num_attention_heads": 32,
+  "num_hidden_layers": 32,
+  "num_key_value_heads": 32,
+  "original_max_position_embeddings": 4096,
+  "partial_rotary_factor": 1.0,
+  "resid_pdrop": 0.0,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": null,
+  "rope_theta": 10000.0,
+  "sliding_window": 2047,
+  "tie_word_embeddings": false,
+  "use_cache": true,
+  "vocab_size": 32064
+}
\ No newline at end of file
diff --git a/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/phi3/microsoft/phi-4/38f87915d107c55b7651.json b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/phi3/microsoft/phi-4/38f87915d107c55b7651.json
new file mode 100644
index 0000000000000000000000000000000000000000..9a6f35a64b384dd657e89eee83087de40bc083c2
--- /dev/null
+++ b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/phi3/microsoft/phi-4/38f87915d107c55b7651.json
@@ -0,0 +1,58 @@
+{
+  "_entry_class": "SingleModelCacheEntry",
+  "_model_id": "microsoft/phi-4",
+  "_task": "text-generation",
+  "architectures": [
+    "Phi3ForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "embd_pdrop": 0.0,
+  "hidden_act": "silu",
+  "hidden_size": 5120,
+  "initializer_range": 0.02,
+  "intermediate_size": 17920,
+  "max_position_embeddings": 16384,
+  "model_type": "phi3",
+  "neuron": {
+    "_serialized_key": "NxDNeuronConfig",
+    "batch_size": 16,
+    "capacity_factor": null,
+    "checkpoint_id": "microsoft/phi-4",
+    "checkpoint_revision": "187ef0342fff0eb3333be9f00389385e95ef0b61",
+    "continuous_batching": true,
+    "enable_bucketing": false,
+    "ep_degree": 1,
+    "fused_qkv": true,
+    "glu_mlp": true,
+    "local_ranks_size": 10,
+    "max_batch_size": 16,
+    "max_context_length": 4096,
+    "max_topk": 256,
+    "n_active_tokens": 4096,
+    "neuronxcc_version": "2.21.18209.0+043b1bf7",
+    "on_device_sampling": true,
+    "optimum_neuron_version": "0.4.0",
+    "output_logits": false,
+    "pp_degree": 1,
+    "sequence_length": 4096,
+    "speculation_length": 0,
+    "start_rank_id": 0,
+    "target": "trn1",
+    "torch_dtype": "bfloat16",
+    "tp_degree": 10
+  },
+  "num_attention_heads": 40,
+  "num_hidden_layers": 40,
+  "num_key_value_heads": 10,
+  "original_max_position_embeddings": 16384,
+  "partial_rotary_factor": 1.0,
+  "resid_pdrop": 0.0,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": null,
+  "rope_theta": 250000,
+  "sliding_window": null,
+  "tie_word_embeddings": false,
+  "use_cache": true,
+  "vocab_size": 100352
+}
\ No newline at end of file
diff --git a/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/phi3/microsoft/phi-4/473a4f2462bcd8b3f136.json b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/phi3/microsoft/phi-4/473a4f2462bcd8b3f136.json
new file mode 100644
index 0000000000000000000000000000000000000000..8255e424fe12b554bb94482cbc0c34d257b65202
--- /dev/null
+++ b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/phi3/microsoft/phi-4/473a4f2462bcd8b3f136.json
@@ -0,0 +1,58 @@
+{
+  "_entry_class": "SingleModelCacheEntry",
+  "_model_id": "microsoft/phi-4",
+  "_task": "text-generation",
+  "architectures": [
+    "Phi3ForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "embd_pdrop": 0.0,
+  "hidden_act": "silu",
+  "hidden_size": 5120,
+  "initializer_range": 0.02,
+  "intermediate_size": 17920,
+  "max_position_embeddings": 16384,
+  "model_type": "phi3",
+  "neuron": {
+    "_serialized_key": "NxDNeuronConfig",
+    "batch_size": 1,
+    "capacity_factor": null,
+    "checkpoint_id": "microsoft/phi-4",
+    "checkpoint_revision": "187ef0342fff0eb3333be9f00389385e95ef0b61",
+    "continuous_batching": false,
+    "enable_bucketing": false,
+    "ep_degree": 1,
+    "fused_qkv": true,
+    "glu_mlp": true,
+    "local_ranks_size": 10,
+    "max_batch_size": 1,
+    "max_context_length": 4096,
+    "max_topk": 256,
+    "n_active_tokens": 4096,
+    "neuronxcc_version": "2.21.18209.0+043b1bf7",
+    "on_device_sampling": true,
+    "optimum_neuron_version": "0.4.0",
+    "output_logits": false,
+    "pp_degree": 1,
+    "sequence_length": 4096,
+    "speculation_length": 0,
+    "start_rank_id": 0,
+    "target": "trn1",
+    "torch_dtype": "bfloat16",
+    "tp_degree": 10
+  },
+  "num_attention_heads": 40,
+  "num_hidden_layers": 40,
+  "num_key_value_heads": 10,
+  "original_max_position_embeddings": 16384,
+  "partial_rotary_factor": 1.0,
+  "resid_pdrop": 0.0,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": null,
+  "rope_theta": 250000,
+  "sliding_window": null,
+  "tie_word_embeddings": false,
+  "use_cache": true,
+  "vocab_size": 100352
+}
\ No newline at end of file
diff --git a/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen2/Qwen/Qwen2.5-0.5B/300b37dace1ce2c0b783.json b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen2/Qwen/Qwen2.5-0.5B/300b37dace1ce2c0b783.json
new file mode 100644
index 0000000000000000000000000000000000000000..9b4983e35c80749c0f0d3592202d3c798be20918
--- /dev/null
+++ b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen2/Qwen/Qwen2.5-0.5B/300b37dace1ce2c0b783.json
@@ -0,0 +1,82 @@
+{
+  "_entry_class": "SingleModelCacheEntry",
+  "_model_id": "Qwen/Qwen2.5-0.5B",
+  "_task": "text-generation",
+  "architectures": [
+    "Qwen2ForCausalLM"
+  ],
+  "attention_dropout": 0.0,
+  "hidden_act": "silu",
+  "hidden_size": 896,
+  "initializer_range": 0.02,
+  "intermediate_size": 4864,
+  "layer_types": [
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention"
+  ],
+  "max_position_embeddings": 32768,
+  "max_window_layers": 24,
+  "model_type": "qwen2",
+  "neuron": {
+    "_serialized_key": "NxDNeuronConfig",
+    "batch_size": 4,
+    "capacity_factor": null,
+    "checkpoint_id": "Qwen/Qwen2.5-0.5B",
+    "checkpoint_revision": "060db6499f32faf8b98477b0a26969ef7d8b9987",
+    "continuous_batching": true,
+    "enable_bucketing": false,
+    "ep_degree": 1,
+    "fused_qkv": false,
+    "glu_mlp": true,
+    "local_ranks_size": 2,
+    "max_batch_size": 4,
+    "max_context_length": 4096,
+    "max_topk": 256,
+    "n_active_tokens": 4096,
+    "neuronxcc_version": "2.21.18209.0+043b1bf7",
+    "on_device_sampling": false,
+    "optimum_neuron_version": "0.4.0",
+    "output_logits": false,
+    "pp_degree": 1,
+    "sequence_length": 4096,
+    "speculation_length": 0,
+    "start_rank_id": 0,
+    "target": "trn1",
+    "torch_dtype": "bfloat16",
+    "tp_degree": 2
+  },
+  "num_attention_heads": 14,
+  "num_hidden_layers": 24,
+  "num_key_value_heads": 2,
+  "rms_norm_eps": 1e-06,
+  "rope_scaling": null,
+  "rope_theta": 1000000.0,
+  "sliding_window": null,
+  "tie_word_embeddings": true,
+  "use_cache": true,
+  "use_mrope": false,
+  "use_sliding_window": false,
+  "vocab_size": 151936
+}
\ No newline at end of file
diff --git a/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen2/Qwen/Qwen2.5-1.5B/8d982941157412579546.json b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen2/Qwen/Qwen2.5-1.5B/8d982941157412579546.json
new file mode 100644
index 0000000000000000000000000000000000000000..97c5cddc5ec36f40ba4152a7406837a20eb50b7b
--- /dev/null
+++ b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen2/Qwen/Qwen2.5-1.5B/8d982941157412579546.json
@@ -0,0 +1,86 @@
+{
+  "_entry_class": "SingleModelCacheEntry",
+  "_model_id": "Qwen/Qwen2.5-1.5B",
+  "_task": "text-generation",
+  "architectures": [
+    "Qwen2ForCausalLM"
+  ],
+  "attention_dropout": 0.0,
+  "hidden_act": "silu",
+  "hidden_size": 1536,
+  "initializer_range": 0.02,
+  "intermediate_size": 8960,
+  "layer_types": [
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention"
+  ],
+  "max_position_embeddings": 131072,
+  "max_window_layers": 28,
+  "model_type": "qwen2",
+  "neuron": {
+    "_serialized_key": "NxDNeuronConfig",
+    "batch_size": 4,
+    "capacity_factor": null,
+    "checkpoint_id": "Qwen/Qwen2.5-1.5B",
+    "checkpoint_revision": "8faed761d45a263340a0528343f099c05c9a4323",
+    "continuous_batching": true,
+    "enable_bucketing": false,
+    "ep_degree": 1,
+    "fused_qkv": false,
+    "glu_mlp": true,
+    "local_ranks_size": 2,
+    "max_batch_size": 4,
+    "max_context_length": 4096,
+    "max_topk": 256,
+    "n_active_tokens": 4096,
+    "neuronxcc_version": "2.21.18209.0+043b1bf7",
+    "on_device_sampling": false,
+    "optimum_neuron_version": "0.4.0",
+    "output_logits": false,
+    "pp_degree": 1,
+    "sequence_length": 4096,
+    "speculation_length": 0,
+    "start_rank_id": 0,
+    "target": "trn1",
+    "torch_dtype": "bfloat16",
+    "tp_degree": 2
+  },
+  "num_attention_heads": 12,
+  "num_hidden_layers": 28,
+  "num_key_value_heads": 2,
+  "rms_norm_eps": 1e-06,
+  "rope_scaling": null,
+  "rope_theta": 1000000.0,
+  "sliding_window": null,
+  "tie_word_embeddings": true,
+  "use_cache": true,
+  "use_mrope": false,
+  "use_sliding_window": false,
+  "vocab_size": 151936
+}
\ No newline at end of file
diff --git a/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen2/Qwen/Qwen2.5-1.5B/dea81904d370c8b20332.json b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen2/Qwen/Qwen2.5-1.5B/dea81904d370c8b20332.json
new file mode 100644
index 0000000000000000000000000000000000000000..86b25fa5a821fb3fe4d89df99f6417794cced673
--- /dev/null
+++ b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen2/Qwen/Qwen2.5-1.5B/dea81904d370c8b20332.json
@@ -0,0 +1,86 @@
+{
+  "_entry_class": "SingleModelCacheEntry",
+  "_model_id": "Qwen/Qwen2.5-1.5B",
+  "_task": "text-generation",
+  "architectures": [
+    "Qwen2ForCausalLM"
+  ],
+  "attention_dropout": 0.0,
+  "hidden_act": "silu",
+  "hidden_size": 1536,
+  "initializer_range": 0.02,
+  "intermediate_size": 8960,
+  "layer_types": [
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention"
+  ],
+  "max_position_embeddings": 131072,
+  "max_window_layers": 28,
+  "model_type": "qwen2",
+  "neuron": {
+    "_serialized_key": "NxDNeuronConfig",
+    "batch_size": 1,
+    "capacity_factor": null,
+    "checkpoint_id": "Qwen/Qwen2.5-1.5B",
+    "checkpoint_revision": "8faed761d45a263340a0528343f099c05c9a4323",
+    "continuous_batching": false,
+    "enable_bucketing": false,
+    "ep_degree": 1,
+    "fused_qkv": false,
+    "glu_mlp": true,
+    "local_ranks_size": 2,
+    "max_batch_size": 1,
+    "max_context_length": 4096,
+    "max_topk": 256,
+    "n_active_tokens": 4096,
+    "neuronxcc_version": "2.21.18209.0+043b1bf7",
+    "on_device_sampling": true,
+    "optimum_neuron_version": "0.4.0",
+    "output_logits": false,
+    "pp_degree": 1,
+    "sequence_length": 4096,
+    "speculation_length": 0,
+    "start_rank_id": 0,
+    "target": "trn1",
+    "torch_dtype": "bfloat16",
+    "tp_degree": 2
+  },
+  "num_attention_heads": 12,
+  "num_hidden_layers": 28,
+  "num_key_value_heads": 2,
+  "rms_norm_eps": 1e-06,
+  "rope_scaling": null,
+  "rope_theta": 1000000.0,
+  "sliding_window": null,
+  "tie_word_embeddings": true,
+  "use_cache": true,
+  "use_mrope": false,
+  "use_sliding_window": false,
+  "vocab_size": 151936
+}
\ No newline at end of file
diff --git a/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen2/Qwen/Qwen2.5-14B/877be4240e4a459b2a14.json b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen2/Qwen/Qwen2.5-14B/877be4240e4a459b2a14.json
new file mode 100644
index 0000000000000000000000000000000000000000..be792c620aa1294fe9f00a9281699b601af0692b
--- /dev/null
+++ b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen2/Qwen/Qwen2.5-14B/877be4240e4a459b2a14.json
@@ -0,0 +1,105 @@
+{
+  "_entry_class": "SingleModelCacheEntry",
+  "_model_id": "Qwen/Qwen2.5-14B",
+  "_task": "text-generation",
+  "architectures": [
+    "Qwen2ForCausalLM"
+  ],
+  "attention_dropout": 0.0,
+  "hidden_act": "silu",
+  "hidden_size": 5120,
+  "initializer_range": 0.02,
+  "intermediate_size": 13824,
+  "layer_types": [
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention"
+  ],
+  "max_position_embeddings": 131072,
+  "max_window_layers": 48,
+  "model_type": "qwen2",
+  "neuron": {
+    "_serialized_key": "NxDNeuronConfig",
+    "batch_size": 1,
+    "capacity_factor": null,
+    "checkpoint_id": "Qwen/Qwen2.5-14B",
+    "checkpoint_revision": "97e1e76335b7017d8f67c08a19d103c0504298c9",
+    "continuous_batching": false,
+    "enable_bucketing": false,
+    "ep_degree": 1,
+    "fused_qkv": false,
+    "glu_mlp": true,
+    "local_ranks_size": 8,
+    "max_batch_size": 1,
+    "max_context_length": 4096,
+    "max_topk": 256,
+    "n_active_tokens": 4096,
+    "neuronxcc_version": "2.21.18209.0+043b1bf7",
+    "on_device_sampling": true,
+    "optimum_neuron_version": "0.4.0",
+    "output_logits": false,
+    "pp_degree": 1,
+    "sequence_length": 4096,
+    "speculation_length": 0,
+    "start_rank_id": 0,
+    "target": "trn1",
+    "torch_dtype": "bfloat16",
+    "tp_degree": 8
+  },
+  "num_attention_heads": 40,
+  "num_hidden_layers": 48,
+  "num_key_value_heads": 8,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": null,
+  "rope_theta": 1000000.0,
+  "sliding_window": null,
+  "tie_word_embeddings": false,
+  "use_cache": true,
+  "use_sliding_window": false,
+  "vocab_size": 152064
+}
\ No newline at end of file
diff --git a/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen2/Qwen/Qwen2.5-14B/c05ba11ec3a01458a2e6.json b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen2/Qwen/Qwen2.5-14B/c05ba11ec3a01458a2e6.json
new file mode 100644
index 0000000000000000000000000000000000000000..9c580fbced4dd8ce67c708d11da17ea4c042cbb0
--- /dev/null
+++ b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen2/Qwen/Qwen2.5-14B/c05ba11ec3a01458a2e6.json
@@ -0,0 +1,105 @@
+{
+  "_entry_class": "SingleModelCacheEntry",
+  "_model_id": "Qwen/Qwen2.5-14B",
+  "_task": "text-generation",
+  "architectures": [
+    "Qwen2ForCausalLM"
+  ],
+  "attention_dropout": 0.0,
+  "hidden_act": "silu",
+  "hidden_size": 5120,
+  "initializer_range": 0.02,
+  "intermediate_size": 13824,
+  "layer_types": [
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention"
+  ],
+  "max_position_embeddings": 131072,
+  "max_window_layers": 48,
+  "model_type": "qwen2",
+  "neuron": {
+    "_serialized_key": "NxDNeuronConfig",
+    "batch_size": 16,
+    "capacity_factor": null,
+    "checkpoint_id": "Qwen/Qwen2.5-14B",
+    "checkpoint_revision": "97e1e76335b7017d8f67c08a19d103c0504298c9",
+    "continuous_batching": true,
+    "enable_bucketing": false,
+    "ep_degree": 1,
+    "fused_qkv": false,
+    "glu_mlp": true,
+    "local_ranks_size": 8,
+    "max_batch_size": 16,
+    "max_context_length": 4096,
+    "max_topk": 256,
+    "n_active_tokens": 4096,
+    "neuronxcc_version": "2.21.18209.0+043b1bf7",
+    "on_device_sampling": true,
+    "optimum_neuron_version": "0.4.0",
+    "output_logits": false,
+    "pp_degree": 1,
+    "sequence_length": 4096,
+    "speculation_length": 0,
+    "start_rank_id": 0,
+    "target": "trn1",
+    "torch_dtype": "bfloat16",
+    "tp_degree": 8
+  },
+  "num_attention_heads": 40,
+  "num_hidden_layers": 48,
+  "num_key_value_heads": 8,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": null,
+  "rope_theta": 1000000.0,
+  "sliding_window": null,
+  "tie_word_embeddings": false,
+  "use_cache": true,
+  "use_sliding_window": false,
+  "vocab_size": 152064
+}
\ No newline at end of file
diff --git a/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen2/Qwen/Qwen2.5-32B-Instruct/2e5ba8f801dbc7a16c3c.json b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen2/Qwen/Qwen2.5-32B-Instruct/2e5ba8f801dbc7a16c3c.json
new file mode 100644
index 0000000000000000000000000000000000000000..6d1b16452a48a71ca8c1622a6b38fd4a08350e5c
--- /dev/null
+++ b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen2/Qwen/Qwen2.5-32B-Instruct/2e5ba8f801dbc7a16c3c.json
@@ -0,0 +1,121 @@
+{
+  "_entry_class": "SingleModelCacheEntry",
+  "_model_id": "Qwen/Qwen2.5-32B-Instruct",
+  "_task": "text-generation",
+  "architectures": [
+    "Qwen2ForCausalLM"
+  ],
+  "attention_dropout": 0.0,
+  "hidden_act": "silu",
+  "hidden_size": 5120,
+  "initializer_range": 0.02,
+  "intermediate_size": 27648,
+  "layer_types": [
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention"
+  ],
+  "max_position_embeddings": 32768,
+  "max_window_layers": 70,
+  "model_type": "qwen2",
+  "neuron": {
+    "_serialized_key": "NxDNeuronConfig",
+    "batch_size": 8,
+    "capacity_factor": null,
+    "checkpoint_id": "Qwen/Qwen2.5-32B-Instruct",
+    "checkpoint_revision": "5ede1c97bbab6ce5cda5812749b4c0bdf79b18dd",
+    "continuous_batching": true,
+    "enable_bucketing": false,
+    "ep_degree": 1,
+    "fused_qkv": false,
+    "glu_mlp": true,
+    "local_ranks_size": 8,
+    "max_batch_size": 8,
+    "max_context_length": 4096,
+    "max_topk": 256,
+    "n_active_tokens": 4096,
+    "neuronxcc_version": "2.21.18209.0+043b1bf7",
+    "on_device_sampling": true,
+    "optimum_neuron_version": "0.4.0",
+    "output_logits": false,
+    "pp_degree": 1,
+    "sequence_length": 4096,
+    "speculation_length": 0,
+    "start_rank_id": 0,
+    "target": "trn1",
+    "torch_dtype": "bfloat16",
+    "tp_degree": 8
+  },
+  "num_attention_heads": 40,
+  "num_hidden_layers": 64,
+  "num_key_value_heads": 8,
+  "rms_norm_eps": 1e-06,
+  "rope_scaling": null,
+  "rope_theta": 1000000.0,
+  "sliding_window": null,
+  "tie_word_embeddings": false,
+  "use_cache": true,
+  "use_sliding_window": false,
+  "vocab_size": 152064
+}
\ No newline at end of file
diff --git a/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen2/Qwen/Qwen2.5-32B-Instruct/7182911a8d43e7187430.json b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen2/Qwen/Qwen2.5-32B-Instruct/7182911a8d43e7187430.json
new file mode 100644
index 0000000000000000000000000000000000000000..dc04b2f53f65275df716bbc8ed82e15662589be3
--- /dev/null
+++ b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen2/Qwen/Qwen2.5-32B-Instruct/7182911a8d43e7187430.json
@@ -0,0 +1,121 @@
+{
+  "_entry_class": "SingleModelCacheEntry",
+  "_model_id": "Qwen/Qwen2.5-32B-Instruct",
+  "_task": "text-generation",
+  "architectures": [
+    "Qwen2ForCausalLM"
+  ],
+  "attention_dropout": 0.0,
+  "hidden_act": "silu",
+  "hidden_size": 5120,
+  "initializer_range": 0.02,
+  "intermediate_size": 27648,
+  "layer_types": [
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention"
+  ],
+  "max_position_embeddings": 32768,
+  "max_window_layers": 70,
+  "model_type": "qwen2",
+  "neuron": {
+    "_serialized_key": "NxDNeuronConfig",
+    "batch_size": 1,
+    "capacity_factor": null,
+    "checkpoint_id": "Qwen/Qwen2.5-32B-Instruct",
+    "checkpoint_revision": "5ede1c97bbab6ce5cda5812749b4c0bdf79b18dd",
+    "continuous_batching": false,
+    "enable_bucketing": false,
+    "ep_degree": 1,
+    "fused_qkv": false,
+    "glu_mlp": true,
+    "local_ranks_size": 8,
+    "max_batch_size": 1,
+    "max_context_length": 4096,
+    "max_topk": 256,
+    "n_active_tokens": 4096,
+    "neuronxcc_version": "2.21.18209.0+043b1bf7",
+    "on_device_sampling": true,
+    "optimum_neuron_version": "0.4.0",
+    "output_logits": false,
+    "pp_degree": 1,
+    "sequence_length": 4096,
+    "speculation_length": 0,
+    "start_rank_id": 0,
+    "target": "trn1",
+    "torch_dtype": "bfloat16",
+    "tp_degree": 8
+  },
+  "num_attention_heads": 40,
+  "num_hidden_layers": 64,
+  "num_key_value_heads": 8,
+  "rms_norm_eps": 1e-06,
+  "rope_scaling": null,
+  "rope_theta": 1000000.0,
+  "sliding_window": null,
+  "tie_word_embeddings": false,
+  "use_cache": true,
+  "use_sliding_window": false,
+  "vocab_size": 152064
+}
\ No newline at end of file
diff --git a/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen2/Qwen/Qwen2.5-72B-Instruct/f3b6f76004dc3d143c7e.json b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen2/Qwen/Qwen2.5-72B-Instruct/f3b6f76004dc3d143c7e.json
new file mode 100644
index 0000000000000000000000000000000000000000..091bf6e2289fa581cabed5bde5cae9a97f16bfde
--- /dev/null
+++ b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen2/Qwen/Qwen2.5-72B-Instruct/f3b6f76004dc3d143c7e.json
@@ -0,0 +1,137 @@
+{
+  "_entry_class": "SingleModelCacheEntry",
+  "_model_id": "Qwen/Qwen2.5-72B-Instruct",
+  "_task": "text-generation",
+  "architectures": [
+    "Qwen2ForCausalLM"
+  ],
+  "attention_dropout": 0.0,
+  "hidden_act": "silu",
+  "hidden_size": 8192,
+  "initializer_range": 0.02,
+  "intermediate_size": 29568,
+  "layer_types": [
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention"
+  ],
+  "max_position_embeddings": 32768,
+  "max_window_layers": 70,
+  "model_type": "qwen2",
+  "neuron": {
+    "_serialized_key": "NxDNeuronConfig",
+    "batch_size": 4,
+    "capacity_factor": null,
+    "checkpoint_id": "Qwen/Qwen2.5-72B-Instruct",
+    "checkpoint_revision": "495f39366efef23836d0cfae4fbe635880d2be31",
+    "continuous_batching": true,
+    "enable_bucketing": false,
+    "ep_degree": 1,
+    "fused_qkv": false,
+    "glu_mlp": true,
+    "local_ranks_size": 24,
+    "max_batch_size": 4,
+    "max_context_length": 4096,
+    "max_topk": 256,
+    "n_active_tokens": 4096,
+    "neuronxcc_version": "2.21.18209.0+043b1bf7",
+    "on_device_sampling": true,
+    "optimum_neuron_version": "0.4.0",
+    "output_logits": false,
+    "pp_degree": 1,
+    "sequence_length": 4096,
+    "speculation_length": 0,
+    "start_rank_id": 0,
+    "target": "trn1",
+    "torch_dtype": "bfloat16",
+    "tp_degree": 24
+  },
+  "num_attention_heads": 64,
+  "num_hidden_layers": 80,
+  "num_key_value_heads": 8,
+  "rms_norm_eps": 1e-06,
+  "rope_scaling": null,
+  "rope_theta": 1000000.0,
+  "sliding_window": null,
+  "tie_word_embeddings": false,
+  "use_cache": true,
+  "use_sliding_window": false,
+  "vocab_size": 152064
+}
\ No newline at end of file
diff --git a/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen2/Qwen/Qwen2.5-7B-Instruct/2ffd57bb17f3a35919c6.json b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen2/Qwen/Qwen2.5-7B-Instruct/2ffd57bb17f3a35919c6.json
new file mode 100644
index 0000000000000000000000000000000000000000..e6c12b99e4fdedbc8246497d044a61d3d1a46561
--- /dev/null
+++ b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen2/Qwen/Qwen2.5-7B-Instruct/2ffd57bb17f3a35919c6.json
@@ -0,0 +1,85 @@
+{
+  "_entry_class": "SingleModelCacheEntry",
+  "_model_id": "Qwen/Qwen2.5-7B-Instruct",
+  "_task": "text-generation",
+  "architectures": [
+    "Qwen2ForCausalLM"
+  ],
+  "attention_dropout": 0.0,
+  "hidden_act": "silu",
+  "hidden_size": 3584,
+  "initializer_range": 0.02,
+  "intermediate_size": 18944,
+  "layer_types": [
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention"
+  ],
+  "max_position_embeddings": 32768,
+  "max_window_layers": 28,
+  "model_type": "qwen2",
+  "neuron": {
+    "_serialized_key": "NxDNeuronConfig",
+    "batch_size": 1,
+    "capacity_factor": null,
+    "checkpoint_id": "Qwen/Qwen2.5-7B-Instruct",
+    "checkpoint_revision": "a09a35458c702b33eeacc393d103063234e8bc28",
+    "continuous_batching": false,
+    "enable_bucketing": false,
+    "ep_degree": 1,
+    "fused_qkv": false,
+    "glu_mlp": true,
+    "local_ranks_size": 8,
+    "max_batch_size": 1,
+    "max_context_length": 4096,
+    "max_topk": 256,
+    "n_active_tokens": 4096,
+    "neuronxcc_version": "2.21.18209.0+043b1bf7",
+    "on_device_sampling": true,
+    "optimum_neuron_version": "0.4.0",
+    "output_logits": false,
+    "pp_degree": 1,
+    "sequence_length": 4096,
+    "speculation_length": 0,
+    "start_rank_id": 0,
+    "target": "trn1",
+    "torch_dtype": "bfloat16",
+    "tp_degree": 8
+  },
+  "num_attention_heads": 28,
+  "num_hidden_layers": 28,
+  "num_key_value_heads": 4,
+  "rms_norm_eps": 1e-06,
+  "rope_scaling": null,
+  "rope_theta": 1000000.0,
+  "sliding_window": null,
+  "tie_word_embeddings": false,
+  "use_cache": true,
+  "use_sliding_window": false,
+  "vocab_size": 152064
+}
\ No newline at end of file
diff --git a/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen2/Qwen/Qwen2.5-7B-Instruct/85aeb3e82bb9189fa256.json b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen2/Qwen/Qwen2.5-7B-Instruct/85aeb3e82bb9189fa256.json
new file mode 100644
index 0000000000000000000000000000000000000000..23996a547081dc3dce119f07724068645250b542
--- /dev/null
+++ b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen2/Qwen/Qwen2.5-7B-Instruct/85aeb3e82bb9189fa256.json
@@ -0,0 +1,85 @@
+{
+  "_entry_class": "SingleModelCacheEntry",
+  "_model_id": "Qwen/Qwen2.5-7B-Instruct",
+  "_task": "text-generation",
+  "architectures": [
+    "Qwen2ForCausalLM"
+  ],
+  "attention_dropout": 0.0,
+  "hidden_act": "silu",
+  "hidden_size": 3584,
+  "initializer_range": 0.02,
+  "intermediate_size": 18944,
+  "layer_types": [
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention"
+  ],
+  "max_position_embeddings": 32768,
+  "max_window_layers": 28,
+  "model_type": "qwen2",
+  "neuron": {
+    "_serialized_key": "NxDNeuronConfig",
+    "batch_size": 8,
+    "capacity_factor": null,
+    "checkpoint_id": "Qwen/Qwen2.5-7B-Instruct",
+    "checkpoint_revision": "a09a35458c702b33eeacc393d103063234e8bc28",
+    "continuous_batching": true,
+    "enable_bucketing": false,
+    "ep_degree": 1,
+    "fused_qkv": false,
+    "glu_mlp": true,
+    "local_ranks_size": 2,
+    "max_batch_size": 8,
+    "max_context_length": 4096,
+    "max_topk": 256,
+    "n_active_tokens": 4096,
+    "neuronxcc_version": "2.21.18209.0+043b1bf7",
+    "on_device_sampling": false,
+    "optimum_neuron_version": "0.4.0",
+    "output_logits": false,
+    "pp_degree": 1,
+    "sequence_length": 4096,
+    "speculation_length": 0,
+    "start_rank_id": 0,
+    "target": "trn1",
+    "torch_dtype": "bfloat16",
+    "tp_degree": 2
+  },
+  "num_attention_heads": 28,
+  "num_hidden_layers": 28,
+  "num_key_value_heads": 4,
+  "rms_norm_eps": 1e-06,
+  "rope_scaling": null,
+  "rope_theta": 1000000.0,
+  "sliding_window": null,
+  "tie_word_embeddings": false,
+  "use_cache": true,
+  "use_sliding_window": false,
+  "vocab_size": 152064
+}
\ No newline at end of file
diff --git a/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen2/Qwen/Qwen2.5-7B-Instruct/b256ce0e46280fedadb4.json b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen2/Qwen/Qwen2.5-7B-Instruct/b256ce0e46280fedadb4.json
new file mode 100644
index 0000000000000000000000000000000000000000..fe2a29cda7f0d0a7f81014238bfd5ee20bc2de6f
--- /dev/null
+++ b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen2/Qwen/Qwen2.5-7B-Instruct/b256ce0e46280fedadb4.json
@@ -0,0 +1,85 @@
+{
+  "_entry_class": "SingleModelCacheEntry",
+  "_model_id": "Qwen/Qwen2.5-7B-Instruct",
+  "_task": "text-generation",
+  "architectures": [
+    "Qwen2ForCausalLM"
+  ],
+  "attention_dropout": 0.0,
+  "hidden_act": "silu",
+  "hidden_size": 3584,
+  "initializer_range": 0.02,
+  "intermediate_size": 18944,
+  "layer_types": [
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention"
+  ],
+  "max_position_embeddings": 32768,
+  "max_window_layers": 28,
+  "model_type": "qwen2",
+  "neuron": {
+    "_serialized_key": "NxDNeuronConfig",
+    "batch_size": 1,
+    "capacity_factor": null,
+    "checkpoint_id": "Qwen/Qwen2.5-7B-Instruct",
+    "checkpoint_revision": "a09a35458c702b33eeacc393d103063234e8bc28",
+    "continuous_batching": false,
+    "enable_bucketing": false,
+    "ep_degree": 1,
+    "fused_qkv": false,
+    "glu_mlp": true,
+    "local_ranks_size": 2,
+    "max_batch_size": 1,
+    "max_context_length": 4096,
+    "max_topk": 256,
+    "n_active_tokens": 4096,
+    "neuronxcc_version": "2.21.18209.0+043b1bf7",
+    "on_device_sampling": true,
+    "optimum_neuron_version": "0.4.0",
+    "output_logits": false,
+    "pp_degree": 1,
+    "sequence_length": 4096,
+    "speculation_length": 0,
+    "start_rank_id": 0,
+    "target": "trn1",
+    "torch_dtype": "bfloat16",
+    "tp_degree": 2
+  },
+  "num_attention_heads": 28,
+  "num_hidden_layers": 28,
+  "num_key_value_heads": 4,
+  "rms_norm_eps": 1e-06,
+  "rope_scaling": null,
+  "rope_theta": 1000000.0,
+  "sliding_window": null,
+  "tie_word_embeddings": false,
+  "use_cache": true,
+  "use_sliding_window": false,
+  "vocab_size": 152064
+}
\ No newline at end of file
diff --git a/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen2/Qwen/Qwen2.5-7B-Instruct/bbe60ad043d0675f6bd9.json b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen2/Qwen/Qwen2.5-7B-Instruct/bbe60ad043d0675f6bd9.json
new file mode 100644
index 0000000000000000000000000000000000000000..1b6d8048472985bacb9d5622fb4658ae2482467d
--- /dev/null
+++ b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen2/Qwen/Qwen2.5-7B-Instruct/bbe60ad043d0675f6bd9.json
@@ -0,0 +1,85 @@
+{
+  "_entry_class": "SingleModelCacheEntry",
+  "_model_id": "Qwen/Qwen2.5-7B-Instruct",
+  "_task": "text-generation",
+  "architectures": [
+    "Qwen2ForCausalLM"
+  ],
+  "attention_dropout": 0.0,
+  "hidden_act": "silu",
+  "hidden_size": 3584,
+  "initializer_range": 0.02,
+  "intermediate_size": 18944,
+  "layer_types": [
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention"
+  ],
+  "max_position_embeddings": 32768,
+  "max_window_layers": 28,
+  "model_type": "qwen2",
+  "neuron": {
+    "_serialized_key": "NxDNeuronConfig",
+    "batch_size": 32,
+    "capacity_factor": null,
+    "checkpoint_id": "Qwen/Qwen2.5-7B-Instruct",
+    "checkpoint_revision": "a09a35458c702b33eeacc393d103063234e8bc28",
+    "continuous_batching": true,
+    "enable_bucketing": false,
+    "ep_degree": 1,
+    "fused_qkv": false,
+    "glu_mlp": true,
+    "local_ranks_size": 8,
+    "max_batch_size": 32,
+    "max_context_length": 4096,
+    "max_topk": 256,
+    "n_active_tokens": 4096,
+    "neuronxcc_version": "2.21.18209.0+043b1bf7",
+    "on_device_sampling": true,
+    "optimum_neuron_version": "0.4.0",
+    "output_logits": false,
+    "pp_degree": 1,
+    "sequence_length": 4096,
+    "speculation_length": 0,
+    "start_rank_id": 0,
+    "target": "trn1",
+    "torch_dtype": "bfloat16",
+    "tp_degree": 8
+  },
+  "num_attention_heads": 28,
+  "num_hidden_layers": 28,
+  "num_key_value_heads": 4,
+  "rms_norm_eps": 1e-06,
+  "rope_scaling": null,
+  "rope_theta": 1000000.0,
+  "sliding_window": null,
+  "tie_word_embeddings": false,
+  "use_cache": true,
+  "use_sliding_window": false,
+  "vocab_size": 152064
+}
\ No newline at end of file
diff --git a/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen2/Qwen/Qwen2.5-7B-Instruct/f5f95ccf7b688a5d5891.json b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen2/Qwen/Qwen2.5-7B-Instruct/f5f95ccf7b688a5d5891.json
new file mode 100644
index 0000000000000000000000000000000000000000..331782ef0836c96c86b1e7b696af0cbd12f7c690
--- /dev/null
+++ b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen2/Qwen/Qwen2.5-7B-Instruct/f5f95ccf7b688a5d5891.json
@@ -0,0 +1,85 @@
+{
+  "_entry_class": "SingleModelCacheEntry",
+  "_model_id": "Qwen/Qwen2.5-7B-Instruct",
+  "_task": "text-generation",
+  "architectures": [
+    "Qwen2ForCausalLM"
+  ],
+  "attention_dropout": 0.0,
+  "hidden_act": "silu",
+  "hidden_size": 3584,
+  "initializer_range": 0.02,
+  "intermediate_size": 18944,
+  "layer_types": [
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention"
+  ],
+  "max_position_embeddings": 32768,
+  "max_window_layers": 28,
+  "model_type": "qwen2",
+  "neuron": {
+    "_serialized_key": "NxDNeuronConfig",
+    "batch_size": 4,
+    "capacity_factor": null,
+    "checkpoint_id": "Qwen/Qwen2.5-7B-Instruct",
+    "checkpoint_revision": "a09a35458c702b33eeacc393d103063234e8bc28",
+    "continuous_batching": true,
+    "enable_bucketing": false,
+    "ep_degree": 1,
+    "fused_qkv": false,
+    "glu_mlp": true,
+    "local_ranks_size": 2,
+    "max_batch_size": 4,
+    "max_context_length": 4096,
+    "max_topk": 256,
+    "n_active_tokens": 4096,
+    "neuronxcc_version": "2.21.18209.0+043b1bf7",
+    "on_device_sampling": false,
+    "optimum_neuron_version": "0.4.0",
+    "output_logits": false,
+    "pp_degree": 1,
+    "sequence_length": 4096,
+    "speculation_length": 0,
+    "start_rank_id": 0,
+    "target": "trn1",
+    "torch_dtype": "bfloat16",
+    "tp_degree": 2
+  },
+  "num_attention_heads": 28,
+  "num_hidden_layers": 28,
+  "num_key_value_heads": 4,
+  "rms_norm_eps": 1e-06,
+  "rope_scaling": null,
+  "rope_theta": 1000000.0,
+  "sliding_window": null,
+  "tie_word_embeddings": false,
+  "use_cache": true,
+  "use_sliding_window": false,
+  "vocab_size": 152064
+}
\ No newline at end of file
diff --git a/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen2/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B/620024a33245674faae6.json b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen2/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B/620024a33245674faae6.json
new file mode 100644
index 0000000000000000000000000000000000000000..7798b445b14170988cd5e13bda886c9a184355bb
--- /dev/null
+++ b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen2/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B/620024a33245674faae6.json
@@ -0,0 +1,86 @@
+{
+  "_entry_class": "SingleModelCacheEntry",
+  "_model_id": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
+  "_task": "text-generation",
+  "architectures": [
+    "Qwen2ForCausalLM"
+  ],
+  "attention_dropout": 0.0,
+  "hidden_act": "silu",
+  "hidden_size": 1536,
+  "initializer_range": 0.02,
+  "intermediate_size": 8960,
+  "layer_types": [
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention"
+  ],
+  "max_position_embeddings": 131072,
+  "max_window_layers": 21,
+  "model_type": "qwen2",
+  "neuron": {
+    "_serialized_key": "NxDNeuronConfig",
+    "batch_size": 1,
+    "capacity_factor": null,
+    "checkpoint_id": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
+    "checkpoint_revision": "ad9f0ae0864d7fbcd1cd905e3c6c5b069cc8b562",
+    "continuous_batching": false,
+    "enable_bucketing": false,
+    "ep_degree": 1,
+    "fused_qkv": false,
+    "glu_mlp": true,
+    "local_ranks_size": 2,
+    "max_batch_size": 1,
+    "max_context_length": 4096,
+    "max_topk": 256,
+    "n_active_tokens": 4096,
+    "neuronxcc_version": "2.21.18209.0+043b1bf7",
+    "on_device_sampling": true,
+    "optimum_neuron_version": "0.4.0",
+    "output_logits": false,
+    "pp_degree": 1,
+    "sequence_length": 4096,
+    "speculation_length": 0,
+    "start_rank_id": 0,
+    "target": "trn1",
+    "torch_dtype": "bfloat16",
+    "tp_degree": 2
+  },
+  "num_attention_heads": 12,
+  "num_hidden_layers": 28,
+  "num_key_value_heads": 2,
+  "rms_norm_eps": 1e-06,
+  "rope_scaling": null,
+  "rope_theta": 10000,
+  "sliding_window": null,
+  "tie_word_embeddings": false,
+  "use_cache": true,
+  "use_mrope": false,
+  "use_sliding_window": false,
+  "vocab_size": 151936
+}
\ No newline at end of file
diff --git a/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen2/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B/e4a7cd4f6d3a0a6376e6.json b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen2/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B/e4a7cd4f6d3a0a6376e6.json
new file mode 100644
index 0000000000000000000000000000000000000000..45077bc825448945035e8f29be351cc07baa3c93
--- /dev/null
+++ b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen2/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B/e4a7cd4f6d3a0a6376e6.json
@@ -0,0 +1,86 @@
+{
+  "_entry_class": "SingleModelCacheEntry",
+  "_model_id": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
+  "_task": "text-generation",
+  "architectures": [
+    "Qwen2ForCausalLM"
+  ],
+  "attention_dropout": 0.0,
+  "hidden_act": "silu",
+  "hidden_size": 1536,
+  "initializer_range": 0.02,
+  "intermediate_size": 8960,
+  "layer_types": [
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention"
+  ],
+  "max_position_embeddings": 131072,
+  "max_window_layers": 21,
+  "model_type": "qwen2",
+  "neuron": {
+    "_serialized_key": "NxDNeuronConfig",
+    "batch_size": 4,
+    "capacity_factor": null,
+    "checkpoint_id": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
+    "checkpoint_revision": "ad9f0ae0864d7fbcd1cd905e3c6c5b069cc8b562",
+    "continuous_batching": true,
+    "enable_bucketing": false,
+    "ep_degree": 1,
+    "fused_qkv": false,
+    "glu_mlp": true,
+    "local_ranks_size": 2,
+    "max_batch_size": 4,
+    "max_context_length": 4096,
+    "max_topk": 256,
+    "n_active_tokens": 4096,
+    "neuronxcc_version": "2.21.18209.0+043b1bf7",
+    "on_device_sampling": false,
+    "optimum_neuron_version": "0.4.0",
+    "output_logits": false,
+    "pp_degree": 1,
+    "sequence_length": 4096,
+    "speculation_length": 0,
+    "start_rank_id": 0,
+    "target": "trn1",
+    "torch_dtype": "bfloat16",
+    "tp_degree": 2
+  },
+  "num_attention_heads": 12,
+  "num_hidden_layers": 28,
+  "num_key_value_heads": 2,
+  "rms_norm_eps": 1e-06,
+  "rope_scaling": null,
+  "rope_theta": 10000,
+  "sliding_window": null,
+  "tie_word_embeddings": false,
+  "use_cache": true,
+  "use_mrope": false,
+  "use_sliding_window": false,
+  "vocab_size": 151936
+}
\ No newline at end of file
diff --git a/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen2/deepseek-ai/DeepSeek-R1-Distill-Qwen-14B/103e1d0b4a1ea69c88de.json b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen2/deepseek-ai/DeepSeek-R1-Distill-Qwen-14B/103e1d0b4a1ea69c88de.json
new file mode 100644
index 0000000000000000000000000000000000000000..3f6cd47f9f54410b307e57c352c2f0b4dc089c66
--- /dev/null
+++ b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen2/deepseek-ai/DeepSeek-R1-Distill-Qwen-14B/103e1d0b4a1ea69c88de.json
@@ -0,0 +1,105 @@
+{
+  "_entry_class": "SingleModelCacheEntry",
+  "_model_id": "deepseek-ai/DeepSeek-R1-Distill-Qwen-14B",
+  "_task": "text-generation",
+  "architectures": [
+    "Qwen2ForCausalLM"
+  ],
+  "attention_dropout": 0.0,
+  "hidden_act": "silu",
+  "hidden_size": 5120,
+  "initializer_range": 0.02,
+  "intermediate_size": 13824,
+  "layer_types": [
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention"
+  ],
+  "max_position_embeddings": 131072,
+  "max_window_layers": 48,
+  "model_type": "qwen2",
+  "neuron": {
+    "_serialized_key": "NxDNeuronConfig",
+    "batch_size": 1,
+    "capacity_factor": null,
+    "checkpoint_id": "deepseek-ai/DeepSeek-R1-Distill-Qwen-14B",
+    "checkpoint_revision": "1df8507178afcc1bef68cd8c393f61a886323761",
+    "continuous_batching": false,
+    "enable_bucketing": false,
+    "ep_degree": 1,
+    "fused_qkv": false,
+    "glu_mlp": true,
+    "local_ranks_size": 8,
+    "max_batch_size": 1,
+    "max_context_length": 4096,
+    "max_topk": 256,
+    "n_active_tokens": 4096,
+    "neuronxcc_version": "2.21.18209.0+043b1bf7",
+    "on_device_sampling": true,
+    "optimum_neuron_version": "0.4.0",
+    "output_logits": false,
+    "pp_degree": 1,
+    "sequence_length": 4096,
+    "speculation_length": 0,
+    "start_rank_id": 0,
+    "target": "trn1",
+    "torch_dtype": "bfloat16",
+    "tp_degree": 8
+  },
+  "num_attention_heads": 40,
+  "num_hidden_layers": 48,
+  "num_key_value_heads": 8,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": null,
+  "rope_theta": 1000000.0,
+  "sliding_window": null,
+  "tie_word_embeddings": false,
+  "use_cache": true,
+  "use_sliding_window": false,
+  "vocab_size": 152064
+}
\ No newline at end of file
diff --git a/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen2/deepseek-ai/DeepSeek-R1-Distill-Qwen-14B/117c3927e485f748b0b3.json b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen2/deepseek-ai/DeepSeek-R1-Distill-Qwen-14B/117c3927e485f748b0b3.json
new file mode 100644
index 0000000000000000000000000000000000000000..06dac799ff8fc66c5dc9f6f4ceadb2a9a4938fa8
--- /dev/null
+++ b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen2/deepseek-ai/DeepSeek-R1-Distill-Qwen-14B/117c3927e485f748b0b3.json
@@ -0,0 +1,105 @@
+{
+  "_entry_class": "SingleModelCacheEntry",
+  "_model_id": "deepseek-ai/DeepSeek-R1-Distill-Qwen-14B",
+  "_task": "text-generation",
+  "architectures": [
+    "Qwen2ForCausalLM"
+  ],
+  "attention_dropout": 0.0,
+  "hidden_act": "silu",
+  "hidden_size": 5120,
+  "initializer_range": 0.02,
+  "intermediate_size": 13824,
+  "layer_types": [
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention"
+  ],
+  "max_position_embeddings": 131072,
+  "max_window_layers": 48,
+  "model_type": "qwen2",
+  "neuron": {
+    "_serialized_key": "NxDNeuronConfig",
+    "batch_size": 16,
+    "capacity_factor": null,
+    "checkpoint_id": "deepseek-ai/DeepSeek-R1-Distill-Qwen-14B",
+    "checkpoint_revision": "1df8507178afcc1bef68cd8c393f61a886323761",
+    "continuous_batching": true,
+    "enable_bucketing": false,
+    "ep_degree": 1,
+    "fused_qkv": false,
+    "glu_mlp": true,
+    "local_ranks_size": 8,
+    "max_batch_size": 16,
+    "max_context_length": 4096,
+    "max_topk": 256,
+    "n_active_tokens": 4096,
+    "neuronxcc_version": "2.21.18209.0+043b1bf7",
+    "on_device_sampling": true,
+    "optimum_neuron_version": "0.4.0",
+    "output_logits": false,
+    "pp_degree": 1,
+    "sequence_length": 4096,
+    "speculation_length": 0,
+    "start_rank_id": 0,
+    "target": "trn1",
+    "torch_dtype": "bfloat16",
+    "tp_degree": 8
+  },
+  "num_attention_heads": 40,
+  "num_hidden_layers": 48,
+  "num_key_value_heads": 8,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": null,
+  "rope_theta": 1000000.0,
+  "sliding_window": null,
+  "tie_word_embeddings": false,
+  "use_cache": true,
+  "use_sliding_window": false,
+  "vocab_size": 152064
+}
\ No newline at end of file
diff --git a/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen2/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B/6e1e7be6896b12bfe647.json b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen2/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B/6e1e7be6896b12bfe647.json
new file mode 100644
index 0000000000000000000000000000000000000000..ad946f5f488a1d6c161b785bb177ad86b559774b
--- /dev/null
+++ b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen2/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B/6e1e7be6896b12bfe647.json
@@ -0,0 +1,121 @@
+{
+  "_entry_class": "SingleModelCacheEntry",
+  "_model_id": "deepseek-ai/DeepSeek-R1-Distill-Qwen-32B",
+  "_task": "text-generation",
+  "architectures": [
+    "Qwen2ForCausalLM"
+  ],
+  "attention_dropout": 0.0,
+  "hidden_act": "silu",
+  "hidden_size": 5120,
+  "initializer_range": 0.02,
+  "intermediate_size": 27648,
+  "layer_types": [
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention"
+  ],
+  "max_position_embeddings": 131072,
+  "max_window_layers": 64,
+  "model_type": "qwen2",
+  "neuron": {
+    "_serialized_key": "NxDNeuronConfig",
+    "batch_size": 1,
+    "capacity_factor": null,
+    "checkpoint_id": "deepseek-ai/DeepSeek-R1-Distill-Qwen-32B",
+    "checkpoint_revision": "711ad2ea6aa40cfca18895e8aca02ab92df1a746",
+    "continuous_batching": false,
+    "enable_bucketing": false,
+    "ep_degree": 1,
+    "fused_qkv": false,
+    "glu_mlp": true,
+    "local_ranks_size": 8,
+    "max_batch_size": 1,
+    "max_context_length": 4096,
+    "max_topk": 256,
+    "n_active_tokens": 4096,
+    "neuronxcc_version": "2.21.18209.0+043b1bf7",
+    "on_device_sampling": true,
+    "optimum_neuron_version": "0.4.0",
+    "output_logits": false,
+    "pp_degree": 1,
+    "sequence_length": 4096,
+    "speculation_length": 0,
+    "start_rank_id": 0,
+    "target": "trn1",
+    "torch_dtype": "bfloat16",
+    "tp_degree": 8
+  },
+  "num_attention_heads": 40,
+  "num_hidden_layers": 64,
+  "num_key_value_heads": 8,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": null,
+  "rope_theta": 1000000.0,
+  "sliding_window": null,
+  "tie_word_embeddings": false,
+  "use_cache": true,
+  "use_sliding_window": false,
+  "vocab_size": 152064
+}
\ No newline at end of file
diff --git a/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen2/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B/92ac2670e24578b76628.json b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen2/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B/92ac2670e24578b76628.json
new file mode 100644
index 0000000000000000000000000000000000000000..7aaf0529953055f78884e94eba951ea5c3b704b9
--- /dev/null
+++ b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen2/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B/92ac2670e24578b76628.json
@@ -0,0 +1,121 @@
+{
+  "_entry_class": "SingleModelCacheEntry",
+  "_model_id": "deepseek-ai/DeepSeek-R1-Distill-Qwen-32B",
+  "_task": "text-generation",
+  "architectures": [
+    "Qwen2ForCausalLM"
+  ],
+  "attention_dropout": 0.0,
+  "hidden_act": "silu",
+  "hidden_size": 5120,
+  "initializer_range": 0.02,
+  "intermediate_size": 27648,
+  "layer_types": [
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention"
+  ],
+  "max_position_embeddings": 131072,
+  "max_window_layers": 64,
+  "model_type": "qwen2",
+  "neuron": {
+    "_serialized_key": "NxDNeuronConfig",
+    "batch_size": 8,
+    "capacity_factor": null,
+    "checkpoint_id": "deepseek-ai/DeepSeek-R1-Distill-Qwen-32B",
+    "checkpoint_revision": "711ad2ea6aa40cfca18895e8aca02ab92df1a746",
+    "continuous_batching": true,
+    "enable_bucketing": false,
+    "ep_degree": 1,
+    "fused_qkv": false,
+    "glu_mlp": true,
+    "local_ranks_size": 8,
+    "max_batch_size": 8,
+    "max_context_length": 4096,
+    "max_topk": 256,
+    "n_active_tokens": 4096,
+    "neuronxcc_version": "2.21.18209.0+043b1bf7",
+    "on_device_sampling": true,
+    "optimum_neuron_version": "0.4.0",
+    "output_logits": false,
+    "pp_degree": 1,
+    "sequence_length": 4096,
+    "speculation_length": 0,
+    "start_rank_id": 0,
+    "target": "trn1",
+    "torch_dtype": "bfloat16",
+    "tp_degree": 8
+  },
+  "num_attention_heads": 40,
+  "num_hidden_layers": 64,
+  "num_key_value_heads": 8,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": null,
+  "rope_theta": 1000000.0,
+  "sliding_window": null,
+  "tie_word_embeddings": false,
+  "use_cache": true,
+  "use_sliding_window": false,
+  "vocab_size": 152064
+}
\ No newline at end of file
diff --git a/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen2/deepseek-ai/DeepSeek-R1-Distill-Qwen-7B/4fc34a0fe0b2ddb70eeb.json b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen2/deepseek-ai/DeepSeek-R1-Distill-Qwen-7B/4fc34a0fe0b2ddb70eeb.json
new file mode 100644
index 0000000000000000000000000000000000000000..0dcc90ba297937bf1237a4e57c9937219a7f58df
--- /dev/null
+++ b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen2/deepseek-ai/DeepSeek-R1-Distill-Qwen-7B/4fc34a0fe0b2ddb70eeb.json
@@ -0,0 +1,86 @@
+{
+  "_entry_class": "SingleModelCacheEntry",
+  "_model_id": "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B",
+  "_task": "text-generation",
+  "architectures": [
+    "Qwen2ForCausalLM"
+  ],
+  "attention_dropout": 0.0,
+  "hidden_act": "silu",
+  "hidden_size": 3584,
+  "initializer_range": 0.02,
+  "intermediate_size": 18944,
+  "layer_types": [
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention"
+  ],
+  "max_position_embeddings": 131072,
+  "max_window_layers": 28,
+  "model_type": "qwen2",
+  "neuron": {
+    "_serialized_key": "NxDNeuronConfig",
+    "batch_size": 1,
+    "capacity_factor": null,
+    "checkpoint_id": "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B",
+    "checkpoint_revision": "916b56a44061fd5cd7d6a8fb632557ed4f724f60",
+    "continuous_batching": false,
+    "enable_bucketing": false,
+    "ep_degree": 1,
+    "fused_qkv": false,
+    "glu_mlp": true,
+    "local_ranks_size": 2,
+    "max_batch_size": 1,
+    "max_context_length": 4096,
+    "max_topk": 256,
+    "n_active_tokens": 4096,
+    "neuronxcc_version": "2.21.18209.0+043b1bf7",
+    "on_device_sampling": true,
+    "optimum_neuron_version": "0.4.0",
+    "output_logits": false,
+    "pp_degree": 1,
+    "sequence_length": 4096,
+    "speculation_length": 0,
+    "start_rank_id": 0,
+    "target": "trn1",
+    "torch_dtype": "bfloat16",
+    "tp_degree": 2
+  },
+  "num_attention_heads": 28,
+  "num_hidden_layers": 28,
+  "num_key_value_heads": 4,
+  "rms_norm_eps": 1e-06,
+  "rope_scaling": null,
+  "rope_theta": 10000,
+  "sliding_window": null,
+  "tie_word_embeddings": false,
+  "use_cache": true,
+  "use_mrope": false,
+  "use_sliding_window": false,
+  "vocab_size": 152064
+}
\ No newline at end of file
diff --git a/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen2/deepseek-ai/DeepSeek-R1-Distill-Qwen-7B/6da6a64ceddfb46291c2.json b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen2/deepseek-ai/DeepSeek-R1-Distill-Qwen-7B/6da6a64ceddfb46291c2.json
new file mode 100644
index 0000000000000000000000000000000000000000..c7d3d1fb1ee5284808a496beccf6c3ae992a0f59
--- /dev/null
+++ b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen2/deepseek-ai/DeepSeek-R1-Distill-Qwen-7B/6da6a64ceddfb46291c2.json
@@ -0,0 +1,86 @@
+{
+  "_entry_class": "SingleModelCacheEntry",
+  "_model_id": "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B",
+  "_task": "text-generation",
+  "architectures": [
+    "Qwen2ForCausalLM"
+  ],
+  "attention_dropout": 0.0,
+  "hidden_act": "silu",
+  "hidden_size": 3584,
+  "initializer_range": 0.02,
+  "intermediate_size": 18944,
+  "layer_types": [
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention"
+  ],
+  "max_position_embeddings": 131072,
+  "max_window_layers": 28,
+  "model_type": "qwen2",
+  "neuron": {
+    "_serialized_key": "NxDNeuronConfig",
+    "batch_size": 1,
+    "capacity_factor": null,
+    "checkpoint_id": "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B",
+    "checkpoint_revision": "916b56a44061fd5cd7d6a8fb632557ed4f724f60",
+    "continuous_batching": false,
+    "enable_bucketing": false,
+    "ep_degree": 1,
+    "fused_qkv": false,
+    "glu_mlp": true,
+    "local_ranks_size": 8,
+    "max_batch_size": 1,
+    "max_context_length": 4096,
+    "max_topk": 256,
+    "n_active_tokens": 4096,
+    "neuronxcc_version": "2.21.18209.0+043b1bf7",
+    "on_device_sampling": true,
+    "optimum_neuron_version": "0.4.0",
+    "output_logits": false,
+    "pp_degree": 1,
+    "sequence_length": 4096,
+    "speculation_length": 0,
+    "start_rank_id": 0,
+    "target": "trn1",
+    "torch_dtype": "bfloat16",
+    "tp_degree": 8
+  },
+  "num_attention_heads": 28,
+  "num_hidden_layers": 28,
+  "num_key_value_heads": 4,
+  "rms_norm_eps": 1e-06,
+  "rope_scaling": null,
+  "rope_theta": 10000,
+  "sliding_window": null,
+  "tie_word_embeddings": false,
+  "use_cache": true,
+  "use_mrope": false,
+  "use_sliding_window": false,
+  "vocab_size": 152064
+}
\ No newline at end of file
diff --git a/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen2/deepseek-ai/DeepSeek-R1-Distill-Qwen-7B/a369790ad746f4be941f.json b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen2/deepseek-ai/DeepSeek-R1-Distill-Qwen-7B/a369790ad746f4be941f.json
new file mode 100644
index 0000000000000000000000000000000000000000..e52f0e220b5564e603433d2472ba4a01c62f1edb
--- /dev/null
+++ b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen2/deepseek-ai/DeepSeek-R1-Distill-Qwen-7B/a369790ad746f4be941f.json
@@ -0,0 +1,86 @@
+{
+  "_entry_class": "SingleModelCacheEntry",
+  "_model_id": "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B",
+  "_task": "text-generation",
+  "architectures": [
+    "Qwen2ForCausalLM"
+  ],
+  "attention_dropout": 0.0,
+  "hidden_act": "silu",
+  "hidden_size": 3584,
+  "initializer_range": 0.02,
+  "intermediate_size": 18944,
+  "layer_types": [
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention"
+  ],
+  "max_position_embeddings": 131072,
+  "max_window_layers": 28,
+  "model_type": "qwen2",
+  "neuron": {
+    "_serialized_key": "NxDNeuronConfig",
+    "batch_size": 4,
+    "capacity_factor": null,
+    "checkpoint_id": "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B",
+    "checkpoint_revision": "916b56a44061fd5cd7d6a8fb632557ed4f724f60",
+    "continuous_batching": true,
+    "enable_bucketing": false,
+    "ep_degree": 1,
+    "fused_qkv": false,
+    "glu_mlp": true,
+    "local_ranks_size": 2,
+    "max_batch_size": 4,
+    "max_context_length": 4096,
+    "max_topk": 256,
+    "n_active_tokens": 4096,
+    "neuronxcc_version": "2.21.18209.0+043b1bf7",
+    "on_device_sampling": false,
+    "optimum_neuron_version": "0.4.0",
+    "output_logits": false,
+    "pp_degree": 1,
+    "sequence_length": 4096,
+    "speculation_length": 0,
+    "start_rank_id": 0,
+    "target": "trn1",
+    "torch_dtype": "bfloat16",
+    "tp_degree": 2
+  },
+  "num_attention_heads": 28,
+  "num_hidden_layers": 28,
+  "num_key_value_heads": 4,
+  "rms_norm_eps": 1e-06,
+  "rope_scaling": null,
+  "rope_theta": 10000,
+  "sliding_window": null,
+  "tie_word_embeddings": false,
+  "use_cache": true,
+  "use_mrope": false,
+  "use_sliding_window": false,
+  "vocab_size": 152064
+}
\ No newline at end of file
diff --git a/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen2/deepseek-ai/DeepSeek-R1-Distill-Qwen-7B/c03ab0614c104d0ae1dd.json b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen2/deepseek-ai/DeepSeek-R1-Distill-Qwen-7B/c03ab0614c104d0ae1dd.json
new file mode 100644
index 0000000000000000000000000000000000000000..163e26e21cbde207f2c9ea69c3e86029b7b890a2
--- /dev/null
+++ b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen2/deepseek-ai/DeepSeek-R1-Distill-Qwen-7B/c03ab0614c104d0ae1dd.json
@@ -0,0 +1,86 @@
+{
+  "_entry_class": "SingleModelCacheEntry",
+  "_model_id": "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B",
+  "_task": "text-generation",
+  "architectures": [
+    "Qwen2ForCausalLM"
+  ],
+  "attention_dropout": 0.0,
+  "hidden_act": "silu",
+  "hidden_size": 3584,
+  "initializer_range": 0.02,
+  "intermediate_size": 18944,
+  "layer_types": [
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention"
+  ],
+  "max_position_embeddings": 131072,
+  "max_window_layers": 28,
+  "model_type": "qwen2",
+  "neuron": {
+    "_serialized_key": "NxDNeuronConfig",
+    "batch_size": 32,
+    "capacity_factor": null,
+    "checkpoint_id": "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B",
+    "checkpoint_revision": "916b56a44061fd5cd7d6a8fb632557ed4f724f60",
+    "continuous_batching": true,
+    "enable_bucketing": false,
+    "ep_degree": 1,
+    "fused_qkv": false,
+    "glu_mlp": true,
+    "local_ranks_size": 8,
+    "max_batch_size": 32,
+    "max_context_length": 4096,
+    "max_topk": 256,
+    "n_active_tokens": 4096,
+    "neuronxcc_version": "2.21.18209.0+043b1bf7",
+    "on_device_sampling": true,
+    "optimum_neuron_version": "0.4.0",
+    "output_logits": false,
+    "pp_degree": 1,
+    "sequence_length": 4096,
+    "speculation_length": 0,
+    "start_rank_id": 0,
+    "target": "trn1",
+    "torch_dtype": "bfloat16",
+    "tp_degree": 8
+  },
+  "num_attention_heads": 28,
+  "num_hidden_layers": 28,
+  "num_key_value_heads": 4,
+  "rms_norm_eps": 1e-06,
+  "rope_scaling": null,
+  "rope_theta": 10000,
+  "sliding_window": null,
+  "tie_word_embeddings": false,
+  "use_cache": true,
+  "use_mrope": false,
+  "use_sliding_window": false,
+  "vocab_size": 152064
+}
\ No newline at end of file
diff --git a/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen2/deepseek-ai/DeepSeek-R1-Distill-Qwen-7B/f599c0c1b0d07420a759.json b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen2/deepseek-ai/DeepSeek-R1-Distill-Qwen-7B/f599c0c1b0d07420a759.json
new file mode 100644
index 0000000000000000000000000000000000000000..bbc8f793b0d705f7fa295e47d6e3d1dda9433eb9
--- /dev/null
+++ b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen2/deepseek-ai/DeepSeek-R1-Distill-Qwen-7B/f599c0c1b0d07420a759.json
@@ -0,0 +1,86 @@
+{
+  "_entry_class": "SingleModelCacheEntry",
+  "_model_id": "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B",
+  "_task": "text-generation",
+  "architectures": [
+    "Qwen2ForCausalLM"
+  ],
+  "attention_dropout": 0.0,
+  "hidden_act": "silu",
+  "hidden_size": 3584,
+  "initializer_range": 0.02,
+  "intermediate_size": 18944,
+  "layer_types": [
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention"
+  ],
+  "max_position_embeddings": 131072,
+  "max_window_layers": 28,
+  "model_type": "qwen2",
+  "neuron": {
+    "_serialized_key": "NxDNeuronConfig",
+    "batch_size": 8,
+    "capacity_factor": null,
+    "checkpoint_id": "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B",
+    "checkpoint_revision": "916b56a44061fd5cd7d6a8fb632557ed4f724f60",
+    "continuous_batching": true,
+    "enable_bucketing": false,
+    "ep_degree": 1,
+    "fused_qkv": false,
+    "glu_mlp": true,
+    "local_ranks_size": 2,
+    "max_batch_size": 8,
+    "max_context_length": 4096,
+    "max_topk": 256,
+    "n_active_tokens": 4096,
+    "neuronxcc_version": "2.21.18209.0+043b1bf7",
+    "on_device_sampling": false,
+    "optimum_neuron_version": "0.4.0",
+    "output_logits": false,
+    "pp_degree": 1,
+    "sequence_length": 4096,
+    "speculation_length": 0,
+    "start_rank_id": 0,
+    "target": "trn1",
+    "torch_dtype": "bfloat16",
+    "tp_degree": 2
+  },
+  "num_attention_heads": 28,
+  "num_hidden_layers": 28,
+  "num_key_value_heads": 4,
+  "rms_norm_eps": 1e-06,
+  "rope_scaling": null,
+  "rope_theta": 10000,
+  "sliding_window": null,
+  "tie_word_embeddings": false,
+  "use_cache": true,
+  "use_mrope": false,
+  "use_sliding_window": false,
+  "vocab_size": 152064
+}
\ No newline at end of file
diff --git a/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen3/Qwen/Qwen3-0.6B/9cb4695f65b75935af10.json b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen3/Qwen/Qwen3-0.6B/9cb4695f65b75935af10.json
new file mode 100644
index 0000000000000000000000000000000000000000..8dc313cd69109ec8d84eba7b7e2e78be33cf6678
--- /dev/null
+++ b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen3/Qwen/Qwen3-0.6B/9cb4695f65b75935af10.json
@@ -0,0 +1,87 @@
+{
+  "_entry_class": "SingleModelCacheEntry",
+  "_model_id": "Qwen/Qwen3-0.6B",
+  "_task": "text-generation",
+  "architectures": [
+    "Qwen3ForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "head_dim": 128,
+  "hidden_act": "silu",
+  "hidden_size": 1024,
+  "initializer_range": 0.02,
+  "intermediate_size": 3072,
+  "layer_types": [
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention"
+  ],
+  "max_position_embeddings": 40960,
+  "max_window_layers": 28,
+  "model_type": "qwen3",
+  "neuron": {
+    "_serialized_key": "NxDNeuronConfig",
+    "batch_size": 1,
+    "capacity_factor": null,
+    "checkpoint_id": "Qwen/Qwen3-0.6B",
+    "checkpoint_revision": "c1899de289a04d12100db370d81485cdf75e47ca",
+    "continuous_batching": false,
+    "enable_bucketing": false,
+    "ep_degree": 1,
+    "fused_qkv": true,
+    "glu_mlp": true,
+    "local_ranks_size": 2,
+    "max_batch_size": 1,
+    "max_context_length": 4096,
+    "max_topk": 256,
+    "n_active_tokens": 4096,
+    "neuronxcc_version": "2.21.18209.0+043b1bf7",
+    "on_device_sampling": true,
+    "optimum_neuron_version": "0.4.0",
+    "output_logits": false,
+    "pp_degree": 1,
+    "sequence_length": 4096,
+    "speculation_length": 0,
+    "start_rank_id": 0,
+    "target": "trn1",
+    "torch_dtype": "bfloat16",
+    "tp_degree": 2
+  },
+  "num_attention_heads": 16,
+  "num_hidden_layers": 28,
+  "num_key_value_heads": 8,
+  "rms_norm_eps": 1e-06,
+  "rope_scaling": null,
+  "rope_theta": 1000000,
+  "sliding_window": null,
+  "tie_word_embeddings": true,
+  "use_cache": true,
+  "use_sliding_window": false,
+  "vocab_size": 151936
+}
\ No newline at end of file
diff --git a/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen3/Qwen/Qwen3-1.7B/17609b4e638ec72b48fc.json b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen3/Qwen/Qwen3-1.7B/17609b4e638ec72b48fc.json
new file mode 100644
index 0000000000000000000000000000000000000000..af77927bfd6c7c09d3992cf899a917f7d22e18e2
--- /dev/null
+++ b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen3/Qwen/Qwen3-1.7B/17609b4e638ec72b48fc.json
@@ -0,0 +1,87 @@
+{
+  "_entry_class": "SingleModelCacheEntry",
+  "_model_id": "Qwen/Qwen3-1.7B",
+  "_task": "text-generation",
+  "architectures": [
+    "Qwen3ForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "head_dim": 128,
+  "hidden_act": "silu",
+  "hidden_size": 2048,
+  "initializer_range": 0.02,
+  "intermediate_size": 6144,
+  "layer_types": [
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention"
+  ],
+  "max_position_embeddings": 40960,
+  "max_window_layers": 28,
+  "model_type": "qwen3",
+  "neuron": {
+    "_serialized_key": "NxDNeuronConfig",
+    "batch_size": 4,
+    "capacity_factor": null,
+    "checkpoint_id": "Qwen/Qwen3-1.7B",
+    "checkpoint_revision": "70d244cc86ccca08cf5af4e1e306ecf908b1ad5e",
+    "continuous_batching": true,
+    "enable_bucketing": false,
+    "ep_degree": 1,
+    "fused_qkv": true,
+    "glu_mlp": true,
+    "local_ranks_size": 2,
+    "max_batch_size": 4,
+    "max_context_length": 4096,
+    "max_topk": 256,
+    "n_active_tokens": 4096,
+    "neuronxcc_version": "2.21.18209.0+043b1bf7",
+    "on_device_sampling": false,
+    "optimum_neuron_version": "0.4.0",
+    "output_logits": false,
+    "pp_degree": 1,
+    "sequence_length": 4096,
+    "speculation_length": 0,
+    "start_rank_id": 0,
+    "target": "trn1",
+    "torch_dtype": "bfloat16",
+    "tp_degree": 2
+  },
+  "num_attention_heads": 16,
+  "num_hidden_layers": 28,
+  "num_key_value_heads": 8,
+  "rms_norm_eps": 1e-06,
+  "rope_scaling": null,
+  "rope_theta": 1000000,
+  "sliding_window": null,
+  "tie_word_embeddings": true,
+  "use_cache": true,
+  "use_sliding_window": false,
+  "vocab_size": 151936
+}
\ No newline at end of file
diff --git a/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen3/Qwen/Qwen3-1.7B/e3288048cb22fcf1ff30.json b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen3/Qwen/Qwen3-1.7B/e3288048cb22fcf1ff30.json
new file mode 100644
index 0000000000000000000000000000000000000000..d23807c13c913c3133f5d39c55bfc1e1e92fdea1
--- /dev/null
+++ b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen3/Qwen/Qwen3-1.7B/e3288048cb22fcf1ff30.json
@@ -0,0 +1,87 @@
+{
+  "_entry_class": "SingleModelCacheEntry",
+  "_model_id": "Qwen/Qwen3-1.7B",
+  "_task": "text-generation",
+  "architectures": [
+    "Qwen3ForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "head_dim": 128,
+  "hidden_act": "silu",
+  "hidden_size": 2048,
+  "initializer_range": 0.02,
+  "intermediate_size": 6144,
+  "layer_types": [
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention"
+  ],
+  "max_position_embeddings": 40960,
+  "max_window_layers": 28,
+  "model_type": "qwen3",
+  "neuron": {
+    "_serialized_key": "NxDNeuronConfig",
+    "batch_size": 1,
+    "capacity_factor": null,
+    "checkpoint_id": "Qwen/Qwen3-1.7B",
+    "checkpoint_revision": "70d244cc86ccca08cf5af4e1e306ecf908b1ad5e",
+    "continuous_batching": false,
+    "enable_bucketing": false,
+    "ep_degree": 1,
+    "fused_qkv": true,
+    "glu_mlp": true,
+    "local_ranks_size": 2,
+    "max_batch_size": 1,
+    "max_context_length": 4096,
+    "max_topk": 256,
+    "n_active_tokens": 4096,
+    "neuronxcc_version": "2.21.18209.0+043b1bf7",
+    "on_device_sampling": true,
+    "optimum_neuron_version": "0.4.0",
+    "output_logits": false,
+    "pp_degree": 1,
+    "sequence_length": 4096,
+    "speculation_length": 0,
+    "start_rank_id": 0,
+    "target": "trn1",
+    "torch_dtype": "bfloat16",
+    "tp_degree": 2
+  },
+  "num_attention_heads": 16,
+  "num_hidden_layers": 28,
+  "num_key_value_heads": 8,
+  "rms_norm_eps": 1e-06,
+  "rope_scaling": null,
+  "rope_theta": 1000000,
+  "sliding_window": null,
+  "tie_word_embeddings": true,
+  "use_cache": true,
+  "use_sliding_window": false,
+  "vocab_size": 151936
+}
\ No newline at end of file
diff --git a/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen3/Qwen/Qwen3-14B/0e78320275316eedde3d.json b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen3/Qwen/Qwen3-14B/0e78320275316eedde3d.json
new file mode 100644
index 0000000000000000000000000000000000000000..41efb12c90a8727fbf8877130df09aa54b84cf8c
--- /dev/null
+++ b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen3/Qwen/Qwen3-14B/0e78320275316eedde3d.json
@@ -0,0 +1,99 @@
+{
+  "_entry_class": "SingleModelCacheEntry",
+  "_model_id": "Qwen/Qwen3-14B",
+  "_task": "text-generation",
+  "architectures": [
+    "Qwen3ForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "head_dim": 128,
+  "hidden_act": "silu",
+  "hidden_size": 5120,
+  "initializer_range": 0.02,
+  "intermediate_size": 17408,
+  "layer_types": [
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention"
+  ],
+  "max_position_embeddings": 40960,
+  "max_window_layers": 40,
+  "model_type": "qwen3",
+  "neuron": {
+    "_serialized_key": "NxDNeuronConfig",
+    "batch_size": 1,
+    "capacity_factor": null,
+    "checkpoint_id": "Qwen/Qwen3-14B",
+    "checkpoint_revision": "40c069824f4251a91eefaf281ebe4c544efd3e18",
+    "continuous_batching": false,
+    "enable_bucketing": false,
+    "ep_degree": 1,
+    "fused_qkv": true,
+    "glu_mlp": true,
+    "local_ranks_size": 8,
+    "max_batch_size": 1,
+    "max_context_length": 4096,
+    "max_topk": 256,
+    "n_active_tokens": 4096,
+    "neuronxcc_version": "2.21.18209.0+043b1bf7",
+    "on_device_sampling": true,
+    "optimum_neuron_version": "0.4.0",
+    "output_logits": false,
+    "pp_degree": 1,
+    "sequence_length": 4096,
+    "speculation_length": 0,
+    "start_rank_id": 0,
+    "target": "trn1",
+    "torch_dtype": "bfloat16",
+    "tp_degree": 8
+  },
+  "num_attention_heads": 40,
+  "num_hidden_layers": 40,
+  "num_key_value_heads": 8,
+  "rms_norm_eps": 1e-06,
+  "rope_scaling": null,
+  "rope_theta": 1000000,
+  "sliding_window": null,
+  "tie_word_embeddings": false,
+  "use_cache": true,
+  "use_sliding_window": false,
+  "vocab_size": 151936
+}
\ No newline at end of file
diff --git a/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen3/Qwen/Qwen3-14B/92b2b64b6f8c31d96d6b.json b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen3/Qwen/Qwen3-14B/92b2b64b6f8c31d96d6b.json
new file mode 100644
index 0000000000000000000000000000000000000000..c247d4dd6a12a985edecfda95a020310fe27908c
--- /dev/null
+++ b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen3/Qwen/Qwen3-14B/92b2b64b6f8c31d96d6b.json
@@ -0,0 +1,99 @@
+{
+  "_entry_class": "SingleModelCacheEntry",
+  "_model_id": "Qwen/Qwen3-14B",
+  "_task": "text-generation",
+  "architectures": [
+    "Qwen3ForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "head_dim": 128,
+  "hidden_act": "silu",
+  "hidden_size": 5120,
+  "initializer_range": 0.02,
+  "intermediate_size": 17408,
+  "layer_types": [
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention"
+  ],
+  "max_position_embeddings": 40960,
+  "max_window_layers": 40,
+  "model_type": "qwen3",
+  "neuron": {
+    "_serialized_key": "NxDNeuronConfig",
+    "batch_size": 16,
+    "capacity_factor": null,
+    "checkpoint_id": "Qwen/Qwen3-14B",
+    "checkpoint_revision": "40c069824f4251a91eefaf281ebe4c544efd3e18",
+    "continuous_batching": true,
+    "enable_bucketing": false,
+    "ep_degree": 1,
+    "fused_qkv": true,
+    "glu_mlp": true,
+    "local_ranks_size": 8,
+    "max_batch_size": 16,
+    "max_context_length": 4096,
+    "max_topk": 256,
+    "n_active_tokens": 4096,
+    "neuronxcc_version": "2.21.18209.0+043b1bf7",
+    "on_device_sampling": true,
+    "optimum_neuron_version": "0.4.0",
+    "output_logits": false,
+    "pp_degree": 1,
+    "sequence_length": 4096,
+    "speculation_length": 0,
+    "start_rank_id": 0,
+    "target": "trn1",
+    "torch_dtype": "bfloat16",
+    "tp_degree": 8
+  },
+  "num_attention_heads": 40,
+  "num_hidden_layers": 40,
+  "num_key_value_heads": 8,
+  "rms_norm_eps": 1e-06,
+  "rope_scaling": null,
+  "rope_theta": 1000000,
+  "sliding_window": null,
+  "tie_word_embeddings": false,
+  "use_cache": true,
+  "use_sliding_window": false,
+  "vocab_size": 151936
+}
\ No newline at end of file
diff --git a/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen3/Qwen/Qwen3-32B/541630bae1f8b14b6d79.json b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen3/Qwen/Qwen3-32B/541630bae1f8b14b6d79.json
new file mode 100644
index 0000000000000000000000000000000000000000..7eec167fca523316bf875589b953278da7489813
--- /dev/null
+++ b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen3/Qwen/Qwen3-32B/541630bae1f8b14b6d79.json
@@ -0,0 +1,123 @@
+{
+  "_entry_class": "SingleModelCacheEntry",
+  "_model_id": "Qwen/Qwen3-32B",
+  "_task": "text-generation",
+  "architectures": [
+    "Qwen3ForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "head_dim": 128,
+  "hidden_act": "silu",
+  "hidden_size": 5120,
+  "initializer_range": 0.02,
+  "intermediate_size": 25600,
+  "layer_types": [
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention"
+  ],
+  "max_position_embeddings": 40960,
+  "max_window_layers": 64,
+  "model_type": "qwen3",
+  "neuron": {
+    "_serialized_key": "NxDNeuronConfig",
+    "batch_size": 1,
+    "capacity_factor": null,
+    "checkpoint_id": "Qwen/Qwen3-32B",
+    "checkpoint_revision": "9216db5781bf21249d130ec9da846c4624c16137",
+    "continuous_batching": false,
+    "enable_bucketing": false,
+    "ep_degree": 1,
+    "fused_qkv": true,
+    "glu_mlp": true,
+    "local_ranks_size": 8,
+    "max_batch_size": 1,
+    "max_context_length": 4096,
+    "max_topk": 256,
+    "n_active_tokens": 4096,
+    "neuronxcc_version": "2.21.18209.0+043b1bf7",
+    "on_device_sampling": true,
+    "optimum_neuron_version": "0.4.0",
+    "output_logits": false,
+    "pp_degree": 1,
+    "sequence_length": 4096,
+    "speculation_length": 0,
+    "start_rank_id": 0,
+    "target": "trn1",
+    "torch_dtype": "bfloat16",
+    "tp_degree": 8
+  },
+  "num_attention_heads": 64,
+  "num_hidden_layers": 64,
+  "num_key_value_heads": 8,
+  "rms_norm_eps": 1e-06,
+  "rope_scaling": null,
+  "rope_theta": 1000000,
+  "sliding_window": null,
+  "tie_word_embeddings": false,
+  "use_cache": true,
+  "use_sliding_window": false,
+  "vocab_size": 151936
+}
\ No newline at end of file
diff --git a/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen3/Qwen/Qwen3-32B/83a2759427291f576035.json b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen3/Qwen/Qwen3-32B/83a2759427291f576035.json
new file mode 100644
index 0000000000000000000000000000000000000000..150d444b9b17f1ae523236bad387b83bc53b1f50
--- /dev/null
+++ b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen3/Qwen/Qwen3-32B/83a2759427291f576035.json
@@ -0,0 +1,123 @@
+{
+  "_entry_class": "SingleModelCacheEntry",
+  "_model_id": "Qwen/Qwen3-32B",
+  "_task": "text-generation",
+  "architectures": [
+    "Qwen3ForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "head_dim": 128,
+  "hidden_act": "silu",
+  "hidden_size": 5120,
+  "initializer_range": 0.02,
+  "intermediate_size": 25600,
+  "layer_types": [
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention"
+  ],
+  "max_position_embeddings": 40960,
+  "max_window_layers": 64,
+  "model_type": "qwen3",
+  "neuron": {
+    "_serialized_key": "NxDNeuronConfig",
+    "batch_size": 16,
+    "capacity_factor": null,
+    "checkpoint_id": "Qwen/Qwen3-32B",
+    "checkpoint_revision": "9216db5781bf21249d130ec9da846c4624c16137",
+    "continuous_batching": true,
+    "enable_bucketing": false,
+    "ep_degree": 1,
+    "fused_qkv": true,
+    "glu_mlp": true,
+    "local_ranks_size": 8,
+    "max_batch_size": 16,
+    "max_context_length": 4096,
+    "max_topk": 256,
+    "n_active_tokens": 4096,
+    "neuronxcc_version": "2.21.18209.0+043b1bf7",
+    "on_device_sampling": true,
+    "optimum_neuron_version": "0.4.0",
+    "output_logits": false,
+    "pp_degree": 1,
+    "sequence_length": 4096,
+    "speculation_length": 0,
+    "start_rank_id": 0,
+    "target": "trn1",
+    "torch_dtype": "bfloat16",
+    "tp_degree": 8
+  },
+  "num_attention_heads": 64,
+  "num_hidden_layers": 64,
+  "num_key_value_heads": 8,
+  "rms_norm_eps": 1e-06,
+  "rope_scaling": null,
+  "rope_theta": 1000000,
+  "sliding_window": null,
+  "tie_word_embeddings": false,
+  "use_cache": true,
+  "use_sliding_window": false,
+  "vocab_size": 151936
+}
\ No newline at end of file
diff --git a/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen3/Qwen/Qwen3-4B/30357cf185cc1e3e5729.json b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen3/Qwen/Qwen3-4B/30357cf185cc1e3e5729.json
new file mode 100644
index 0000000000000000000000000000000000000000..568de36f8b640795c0c2660691968e0ebab368ca
--- /dev/null
+++ b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen3/Qwen/Qwen3-4B/30357cf185cc1e3e5729.json
@@ -0,0 +1,95 @@
+{
+  "_entry_class": "SingleModelCacheEntry",
+  "_model_id": "Qwen/Qwen3-4B",
+  "_task": "text-generation",
+  "architectures": [
+    "Qwen3ForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "head_dim": 128,
+  "hidden_act": "silu",
+  "hidden_size": 2560,
+  "initializer_range": 0.02,
+  "intermediate_size": 9728,
+  "layer_types": [
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention"
+  ],
+  "max_position_embeddings": 40960,
+  "max_window_layers": 36,
+  "model_type": "qwen3",
+  "neuron": {
+    "_serialized_key": "NxDNeuronConfig",
+    "batch_size": 4,
+    "capacity_factor": null,
+    "checkpoint_id": "Qwen/Qwen3-4B",
+    "checkpoint_revision": "1cfa9a7208912126459214e8b04321603b3df60c",
+    "continuous_batching": true,
+    "enable_bucketing": false,
+    "ep_degree": 1,
+    "fused_qkv": true,
+    "glu_mlp": true,
+    "local_ranks_size": 2,
+    "max_batch_size": 4,
+    "max_context_length": 4096,
+    "max_topk": 256,
+    "n_active_tokens": 4096,
+    "neuronxcc_version": "2.21.18209.0+043b1bf7",
+    "on_device_sampling": false,
+    "optimum_neuron_version": "0.4.0",
+    "output_logits": false,
+    "pp_degree": 1,
+    "sequence_length": 4096,
+    "speculation_length": 0,
+    "start_rank_id": 0,
+    "target": "trn1",
+    "torch_dtype": "bfloat16",
+    "tp_degree": 2
+  },
+  "num_attention_heads": 32,
+  "num_hidden_layers": 36,
+  "num_key_value_heads": 8,
+  "rms_norm_eps": 1e-06,
+  "rope_scaling": null,
+  "rope_theta": 1000000,
+  "sliding_window": null,
+  "tie_word_embeddings": true,
+  "use_cache": true,
+  "use_sliding_window": false,
+  "vocab_size": 151936
+}
\ No newline at end of file
diff --git a/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen3/Qwen/Qwen3-4B/f5834a23951a6c432322.json b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen3/Qwen/Qwen3-4B/f5834a23951a6c432322.json
new file mode 100644
index 0000000000000000000000000000000000000000..a100549a8ccbfb025705d19074f50bbf493eec1d
--- /dev/null
+++ b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen3/Qwen/Qwen3-4B/f5834a23951a6c432322.json
@@ -0,0 +1,95 @@
+{
+  "_entry_class": "SingleModelCacheEntry",
+  "_model_id": "Qwen/Qwen3-4B",
+  "_task": "text-generation",
+  "architectures": [
+    "Qwen3ForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "head_dim": 128,
+  "hidden_act": "silu",
+  "hidden_size": 2560,
+  "initializer_range": 0.02,
+  "intermediate_size": 9728,
+  "layer_types": [
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention"
+  ],
+  "max_position_embeddings": 40960,
+  "max_window_layers": 36,
+  "model_type": "qwen3",
+  "neuron": {
+    "_serialized_key": "NxDNeuronConfig",
+    "batch_size": 1,
+    "capacity_factor": null,
+    "checkpoint_id": "Qwen/Qwen3-4B",
+    "checkpoint_revision": "1cfa9a7208912126459214e8b04321603b3df60c",
+    "continuous_batching": false,
+    "enable_bucketing": false,
+    "ep_degree": 1,
+    "fused_qkv": true,
+    "glu_mlp": true,
+    "local_ranks_size": 2,
+    "max_batch_size": 1,
+    "max_context_length": 4096,
+    "max_topk": 256,
+    "n_active_tokens": 4096,
+    "neuronxcc_version": "2.21.18209.0+043b1bf7",
+    "on_device_sampling": true,
+    "optimum_neuron_version": "0.4.0",
+    "output_logits": false,
+    "pp_degree": 1,
+    "sequence_length": 4096,
+    "speculation_length": 0,
+    "start_rank_id": 0,
+    "target": "trn1",
+    "torch_dtype": "bfloat16",
+    "tp_degree": 2
+  },
+  "num_attention_heads": 32,
+  "num_hidden_layers": 36,
+  "num_key_value_heads": 8,
+  "rms_norm_eps": 1e-06,
+  "rope_scaling": null,
+  "rope_theta": 1000000,
+  "sliding_window": null,
+  "tie_word_embeddings": true,
+  "use_cache": true,
+  "use_sliding_window": false,
+  "vocab_size": 151936
+}
\ No newline at end of file
diff --git a/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen3/Qwen/Qwen3-8B/2ed332069721e69332b5.json b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen3/Qwen/Qwen3-8B/2ed332069721e69332b5.json
new file mode 100644
index 0000000000000000000000000000000000000000..81adff383ae1df100a67b08a60816005af45bd62
--- /dev/null
+++ b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen3/Qwen/Qwen3-8B/2ed332069721e69332b5.json
@@ -0,0 +1,95 @@
+{
+  "_entry_class": "SingleModelCacheEntry",
+  "_model_id": "Qwen/Qwen3-8B",
+  "_task": "text-generation",
+  "architectures": [
+    "Qwen3ForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "head_dim": 128,
+  "hidden_act": "silu",
+  "hidden_size": 4096,
+  "initializer_range": 0.02,
+  "intermediate_size": 12288,
+  "layer_types": [
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention"
+  ],
+  "max_position_embeddings": 40960,
+  "max_window_layers": 36,
+  "model_type": "qwen3",
+  "neuron": {
+    "_serialized_key": "NxDNeuronConfig",
+    "batch_size": 4,
+    "capacity_factor": null,
+    "checkpoint_id": "Qwen/Qwen3-8B",
+    "checkpoint_revision": "b968826d9c46dd6066d109eabc6255188de91218",
+    "continuous_batching": true,
+    "enable_bucketing": false,
+    "ep_degree": 1,
+    "fused_qkv": true,
+    "glu_mlp": true,
+    "local_ranks_size": 2,
+    "max_batch_size": 4,
+    "max_context_length": 4096,
+    "max_topk": 256,
+    "n_active_tokens": 4096,
+    "neuronxcc_version": "2.21.18209.0+043b1bf7",
+    "on_device_sampling": false,
+    "optimum_neuron_version": "0.4.0",
+    "output_logits": false,
+    "pp_degree": 1,
+    "sequence_length": 4096,
+    "speculation_length": 0,
+    "start_rank_id": 0,
+    "target": "trn1",
+    "torch_dtype": "bfloat16",
+    "tp_degree": 2
+  },
+  "num_attention_heads": 32,
+  "num_hidden_layers": 36,
+  "num_key_value_heads": 8,
+  "rms_norm_eps": 1e-06,
+  "rope_scaling": null,
+  "rope_theta": 1000000,
+  "sliding_window": null,
+  "tie_word_embeddings": false,
+  "use_cache": true,
+  "use_sliding_window": false,
+  "vocab_size": 151936
+}
\ No newline at end of file
diff --git a/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen3/Qwen/Qwen3-8B/6553031514f6d37fb552.json b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen3/Qwen/Qwen3-8B/6553031514f6d37fb552.json
new file mode 100644
index 0000000000000000000000000000000000000000..4592b2b328a72cc98103062f7738eb5a9da96a5f
--- /dev/null
+++ b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen3/Qwen/Qwen3-8B/6553031514f6d37fb552.json
@@ -0,0 +1,95 @@
+{
+  "_entry_class": "SingleModelCacheEntry",
+  "_model_id": "Qwen/Qwen3-8B",
+  "_task": "text-generation",
+  "architectures": [
+    "Qwen3ForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "head_dim": 128,
+  "hidden_act": "silu",
+  "hidden_size": 4096,
+  "initializer_range": 0.02,
+  "intermediate_size": 12288,
+  "layer_types": [
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention"
+  ],
+  "max_position_embeddings": 40960,
+  "max_window_layers": 36,
+  "model_type": "qwen3",
+  "neuron": {
+    "_serialized_key": "NxDNeuronConfig",
+    "batch_size": 1,
+    "capacity_factor": null,
+    "checkpoint_id": "Qwen/Qwen3-8B",
+    "checkpoint_revision": "b968826d9c46dd6066d109eabc6255188de91218",
+    "continuous_batching": false,
+    "enable_bucketing": false,
+    "ep_degree": 1,
+    "fused_qkv": true,
+    "glu_mlp": true,
+    "local_ranks_size": 8,
+    "max_batch_size": 1,
+    "max_context_length": 4096,
+    "max_topk": 256,
+    "n_active_tokens": 4096,
+    "neuronxcc_version": "2.21.18209.0+043b1bf7",
+    "on_device_sampling": true,
+    "optimum_neuron_version": "0.4.0",
+    "output_logits": false,
+    "pp_degree": 1,
+    "sequence_length": 4096,
+    "speculation_length": 0,
+    "start_rank_id": 0,
+    "target": "trn1",
+    "torch_dtype": "bfloat16",
+    "tp_degree": 8
+  },
+  "num_attention_heads": 32,
+  "num_hidden_layers": 36,
+  "num_key_value_heads": 8,
+  "rms_norm_eps": 1e-06,
+  "rope_scaling": null,
+  "rope_theta": 1000000,
+  "sliding_window": null,
+  "tie_word_embeddings": false,
+  "use_cache": true,
+  "use_sliding_window": false,
+  "vocab_size": 151936
+}
\ No newline at end of file
diff --git a/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen3/Qwen/Qwen3-8B/bab670b614844101ae82.json b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen3/Qwen/Qwen3-8B/bab670b614844101ae82.json
new file mode 100644
index 0000000000000000000000000000000000000000..a0a0c6fb06502d8765b4a5755602a6637c11c6f0
--- /dev/null
+++ b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen3/Qwen/Qwen3-8B/bab670b614844101ae82.json
@@ -0,0 +1,95 @@
+{
+  "_entry_class": "SingleModelCacheEntry",
+  "_model_id": "Qwen/Qwen3-8B",
+  "_task": "text-generation",
+  "architectures": [
+    "Qwen3ForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "head_dim": 128,
+  "hidden_act": "silu",
+  "hidden_size": 4096,
+  "initializer_range": 0.02,
+  "intermediate_size": 12288,
+  "layer_types": [
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention"
+  ],
+  "max_position_embeddings": 40960,
+  "max_window_layers": 36,
+  "model_type": "qwen3",
+  "neuron": {
+    "_serialized_key": "NxDNeuronConfig",
+    "batch_size": 32,
+    "capacity_factor": null,
+    "checkpoint_id": "Qwen/Qwen3-8B",
+    "checkpoint_revision": "b968826d9c46dd6066d109eabc6255188de91218",
+    "continuous_batching": true,
+    "enable_bucketing": false,
+    "ep_degree": 1,
+    "fused_qkv": true,
+    "glu_mlp": true,
+    "local_ranks_size": 8,
+    "max_batch_size": 32,
+    "max_context_length": 4096,
+    "max_topk": 256,
+    "n_active_tokens": 4096,
+    "neuronxcc_version": "2.21.18209.0+043b1bf7",
+    "on_device_sampling": true,
+    "optimum_neuron_version": "0.4.0",
+    "output_logits": false,
+    "pp_degree": 1,
+    "sequence_length": 4096,
+    "speculation_length": 0,
+    "start_rank_id": 0,
+    "target": "trn1",
+    "torch_dtype": "bfloat16",
+    "tp_degree": 8
+  },
+  "num_attention_heads": 32,
+  "num_hidden_layers": 36,
+  "num_key_value_heads": 8,
+  "rms_norm_eps": 1e-06,
+  "rope_scaling": null,
+  "rope_theta": 1000000,
+  "sliding_window": null,
+  "tie_word_embeddings": false,
+  "use_cache": true,
+  "use_sliding_window": false,
+  "vocab_size": 151936
+}
\ No newline at end of file
diff --git a/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen3/Qwen/Qwen3-8B/d7608dc7ce9ca58d8b1b.json b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen3/Qwen/Qwen3-8B/d7608dc7ce9ca58d8b1b.json
new file mode 100644
index 0000000000000000000000000000000000000000..97bc62125c86016bf4a387bbb3b8beea63d072a9
--- /dev/null
+++ b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen3/Qwen/Qwen3-8B/d7608dc7ce9ca58d8b1b.json
@@ -0,0 +1,95 @@
+{
+  "_entry_class": "SingleModelCacheEntry",
+  "_model_id": "Qwen/Qwen3-8B",
+  "_task": "text-generation",
+  "architectures": [
+    "Qwen3ForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "head_dim": 128,
+  "hidden_act": "silu",
+  "hidden_size": 4096,
+  "initializer_range": 0.02,
+  "intermediate_size": 12288,
+  "layer_types": [
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention"
+  ],
+  "max_position_embeddings": 40960,
+  "max_window_layers": 36,
+  "model_type": "qwen3",
+  "neuron": {
+    "_serialized_key": "NxDNeuronConfig",
+    "batch_size": 1,
+    "capacity_factor": null,
+    "checkpoint_id": "Qwen/Qwen3-8B",
+    "checkpoint_revision": "b968826d9c46dd6066d109eabc6255188de91218",
+    "continuous_batching": false,
+    "enable_bucketing": false,
+    "ep_degree": 1,
+    "fused_qkv": true,
+    "glu_mlp": true,
+    "local_ranks_size": 2,
+    "max_batch_size": 1,
+    "max_context_length": 4096,
+    "max_topk": 256,
+    "n_active_tokens": 4096,
+    "neuronxcc_version": "2.21.18209.0+043b1bf7",
+    "on_device_sampling": true,
+    "optimum_neuron_version": "0.4.0",
+    "output_logits": false,
+    "pp_degree": 1,
+    "sequence_length": 4096,
+    "speculation_length": 0,
+    "start_rank_id": 0,
+    "target": "trn1",
+    "torch_dtype": "bfloat16",
+    "tp_degree": 2
+  },
+  "num_attention_heads": 32,
+  "num_hidden_layers": 36,
+  "num_key_value_heads": 8,
+  "rms_norm_eps": 1e-06,
+  "rope_scaling": null,
+  "rope_theta": 1000000,
+  "sliding_window": null,
+  "tie_word_embeddings": false,
+  "use_cache": true,
+  "use_sliding_window": false,
+  "vocab_size": 151936
+}
\ No newline at end of file
diff --git a/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen3/Qwen/Qwen3-8B/d8a1d923e11ecee6ad54.json b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen3/Qwen/Qwen3-8B/d8a1d923e11ecee6ad54.json
new file mode 100644
index 0000000000000000000000000000000000000000..2475814edde7ec44c318a24a62d894c71d208ca3
--- /dev/null
+++ b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen3/Qwen/Qwen3-8B/d8a1d923e11ecee6ad54.json
@@ -0,0 +1,95 @@
+{
+  "_entry_class": "SingleModelCacheEntry",
+  "_model_id": "Qwen/Qwen3-8B",
+  "_task": "text-generation",
+  "architectures": [
+    "Qwen3ForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "head_dim": 128,
+  "hidden_act": "silu",
+  "hidden_size": 4096,
+  "initializer_range": 0.02,
+  "intermediate_size": 12288,
+  "layer_types": [
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention"
+  ],
+  "max_position_embeddings": 40960,
+  "max_window_layers": 36,
+  "model_type": "qwen3",
+  "neuron": {
+    "_serialized_key": "NxDNeuronConfig",
+    "batch_size": 8,
+    "capacity_factor": null,
+    "checkpoint_id": "Qwen/Qwen3-8B",
+    "checkpoint_revision": "b968826d9c46dd6066d109eabc6255188de91218",
+    "continuous_batching": true,
+    "enable_bucketing": false,
+    "ep_degree": 1,
+    "fused_qkv": true,
+    "glu_mlp": true,
+    "local_ranks_size": 2,
+    "max_batch_size": 8,
+    "max_context_length": 4096,
+    "max_topk": 256,
+    "n_active_tokens": 4096,
+    "neuronxcc_version": "2.21.18209.0+043b1bf7",
+    "on_device_sampling": false,
+    "optimum_neuron_version": "0.4.0",
+    "output_logits": false,
+    "pp_degree": 1,
+    "sequence_length": 4096,
+    "speculation_length": 0,
+    "start_rank_id": 0,
+    "target": "trn1",
+    "torch_dtype": "bfloat16",
+    "tp_degree": 2
+  },
+  "num_attention_heads": 32,
+  "num_hidden_layers": 36,
+  "num_key_value_heads": 8,
+  "rms_norm_eps": 1e-06,
+  "rope_scaling": null,
+  "rope_theta": 1000000,
+  "sliding_window": null,
+  "tie_word_embeddings": false,
+  "use_cache": true,
+  "use_sliding_window": false,
+  "vocab_size": 151936
+}
\ No newline at end of file
diff --git a/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen3_moe/Qwen/Qwen3-30B-A3B-Instruct-2507/045cdae3aba27431eb02.json b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen3_moe/Qwen/Qwen3-30B-A3B-Instruct-2507/045cdae3aba27431eb02.json
new file mode 100644
index 0000000000000000000000000000000000000000..507d12823a28c30a7dbb7189ebc31e89818f702b
--- /dev/null
+++ b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen3_moe/Qwen/Qwen3-30B-A3B-Instruct-2507/045cdae3aba27431eb02.json
@@ -0,0 +1,65 @@
+{
+  "_entry_class": "SingleModelCacheEntry",
+  "_model_id": "Qwen/Qwen3-30B-A3B-Instruct-2507",
+  "_task": "text-generation",
+  "architectures": [
+    "Qwen3MoeForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "decoder_sparse_step": 1,
+  "head_dim": 128,
+  "hidden_act": "silu",
+  "hidden_size": 2048,
+  "initializer_range": 0.02,
+  "intermediate_size": 6144,
+  "max_position_embeddings": 262144,
+  "max_window_layers": 48,
+  "mlp_only_layers": [],
+  "model_type": "qwen3_moe",
+  "moe_intermediate_size": 768,
+  "neuron": {
+    "_serialized_key": "NxDNeuronConfig",
+    "batch_size": 8,
+    "capacity_factor": null,
+    "checkpoint_id": "Qwen/Qwen3-30B-A3B-Instruct-2507",
+    "checkpoint_revision": "0d7cf23991f47feeb3a57ecb4c9cee8ea4a17bfe",
+    "continuous_batching": true,
+    "enable_bucketing": false,
+    "ep_degree": 1,
+    "fused_qkv": false,
+    "glu_mlp": true,
+    "local_ranks_size": 16,
+    "max_batch_size": 8,
+    "max_context_length": 4096,
+    "max_topk": 256,
+    "n_active_tokens": 4096,
+    "neuronxcc_version": "2.21.18209.0+043b1bf7",
+    "on_device_sampling": true,
+    "optimum_neuron_version": "0.4.0",
+    "output_logits": false,
+    "pp_degree": 1,
+    "sequence_length": 4096,
+    "speculation_length": 0,
+    "start_rank_id": 0,
+    "target": "trn1",
+    "torch_dtype": "bfloat16",
+    "tp_degree": 16
+  },
+  "norm_topk_prob": true,
+  "num_attention_heads": 32,
+  "num_experts": 128,
+  "num_experts_per_tok": 8,
+  "num_hidden_layers": 48,
+  "num_key_value_heads": 4,
+  "output_router_logits": false,
+  "rms_norm_eps": 1e-06,
+  "rope_scaling": null,
+  "rope_theta": 10000000,
+  "router_aux_loss_coef": 0.001,
+  "sliding_window": null,
+  "tie_word_embeddings": false,
+  "use_cache": true,
+  "use_sliding_window": false,
+  "vocab_size": 151936
+}
\ No newline at end of file
diff --git a/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen3_moe/Qwen/Qwen3-30B-A3B-Instruct-2507/3876c0cd4f09900753ef.json b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen3_moe/Qwen/Qwen3-30B-A3B-Instruct-2507/3876c0cd4f09900753ef.json
new file mode 100644
index 0000000000000000000000000000000000000000..e8071581be73348ceef143f79885e4460c9e2bc8
--- /dev/null
+++ b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen3_moe/Qwen/Qwen3-30B-A3B-Instruct-2507/3876c0cd4f09900753ef.json
@@ -0,0 +1,65 @@
+{
+  "_entry_class": "SingleModelCacheEntry",
+  "_model_id": "Qwen/Qwen3-30B-A3B-Instruct-2507",
+  "_task": "text-generation",
+  "architectures": [
+    "Qwen3MoeForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "decoder_sparse_step": 1,
+  "head_dim": 128,
+  "hidden_act": "silu",
+  "hidden_size": 2048,
+  "initializer_range": 0.02,
+  "intermediate_size": 6144,
+  "max_position_embeddings": 262144,
+  "max_window_layers": 48,
+  "mlp_only_layers": [],
+  "model_type": "qwen3_moe",
+  "moe_intermediate_size": 768,
+  "neuron": {
+    "_serialized_key": "NxDNeuronConfig",
+    "batch_size": 8,
+    "capacity_factor": null,
+    "checkpoint_id": "Qwen/Qwen3-30B-A3B-Instruct-2507",
+    "checkpoint_revision": "0d7cf23991f47feeb3a57ecb4c9cee8ea4a17bfe",
+    "continuous_batching": true,
+    "enable_bucketing": false,
+    "ep_degree": 1,
+    "fused_qkv": false,
+    "glu_mlp": true,
+    "local_ranks_size": 32,
+    "max_batch_size": 8,
+    "max_context_length": 4096,
+    "max_topk": 256,
+    "n_active_tokens": 4096,
+    "neuronxcc_version": "2.21.18209.0+043b1bf7",
+    "on_device_sampling": true,
+    "optimum_neuron_version": "0.4.0",
+    "output_logits": false,
+    "pp_degree": 1,
+    "sequence_length": 4096,
+    "speculation_length": 0,
+    "start_rank_id": 0,
+    "target": "trn1",
+    "torch_dtype": "bfloat16",
+    "tp_degree": 32
+  },
+  "norm_topk_prob": true,
+  "num_attention_heads": 32,
+  "num_experts": 128,
+  "num_experts_per_tok": 8,
+  "num_hidden_layers": 48,
+  "num_key_value_heads": 4,
+  "output_router_logits": false,
+  "rms_norm_eps": 1e-06,
+  "rope_scaling": null,
+  "rope_theta": 10000000,
+  "router_aux_loss_coef": 0.001,
+  "sliding_window": null,
+  "tie_word_embeddings": false,
+  "use_cache": true,
+  "use_sliding_window": false,
+  "vocab_size": 151936
+}
\ No newline at end of file
diff --git a/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen3_moe/Qwen/Qwen3-30B-A3B-Instruct-2507/5d2586eb779e07e4065b.json b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen3_moe/Qwen/Qwen3-30B-A3B-Instruct-2507/5d2586eb779e07e4065b.json
new file mode 100644
index 0000000000000000000000000000000000000000..141ff66ddceb876e3df35bdc8880d592e548a896
--- /dev/null
+++ b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen3_moe/Qwen/Qwen3-30B-A3B-Instruct-2507/5d2586eb779e07e4065b.json
@@ -0,0 +1,65 @@
+{
+  "_entry_class": "SingleModelCacheEntry",
+  "_model_id": "Qwen/Qwen3-30B-A3B-Instruct-2507",
+  "_task": "text-generation",
+  "architectures": [
+    "Qwen3MoeForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "decoder_sparse_step": 1,
+  "head_dim": 128,
+  "hidden_act": "silu",
+  "hidden_size": 2048,
+  "initializer_range": 0.02,
+  "intermediate_size": 6144,
+  "max_position_embeddings": 262144,
+  "max_window_layers": 48,
+  "mlp_only_layers": [],
+  "model_type": "qwen3_moe",
+  "moe_intermediate_size": 768,
+  "neuron": {
+    "_serialized_key": "NxDNeuronConfig",
+    "batch_size": 1,
+    "capacity_factor": null,
+    "checkpoint_id": "Qwen/Qwen3-30B-A3B-Instruct-2507",
+    "checkpoint_revision": "0d7cf23991f47feeb3a57ecb4c9cee8ea4a17bfe",
+    "continuous_batching": false,
+    "enable_bucketing": false,
+    "ep_degree": 1,
+    "fused_qkv": false,
+    "glu_mlp": true,
+    "local_ranks_size": 8,
+    "max_batch_size": 1,
+    "max_context_length": 4096,
+    "max_topk": 256,
+    "n_active_tokens": 4096,
+    "neuronxcc_version": "2.21.18209.0+043b1bf7",
+    "on_device_sampling": true,
+    "optimum_neuron_version": "0.4.0",
+    "output_logits": false,
+    "pp_degree": 1,
+    "sequence_length": 4096,
+    "speculation_length": 0,
+    "start_rank_id": 0,
+    "target": "trn1",
+    "torch_dtype": "bfloat16",
+    "tp_degree": 8
+  },
+  "norm_topk_prob": true,
+  "num_attention_heads": 32,
+  "num_experts": 128,
+  "num_experts_per_tok": 8,
+  "num_hidden_layers": 48,
+  "num_key_value_heads": 4,
+  "output_router_logits": false,
+  "rms_norm_eps": 1e-06,
+  "rope_scaling": null,
+  "rope_theta": 10000000,
+  "router_aux_loss_coef": 0.001,
+  "sliding_window": null,
+  "tie_word_embeddings": false,
+  "use_cache": true,
+  "use_sliding_window": false,
+  "vocab_size": 151936
+}
\ No newline at end of file
diff --git a/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen3_moe/Qwen/Qwen3-30B-A3B-Instruct-2507/b75f86d49dd04a575d01.json b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen3_moe/Qwen/Qwen3-30B-A3B-Instruct-2507/b75f86d49dd04a575d01.json
new file mode 100644
index 0000000000000000000000000000000000000000..3a95cde862cc4af4566c69079550888c07088f1e
--- /dev/null
+++ b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/qwen3_moe/Qwen/Qwen3-30B-A3B-Instruct-2507/b75f86d49dd04a575d01.json
@@ -0,0 +1,65 @@
+{
+  "_entry_class": "SingleModelCacheEntry",
+  "_model_id": "Qwen/Qwen3-30B-A3B-Instruct-2507",
+  "_task": "text-generation",
+  "architectures": [
+    "Qwen3MoeForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "decoder_sparse_step": 1,
+  "head_dim": 128,
+  "hidden_act": "silu",
+  "hidden_size": 2048,
+  "initializer_range": 0.02,
+  "intermediate_size": 6144,
+  "max_position_embeddings": 262144,
+  "max_window_layers": 48,
+  "mlp_only_layers": [],
+  "model_type": "qwen3_moe",
+  "moe_intermediate_size": 768,
+  "neuron": {
+    "_serialized_key": "NxDNeuronConfig",
+    "batch_size": 4,
+    "capacity_factor": null,
+    "checkpoint_id": "Qwen/Qwen3-30B-A3B-Instruct-2507",
+    "checkpoint_revision": "0d7cf23991f47feeb3a57ecb4c9cee8ea4a17bfe",
+    "continuous_batching": true,
+    "enable_bucketing": false,
+    "ep_degree": 1,
+    "fused_qkv": false,
+    "glu_mlp": true,
+    "local_ranks_size": 8,
+    "max_batch_size": 4,
+    "max_context_length": 4096,
+    "max_topk": 256,
+    "n_active_tokens": 4096,
+    "neuronxcc_version": "2.21.18209.0+043b1bf7",
+    "on_device_sampling": true,
+    "optimum_neuron_version": "0.4.0",
+    "output_logits": false,
+    "pp_degree": 1,
+    "sequence_length": 4096,
+    "speculation_length": 0,
+    "start_rank_id": 0,
+    "target": "trn1",
+    "torch_dtype": "bfloat16",
+    "tp_degree": 8
+  },
+  "norm_topk_prob": true,
+  "num_attention_heads": 32,
+  "num_experts": 128,
+  "num_experts_per_tok": 8,
+  "num_hidden_layers": 48,
+  "num_key_value_heads": 4,
+  "output_router_logits": false,
+  "rms_norm_eps": 1e-06,
+  "rope_scaling": null,
+  "rope_theta": 10000000,
+  "router_aux_loss_coef": 0.001,
+  "sliding_window": null,
+  "tie_word_embeddings": false,
+  "use_cache": true,
+  "use_sliding_window": false,
+  "vocab_size": 151936
+}
\ No newline at end of file
diff --git a/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/smollm3/HuggingFaceTB/SmolLM3-3B/21cf9a98824cab307a5d.json b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/smollm3/HuggingFaceTB/SmolLM3-3B/21cf9a98824cab307a5d.json
new file mode 100644
index 0000000000000000000000000000000000000000..dc264bd832360d6dd0475e69591fdc8be682395c
--- /dev/null
+++ b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/smollm3/HuggingFaceTB/SmolLM3-3B/21cf9a98824cab307a5d.json
@@ -0,0 +1,134 @@
+{
+  "_entry_class": "SingleModelCacheEntry",
+  "_model_id": "HuggingFaceTB/SmolLM3-3B",
+  "_task": "text-generation",
+  "architectures": [
+    "SmolLM3ForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "hidden_act": "silu",
+  "hidden_size": 2048,
+  "initializer_range": 0.02,
+  "intermediate_size": 11008,
+  "layer_types": [
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention"
+  ],
+  "max_position_embeddings": 65536,
+  "max_window_layers": 28,
+  "mlp_bias": false,
+  "model_type": "smollm3",
+  "neuron": {
+    "_serialized_key": "NxDNeuronConfig",
+    "batch_size": 16,
+    "capacity_factor": null,
+    "checkpoint_id": "HuggingFaceTB/SmolLM3-3B",
+    "checkpoint_revision": "a07cc9a04f16550a088caea529712d1d335b0ac1",
+    "continuous_batching": true,
+    "enable_bucketing": false,
+    "ep_degree": 1,
+    "fused_qkv": true,
+    "glu_mlp": true,
+    "local_ranks_size": 2,
+    "max_batch_size": 16,
+    "max_context_length": 4096,
+    "max_topk": 256,
+    "n_active_tokens": 4096,
+    "neuronxcc_version": "2.21.18209.0+043b1bf7",
+    "on_device_sampling": true,
+    "optimum_neuron_version": "0.4.0",
+    "output_logits": false,
+    "pp_degree": 1,
+    "sequence_length": 4096,
+    "speculation_length": 0,
+    "start_rank_id": 0,
+    "target": "trn1",
+    "torch_dtype": "bfloat16",
+    "tp_degree": 2
+  },
+  "no_rope_layer_interval": 4,
+  "no_rope_layers": [
+    1,
+    1,
+    1,
+    0,
+    1,
+    1,
+    1,
+    0,
+    1,
+    1,
+    1,
+    0,
+    1,
+    1,
+    1,
+    0,
+    1,
+    1,
+    1,
+    0,
+    1,
+    1,
+    1,
+    0,
+    1,
+    1,
+    1,
+    0,
+    1,
+    1,
+    1,
+    0,
+    1,
+    1,
+    1,
+    0
+  ],
+  "num_attention_heads": 16,
+  "num_hidden_layers": 36,
+  "num_key_value_heads": 4,
+  "pretraining_tp": 2,
+  "rms_norm_eps": 1e-06,
+  "rope_scaling": null,
+  "rope_theta": 5000000.0,
+  "sliding_window": null,
+  "use_cache": false,
+  "use_sliding_window": false,
+  "vocab_size": 128256
+}
\ No newline at end of file
diff --git a/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/smollm3/HuggingFaceTB/SmolLM3-3B/2e1b1ec806aecfa37cf0.json b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/smollm3/HuggingFaceTB/SmolLM3-3B/2e1b1ec806aecfa37cf0.json
new file mode 100644
index 0000000000000000000000000000000000000000..a2a2810d880a0e21c57374302b0dcd2dff34c7ad
--- /dev/null
+++ b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.0/smollm3/HuggingFaceTB/SmolLM3-3B/2e1b1ec806aecfa37cf0.json
@@ -0,0 +1,134 @@
+{
+  "_entry_class": "SingleModelCacheEntry",
+  "_model_id": "HuggingFaceTB/SmolLM3-3B",
+  "_task": "text-generation",
+  "architectures": [
+    "SmolLM3ForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "hidden_act": "silu",
+  "hidden_size": 2048,
+  "initializer_range": 0.02,
+  "intermediate_size": 11008,
+  "layer_types": [
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention"
+  ],
+  "max_position_embeddings": 65536,
+  "max_window_layers": 28,
+  "mlp_bias": false,
+  "model_type": "smollm3",
+  "neuron": {
+    "_serialized_key": "NxDNeuronConfig",
+    "batch_size": 1,
+    "capacity_factor": null,
+    "checkpoint_id": "HuggingFaceTB/SmolLM3-3B",
+    "checkpoint_revision": "a07cc9a04f16550a088caea529712d1d335b0ac1",
+    "continuous_batching": false,
+    "enable_bucketing": false,
+    "ep_degree": 1,
+    "fused_qkv": true,
+    "glu_mlp": true,
+    "local_ranks_size": 2,
+    "max_batch_size": 1,
+    "max_context_length": 4096,
+    "max_topk": 256,
+    "n_active_tokens": 4096,
+    "neuronxcc_version": "2.21.18209.0+043b1bf7",
+    "on_device_sampling": true,
+    "optimum_neuron_version": "0.4.0",
+    "output_logits": false,
+    "pp_degree": 1,
+    "sequence_length": 4096,
+    "speculation_length": 0,
+    "start_rank_id": 0,
+    "target": "trn1",
+    "torch_dtype": "bfloat16",
+    "tp_degree": 2
+  },
+  "no_rope_layer_interval": 4,
+  "no_rope_layers": [
+    1,
+    1,
+    1,
+    0,
+    1,
+    1,
+    1,
+    0,
+    1,
+    1,
+    1,
+    0,
+    1,
+    1,
+    1,
+    0,
+    1,
+    1,
+    1,
+    0,
+    1,
+    1,
+    1,
+    0,
+    1,
+    1,
+    1,
+    0,
+    1,
+    1,
+    1,
+    0,
+    1,
+    1,
+    1,
+    0
+  ],
+  "num_attention_heads": 16,
+  "num_hidden_layers": 36,
+  "num_key_value_heads": 4,
+  "pretraining_tp": 2,
+  "rms_norm_eps": 1e-06,
+  "rope_scaling": null,
+  "rope_theta": 5000000.0,
+  "sliding_window": null,
+  "use_cache": false,
+  "use_sliding_window": false,
+  "vocab_size": 128256
+}
\ No newline at end of file
diff --git a/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.1.dev0/llama/unsloth/Llama-3.2-1B-Instruct/ee0b934f9d86b0c0ec63.json b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.1.dev0/llama/unsloth/Llama-3.2-1B-Instruct/ee0b934f9d86b0c0ec63.json
new file mode 100644
index 0000000000000000000000000000000000000000..03b10083c6d8ecf2037c1ebd43029d8d990a46f4
--- /dev/null
+++ b/neuronxcc-2.21.18209.0+043b1bf7/0_REGISTRY/0.4.1.dev0/llama/unsloth/Llama-3.2-1B-Instruct/ee0b934f9d86b0c0ec63.json
@@ -0,0 +1,63 @@
+{
+  "_entry_class": "SingleModelCacheEntry",
+  "_model_id": "unsloth/Llama-3.2-1B-Instruct",
+  "_task": "text-generation",
+  "architectures": [
+    "LlamaForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "head_dim": 64,
+  "hidden_act": "silu",
+  "hidden_size": 2048,
+  "initializer_range": 0.02,
+  "intermediate_size": 8192,
+  "max_position_embeddings": 131072,
+  "mlp_bias": false,
+  "model_type": "llama",
+  "neuron": {
+    "_serialized_key": "NxDNeuronConfig",
+    "batch_size": 4,
+    "capacity_factor": null,
+    "checkpoint_id": "unsloth/Llama-3.2-1B-Instruct",
+    "checkpoint_revision": "5a8abab4a5d6f164389b1079fb721cfab8d7126c",
+    "continuous_batching": true,
+    "enable_bucketing": false,
+    "ep_degree": 1,
+    "fused_qkv": true,
+    "glu_mlp": true,
+    "local_ranks_size": 2,
+    "max_batch_size": 4,
+    "max_context_length": 4096,
+    "max_topk": 256,
+    "n_active_tokens": 4096,
+    "neuronxcc_version": "2.21.18209.0+043b1bf7",
+    "on_device_sampling": true,
+    "optimum_neuron_version": "0.4.1.dev0",
+    "output_logits": false,
+    "pp_degree": 1,
+    "sequence_length": 4096,
+    "speculation_length": 0,
+    "start_rank_id": 0,
+    "target": "trn1",
+    "torch_dtype": "bfloat16",
+    "tp_degree": 2
+  },
+  "num_attention_heads": 32,
+  "num_hidden_layers": 16,
+  "num_key_value_heads": 8,
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": {
+    "factor": 32.0,
+    "high_freq_factor": 4.0,
+    "low_freq_factor": 1.0,
+    "original_max_position_embeddings": 8192,
+    "rope_type": "llama3"
+  },
+  "rope_theta": 500000.0,
+  "tie_word_embeddings": true,
+  "unsloth_fixed": true,
+  "use_cache": true,
+  "vocab_size": 128256
+}
\ No newline at end of file
diff --git a/neuronxcc-2.21.18209.0+043b1bf7/MODULE_04def5b319953baacddd+a9d440f5/compile_flags.json b/neuronxcc-2.21.18209.0+043b1bf7/MODULE_04def5b319953baacddd+a9d440f5/compile_flags.json
new file mode 100644
index 0000000000000000000000000000000000000000..34675115220079309bfad2d45680c478c378a3c0
--- /dev/null
+++ b/neuronxcc-2.21.18209.0+043b1bf7/MODULE_04def5b319953baacddd+a9d440f5/compile_flags.json
@@ -0,0 +1 @@
+["--target=trn1", "--auto-cast=none", "--model-type=transformer", "--tensorizer-options=--enable-ccop-compute-overlap --cc-pipeline-tiling-factor=2 --vectorize-strided-dma ", "-O2", "--lnc=1", "--logfile=/tmp/nxd_model/token_generation_model/_tp0_bk0/log-neuron-cc.txt", "--enable-internal-neff-wrapper"]
\ No newline at end of file
diff --git a/neuronxcc-2.21.18209.0+043b1bf7/MODULE_04def5b319953baacddd+a9d440f5/model.done b/neuronxcc-2.21.18209.0+043b1bf7/MODULE_04def5b319953baacddd+a9d440f5/model.done
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/neuronxcc-2.21.18209.0+043b1bf7/MODULE_04def5b319953baacddd+a9d440f5/model.hlo_module.pb b/neuronxcc-2.21.18209.0+043b1bf7/MODULE_04def5b319953baacddd+a9d440f5/model.hlo_module.pb
new file mode 100644
index 0000000000000000000000000000000000000000..9c5a330b9f7fdba6bb4c837889e33f6c9e6384d0
--- /dev/null
+++ b/neuronxcc-2.21.18209.0+043b1bf7/MODULE_04def5b319953baacddd+a9d440f5/model.hlo_module.pb
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2c2920e5fde77a80d2d66c9ba5addf46e53d032f533c7097cba34e31243f84ed
+size 588406
diff --git a/neuronxcc-2.21.18209.0+043b1bf7/MODULE_04def5b319953baacddd+a9d440f5/model.neff b/neuronxcc-2.21.18209.0+043b1bf7/MODULE_04def5b319953baacddd+a9d440f5/model.neff
new file mode 100644
index 0000000000000000000000000000000000000000..89f8d97d359d63ebd5917c0100016bc9789a9aae
--- /dev/null
+++ b/neuronxcc-2.21.18209.0+043b1bf7/MODULE_04def5b319953baacddd+a9d440f5/model.neff
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9306abbd1c28ffa1ebe3d952ac122ce49faf6a2a037575b24b0a818cb7d441e3
+size 1926144
diff --git a/neuronxcc-2.21.18209.0+043b1bf7/MODULE_04def5b319953baacddd+a9d440f5/wrapped_neff.hlo b/neuronxcc-2.21.18209.0+043b1bf7/MODULE_04def5b319953baacddd+a9d440f5/wrapped_neff.hlo
new file mode 100644
index 0000000000000000000000000000000000000000..eaf6b0a379d86fbca5273154dc6c60ef6e31a1f8
--- /dev/null
+++ b/neuronxcc-2.21.18209.0+043b1bf7/MODULE_04def5b319953baacddd+a9d440f5/wrapped_neff.hlo
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d6a9c49a56aacb28723d72d324e309cb2277db5b847d041e290b5301149a0d39
+size 2082478
diff --git a/neuronxcc-2.21.18209.0+043b1bf7/MODULE_084b0ca25510be3eae0e+ed72d204/compile_flags.json b/neuronxcc-2.21.18209.0+043b1bf7/MODULE_084b0ca25510be3eae0e+ed72d204/compile_flags.json
new file mode 100644
index 0000000000000000000000000000000000000000..1569eac18fc7f34b3fe66166f2d7a4a59dbc5aa4
--- /dev/null
+++ b/neuronxcc-2.21.18209.0+043b1bf7/MODULE_084b0ca25510be3eae0e+ed72d204/compile_flags.json
@@ -0,0 +1 @@
+["--target=trn1", "--auto-cast=none", "--model-type=transformer", "--tensorizer-options=--enable-ccop-compute-overlap --cc-pipeline-tiling-factor=2 --vectorize-strided-dma ", "-O2", "--lnc=1", "--logfile=/tmp/nxd_model/context_encoding_model/_tp0_bk0/log-neuron-cc.txt"]
\ No newline at end of file
diff --git a/neuronxcc-2.21.18209.0+043b1bf7/MODULE_084b0ca25510be3eae0e+ed72d204/model.hlo_module.pb b/neuronxcc-2.21.18209.0+043b1bf7/MODULE_084b0ca25510be3eae0e+ed72d204/model.hlo_module.pb
new file mode 100644
index 0000000000000000000000000000000000000000..f108a70705a4b906dcc80c8f6ce11d019f846a84
--- /dev/null
+++ b/neuronxcc-2.21.18209.0+043b1bf7/MODULE_084b0ca25510be3eae0e+ed72d204/model.hlo_module.pb
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cbe936df2dd3e7c0552c88652dd566f94ae8eafb06b7049095032f9603622a0e
+size 106099506
diff --git a/neuronxcc-2.21.18209.0+043b1bf7/MODULE_084b0ca25510be3eae0e+ed72d204/model.log b/neuronxcc-2.21.18209.0+043b1bf7/MODULE_084b0ca25510be3eae0e+ed72d204/model.log
new file mode 100644
index 0000000000000000000000000000000000000000..3e5bcaadf078c8ee7c882c0facd807ebc0d70c80
--- /dev/null
+++ b/neuronxcc-2.21.18209.0+043b1bf7/MODULE_084b0ca25510be3eae0e+ed72d204/model.log
@@ -0,0 +1,3 @@
+Failed compilation with ['neuronx-cc', 'compile', '--framework=XLA', '/tmp/nxd_model/context_encoding_model/_tp0_bk0/model.MODULE_084b0ca25510be3eae0e+ed72d204.hlo_module.pb', '--output', '/tmp/nxd_model/context_encoding_model/_tp0_bk0/model.MODULE_084b0ca25510be3eae0e+ed72d204.neff', '--target=trn1', '--auto-cast=none', '--model-type=transformer', '--tensorizer-options=--enable-ccop-compute-overlap --cc-pipeline-tiling-factor=2 --vectorize-strided-dma ', '-O2', '--lnc=1', '--logfile=/tmp/nxd_model/context_encoding_model/_tp0_bk0/log-neuron-cc.txt', '--verbose=35']: [XCG815]  Estimated peak HBM usage (19.849GB) exceeds 16GB. Neff might be unable to load on chip. If you believe this estimation to be inaccurate, you can disable the check using: `--internal-backend-options=' --disable-hbm-usage-check '` - Please open a support ticket at https://github.com/aws-neuron/aws-neuron-sdk/issues/new. You may also be able to obtain more information using the 'XLA_IR_DEBUG' and 'XLA_HLO_DEBUG' environment variables.
+2025-10-13T11:49:46Z Non-signal exit. Backend exited with code 1 and stderr: [XCG815]  Estimated peak HBM usage (19.849GB) exceeds 16GB. Neff might be unable to load on chip. If you believe this estimation to be inaccurate, you can disable the check using: `--internal-backend-options=' --disable-hbm-usage-check '` - Please open a support ticket at https://github.com/aws-neuron/aws-neuron-sdk/issues/new. You may also be able to obtain more information using the 'XLA_IR_DEBUG' and 'XLA_HLO_DEBUG' environment variables.
+
diff --git a/neuronxcc-2.21.18209.0+043b1bf7/MODULE_33fe5ce4a5a998514a39+a9d440f5/compile_flags.json b/neuronxcc-2.21.18209.0+043b1bf7/MODULE_33fe5ce4a5a998514a39+a9d440f5/compile_flags.json
new file mode 100644
index 0000000000000000000000000000000000000000..34675115220079309bfad2d45680c478c378a3c0
--- /dev/null
+++ b/neuronxcc-2.21.18209.0+043b1bf7/MODULE_33fe5ce4a5a998514a39+a9d440f5/compile_flags.json
@@ -0,0 +1 @@
+["--target=trn1", "--auto-cast=none", "--model-type=transformer", "--tensorizer-options=--enable-ccop-compute-overlap --cc-pipeline-tiling-factor=2 --vectorize-strided-dma ", "-O2", "--lnc=1", "--logfile=/tmp/nxd_model/token_generation_model/_tp0_bk0/log-neuron-cc.txt", "--enable-internal-neff-wrapper"]
\ No newline at end of file
diff --git a/neuronxcc-2.21.18209.0+043b1bf7/MODULE_33fe5ce4a5a998514a39+a9d440f5/model.hlo_module.pb b/neuronxcc-2.21.18209.0+043b1bf7/MODULE_33fe5ce4a5a998514a39+a9d440f5/model.hlo_module.pb
new file mode 100644
index 0000000000000000000000000000000000000000..a4a7f3ed36c7049e6fbe1ef8ce77d04757a2bc49
--- /dev/null
+++ b/neuronxcc-2.21.18209.0+043b1bf7/MODULE_33fe5ce4a5a998514a39+a9d440f5/model.hlo_module.pb
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2bd7c29b2997482a3d5447c9d6db21b6fea798133aafea1111aafe85fc80af51
+size 2081711
diff --git a/neuronxcc-2.21.18209.0+043b1bf7/MODULE_63b08aa574a103e133be+a9d440f5/compile_flags.json b/neuronxcc-2.21.18209.0+043b1bf7/MODULE_63b08aa574a103e133be+a9d440f5/compile_flags.json
new file mode 100644
index 0000000000000000000000000000000000000000..34675115220079309bfad2d45680c478c378a3c0
--- /dev/null
+++ b/neuronxcc-2.21.18209.0+043b1bf7/MODULE_63b08aa574a103e133be+a9d440f5/compile_flags.json
@@ -0,0 +1 @@
+["--target=trn1", "--auto-cast=none", "--model-type=transformer", "--tensorizer-options=--enable-ccop-compute-overlap --cc-pipeline-tiling-factor=2 --vectorize-strided-dma ", "-O2", "--lnc=1", "--logfile=/tmp/nxd_model/token_generation_model/_tp0_bk0/log-neuron-cc.txt", "--enable-internal-neff-wrapper"]
\ No newline at end of file
diff --git a/neuronxcc-2.21.18209.0+043b1bf7/MODULE_63b08aa574a103e133be+a9d440f5/model.done b/neuronxcc-2.21.18209.0+043b1bf7/MODULE_63b08aa574a103e133be+a9d440f5/model.done
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/neuronxcc-2.21.18209.0+043b1bf7/MODULE_63b08aa574a103e133be+a9d440f5/model.hlo_module.pb b/neuronxcc-2.21.18209.0+043b1bf7/MODULE_63b08aa574a103e133be+a9d440f5/model.hlo_module.pb
new file mode 100644
index 0000000000000000000000000000000000000000..086d64ac71fc9bd6f8402fc5005fb1c2246c66a2
--- /dev/null
+++ b/neuronxcc-2.21.18209.0+043b1bf7/MODULE_63b08aa574a103e133be+a9d440f5/model.hlo_module.pb
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:21db4a0343706c21e048588561690b2061c787f1e4456d5abdd455064df96102
+size 106127270
diff --git a/neuronxcc-2.21.18209.0+043b1bf7/MODULE_63b08aa574a103e133be+a9d440f5/model.neff b/neuronxcc-2.21.18209.0+043b1bf7/MODULE_63b08aa574a103e133be+a9d440f5/model.neff
new file mode 100644
index 0000000000000000000000000000000000000000..7a87bfa1185ad326455009b6d3fa090e3018e43d
--- /dev/null
+++ b/neuronxcc-2.21.18209.0+043b1bf7/MODULE_63b08aa574a103e133be+a9d440f5/model.neff
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8525b10b7b1e2ee02a362c164a364ea9ac01d1f09c3b8a1862168f36a95ff0c2
+size 15197184
diff --git a/neuronxcc-2.21.18209.0+043b1bf7/MODULE_63b08aa574a103e133be+a9d440f5/wrapped_neff.hlo b/neuronxcc-2.21.18209.0+043b1bf7/MODULE_63b08aa574a103e133be+a9d440f5/wrapped_neff.hlo
new file mode 100644
index 0000000000000000000000000000000000000000..15ee279ebae57d9cb04aee1c3e397507d5efe8ed
--- /dev/null
+++ b/neuronxcc-2.21.18209.0+043b1bf7/MODULE_63b08aa574a103e133be+a9d440f5/wrapped_neff.hlo
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d5658d3aa5ed4dd10106028e20617951b8c4515086735196b94908b3059e5752
+size 15506153
diff --git a/neuronxcc-2.21.18209.0+043b1bf7/MODULE_7585b7b81ecc283af772+a9d440f5/compile_flags.json b/neuronxcc-2.21.18209.0+043b1bf7/MODULE_7585b7b81ecc283af772+a9d440f5/compile_flags.json
new file mode 100644
index 0000000000000000000000000000000000000000..34675115220079309bfad2d45680c478c378a3c0
--- /dev/null
+++ b/neuronxcc-2.21.18209.0+043b1bf7/MODULE_7585b7b81ecc283af772+a9d440f5/compile_flags.json
@@ -0,0 +1 @@
+["--target=trn1", "--auto-cast=none", "--model-type=transformer", "--tensorizer-options=--enable-ccop-compute-overlap --cc-pipeline-tiling-factor=2 --vectorize-strided-dma ", "-O2", "--lnc=1", "--logfile=/tmp/nxd_model/token_generation_model/_tp0_bk0/log-neuron-cc.txt", "--enable-internal-neff-wrapper"]
\ No newline at end of file
diff --git a/neuronxcc-2.21.18209.0+043b1bf7/MODULE_7585b7b81ecc283af772+a9d440f5/model.done b/neuronxcc-2.21.18209.0+043b1bf7/MODULE_7585b7b81ecc283af772+a9d440f5/model.done
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/neuronxcc-2.21.18209.0+043b1bf7/MODULE_7585b7b81ecc283af772+a9d440f5/model.hlo_module.pb b/neuronxcc-2.21.18209.0+043b1bf7/MODULE_7585b7b81ecc283af772+a9d440f5/model.hlo_module.pb
new file mode 100644
index 0000000000000000000000000000000000000000..7258febe2b95409f1e5a100f8f12882ea7c5fe66
--- /dev/null
+++ b/neuronxcc-2.21.18209.0+043b1bf7/MODULE_7585b7b81ecc283af772+a9d440f5/model.hlo_module.pb
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:32e98bb691e366aa22d2d9dd2fba85706bd818c47344c0db74a983fa34b92209
+size 103993280
diff --git a/neuronxcc-2.21.18209.0+043b1bf7/MODULE_7585b7b81ecc283af772+a9d440f5/model.neff b/neuronxcc-2.21.18209.0+043b1bf7/MODULE_7585b7b81ecc283af772+a9d440f5/model.neff
new file mode 100644
index 0000000000000000000000000000000000000000..7763be4727c6c0f4757ab456536ce9374dd5476e
--- /dev/null
+++ b/neuronxcc-2.21.18209.0+043b1bf7/MODULE_7585b7b81ecc283af772+a9d440f5/model.neff
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7b99d837a49291f3f272568a63bffef06ad4b129790ae2cbf99c22ef528ad46f
+size 9882624
diff --git a/neuronxcc-2.21.18209.0+043b1bf7/MODULE_7585b7b81ecc283af772+a9d440f5/wrapped_neff.hlo b/neuronxcc-2.21.18209.0+043b1bf7/MODULE_7585b7b81ecc283af772+a9d440f5/wrapped_neff.hlo
new file mode 100644
index 0000000000000000000000000000000000000000..f9a9d7364fc6d1f7f2c1fdcfd980a73b0168076a
--- /dev/null
+++ b/neuronxcc-2.21.18209.0+043b1bf7/MODULE_7585b7b81ecc283af772+a9d440f5/wrapped_neff.hlo
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cb901e10625a3eb094190d4f0ba055841675a2167f630e11531a72db0a567d4a
+size 10191593
diff --git a/neuronxcc-2.21.18209.0+043b1bf7/MODULE_a89678b39464c33c1815+ed72d204/compile_flags.json b/neuronxcc-2.21.18209.0+043b1bf7/MODULE_a89678b39464c33c1815+ed72d204/compile_flags.json
new file mode 100644
index 0000000000000000000000000000000000000000..1569eac18fc7f34b3fe66166f2d7a4a59dbc5aa4
--- /dev/null
+++ b/neuronxcc-2.21.18209.0+043b1bf7/MODULE_a89678b39464c33c1815+ed72d204/compile_flags.json
@@ -0,0 +1 @@
+["--target=trn1", "--auto-cast=none", "--model-type=transformer", "--tensorizer-options=--enable-ccop-compute-overlap --cc-pipeline-tiling-factor=2 --vectorize-strided-dma ", "-O2", "--lnc=1", "--logfile=/tmp/nxd_model/context_encoding_model/_tp0_bk0/log-neuron-cc.txt"]
\ No newline at end of file
diff --git a/neuronxcc-2.21.18209.0+043b1bf7/MODULE_a89678b39464c33c1815+ed72d204/model.done b/neuronxcc-2.21.18209.0+043b1bf7/MODULE_a89678b39464c33c1815+ed72d204/model.done
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/neuronxcc-2.21.18209.0+043b1bf7/MODULE_a89678b39464c33c1815+ed72d204/model.hlo_module.pb b/neuronxcc-2.21.18209.0+043b1bf7/MODULE_a89678b39464c33c1815+ed72d204/model.hlo_module.pb
new file mode 100644
index 0000000000000000000000000000000000000000..2d4531f98068d356b3631962f7282c3614f31d1f
--- /dev/null
+++ b/neuronxcc-2.21.18209.0+043b1bf7/MODULE_a89678b39464c33c1815+ed72d204/model.hlo_module.pb
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0c64a590225fcaa4a7f92ee879a63534714d921b0b7c21d8c0b215eaa21d7970
+size 739558
diff --git a/neuronxcc-2.21.18209.0+043b1bf7/MODULE_a89678b39464c33c1815+ed72d204/model.neff b/neuronxcc-2.21.18209.0+043b1bf7/MODULE_a89678b39464c33c1815+ed72d204/model.neff
new file mode 100644
index 0000000000000000000000000000000000000000..152b4c885f15b08b3d745815316c535a4c9c0c9f
--- /dev/null
+++ b/neuronxcc-2.21.18209.0+043b1bf7/MODULE_a89678b39464c33c1815+ed72d204/model.neff
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:205be41902a46553702af4e8be043b2df8f16d5813f7f260a1386541828ca3e6
+size 26133504
diff --git a/neuronxcc-2.21.18209.0+043b1bf7/MODULE_bae931052fc7117dae12+a9d440f5/compile_flags.json b/neuronxcc-2.21.18209.0+043b1bf7/MODULE_bae931052fc7117dae12+a9d440f5/compile_flags.json
new file mode 100644
index 0000000000000000000000000000000000000000..34675115220079309bfad2d45680c478c378a3c0
--- /dev/null
+++ b/neuronxcc-2.21.18209.0+043b1bf7/MODULE_bae931052fc7117dae12+a9d440f5/compile_flags.json
@@ -0,0 +1 @@
+["--target=trn1", "--auto-cast=none", "--model-type=transformer", "--tensorizer-options=--enable-ccop-compute-overlap --cc-pipeline-tiling-factor=2 --vectorize-strided-dma ", "-O2", "--lnc=1", "--logfile=/tmp/nxd_model/token_generation_model/_tp0_bk0/log-neuron-cc.txt", "--enable-internal-neff-wrapper"]
\ No newline at end of file
diff --git a/neuronxcc-2.21.18209.0+043b1bf7/MODULE_bae931052fc7117dae12+a9d440f5/model.done b/neuronxcc-2.21.18209.0+043b1bf7/MODULE_bae931052fc7117dae12+a9d440f5/model.done
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/neuronxcc-2.21.18209.0+043b1bf7/MODULE_bae931052fc7117dae12+a9d440f5/model.hlo_module.pb b/neuronxcc-2.21.18209.0+043b1bf7/MODULE_bae931052fc7117dae12+a9d440f5/model.hlo_module.pb
new file mode 100644
index 0000000000000000000000000000000000000000..aa1d0c2aa8ffa044f2e2136cef2b71949c5a2064
--- /dev/null
+++ b/neuronxcc-2.21.18209.0+043b1bf7/MODULE_bae931052fc7117dae12+a9d440f5/model.hlo_module.pb
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fb344c672a010536aa058ddd9a19cd7de40cb29e0b785ac541c8dd2fc630085f
+size 102926602
diff --git a/neuronxcc-2.21.18209.0+043b1bf7/MODULE_bae931052fc7117dae12+a9d440f5/model.neff b/neuronxcc-2.21.18209.0+043b1bf7/MODULE_bae931052fc7117dae12+a9d440f5/model.neff
new file mode 100644
index 0000000000000000000000000000000000000000..7586e4162d68dcf2172e36d759f33e377873e481
--- /dev/null
+++ b/neuronxcc-2.21.18209.0+043b1bf7/MODULE_bae931052fc7117dae12+a9d440f5/model.neff
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6b6e713335f2db8daa35894784cef45b53887ac954d0d59885127453500bdca3
+size 7128064
diff --git a/neuronxcc-2.21.18209.0+043b1bf7/MODULE_bae931052fc7117dae12+a9d440f5/wrapped_neff.hlo b/neuronxcc-2.21.18209.0+043b1bf7/MODULE_bae931052fc7117dae12+a9d440f5/wrapped_neff.hlo
new file mode 100644
index 0000000000000000000000000000000000000000..a25f973073865b55997494336e0a2ca3d543cade
--- /dev/null
+++ b/neuronxcc-2.21.18209.0+043b1bf7/MODULE_bae931052fc7117dae12+a9d440f5/wrapped_neff.hlo
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:905da3ec1ebfaeb665f2806414ad6bf492f62b129781f762265293a80f8951d1
+size 7437033
diff --git a/neuronxcc-2.21.18209.0+043b1bf7/MODULE_fdf838eca836e25ae600+ed72d204/compile_flags.json b/neuronxcc-2.21.18209.0+043b1bf7/MODULE_fdf838eca836e25ae600+ed72d204/compile_flags.json
new file mode 100644
index 0000000000000000000000000000000000000000..1569eac18fc7f34b3fe66166f2d7a4a59dbc5aa4
--- /dev/null
+++ b/neuronxcc-2.21.18209.0+043b1bf7/MODULE_fdf838eca836e25ae600+ed72d204/compile_flags.json
@@ -0,0 +1 @@
+["--target=trn1", "--auto-cast=none", "--model-type=transformer", "--tensorizer-options=--enable-ccop-compute-overlap --cc-pipeline-tiling-factor=2 --vectorize-strided-dma ", "-O2", "--lnc=1", "--logfile=/tmp/nxd_model/context_encoding_model/_tp0_bk0/log-neuron-cc.txt"]
\ No newline at end of file
diff --git a/neuronxcc-2.21.18209.0+043b1bf7/MODULE_fdf838eca836e25ae600+ed72d204/model.hlo_module.pb b/neuronxcc-2.21.18209.0+043b1bf7/MODULE_fdf838eca836e25ae600+ed72d204/model.hlo_module.pb
new file mode 100644
index 0000000000000000000000000000000000000000..6d952a9b582eae277f6e42617728d738d45b193c
--- /dev/null
+++ b/neuronxcc-2.21.18209.0+043b1bf7/MODULE_fdf838eca836e25ae600+ed72d204/model.hlo_module.pb
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:114e077a57d5c1bd85c3f7cdbbe8c5da34c6e1875c9db46e4109da6a55d7f464
+size 106099506
diff --git a/neuronxcc-2.21.18209.0+043b1bf7/MODULE_fdf838eca836e25ae600+ed72d204/model.log b/neuronxcc-2.21.18209.0+043b1bf7/MODULE_fdf838eca836e25ae600+ed72d204/model.log
new file mode 100644
index 0000000000000000000000000000000000000000..4d52380e33260d97b2898c44e6c936763647e0d2
--- /dev/null
+++ b/neuronxcc-2.21.18209.0+043b1bf7/MODULE_fdf838eca836e25ae600+ed72d204/model.log
@@ -0,0 +1,3 @@
+Failed compilation with ['neuronx-cc', 'compile', '--framework=XLA', '/tmp/nxd_model/context_encoding_model/_tp0_bk0/model.MODULE_fdf838eca836e25ae600+ed72d204.hlo_module.pb', '--output', '/tmp/nxd_model/context_encoding_model/_tp0_bk0/model.MODULE_fdf838eca836e25ae600+ed72d204.neff', '--target=trn1', '--auto-cast=none', '--model-type=transformer', '--tensorizer-options=--enable-ccop-compute-overlap --cc-pipeline-tiling-factor=2 --vectorize-strided-dma ', '-O2', '--lnc=1', '--logfile=/tmp/nxd_model/context_encoding_model/_tp0_bk0/log-neuron-cc.txt', '--verbose=35']: [XCG815]  Estimated peak HBM usage (19.099GB) exceeds 16GB. Neff might be unable to load on chip. If you believe this estimation to be inaccurate, you can disable the check using: `--internal-backend-options=' --disable-hbm-usage-check '` - Please open a support ticket at https://github.com/aws-neuron/aws-neuron-sdk/issues/new. You may also be able to obtain more information using the 'XLA_IR_DEBUG' and 'XLA_HLO_DEBUG' environment variables.
+2025-10-13T11:32:53Z Non-signal exit. Backend exited with code 1 and stderr: [XCG815]  Estimated peak HBM usage (19.099GB) exceeds 16GB. Neff might be unable to load on chip. If you believe this estimation to be inaccurate, you can disable the check using: `--internal-backend-options=' --disable-hbm-usage-check '` - Please open a support ticket at https://github.com/aws-neuron/aws-neuron-sdk/issues/new. You may also be able to obtain more information using the 'XLA_IR_DEBUG' and 'XLA_HLO_DEBUG' environment variables.
+
diff --git a/neuronxcc-2.21.18209.0+043b1bf7/MODULE_ff564f7b9f2edb395631+ed72d204/compile_flags.json b/neuronxcc-2.21.18209.0+043b1bf7/MODULE_ff564f7b9f2edb395631+ed72d204/compile_flags.json
new file mode 100644
index 0000000000000000000000000000000000000000..1569eac18fc7f34b3fe66166f2d7a4a59dbc5aa4
--- /dev/null
+++ b/neuronxcc-2.21.18209.0+043b1bf7/MODULE_ff564f7b9f2edb395631+ed72d204/compile_flags.json
@@ -0,0 +1 @@
+["--target=trn1", "--auto-cast=none", "--model-type=transformer", "--tensorizer-options=--enable-ccop-compute-overlap --cc-pipeline-tiling-factor=2 --vectorize-strided-dma ", "-O2", "--lnc=1", "--logfile=/tmp/nxd_model/context_encoding_model/_tp0_bk0/log-neuron-cc.txt"]
\ No newline at end of file
diff --git a/neuronxcc-2.21.18209.0+043b1bf7/MODULE_ff564f7b9f2edb395631+ed72d204/model.hlo_module.pb b/neuronxcc-2.21.18209.0+043b1bf7/MODULE_ff564f7b9f2edb395631+ed72d204/model.hlo_module.pb
new file mode 100644
index 0000000000000000000000000000000000000000..a819f6ad8f42ba16fc7ef7eaad2f4b0694353034
--- /dev/null
+++ b/neuronxcc-2.21.18209.0+043b1bf7/MODULE_ff564f7b9f2edb395631+ed72d204/model.hlo_module.pb
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cb1740c916e0af0675009dd8dfb70af5f8acc5e17bfd57ed1c96a30c750c671a
+size 106099506
diff --git a/neuronxcc-2.21.18209.0+043b1bf7/MODULE_ff564f7b9f2edb395631+ed72d204/model.log b/neuronxcc-2.21.18209.0+043b1bf7/MODULE_ff564f7b9f2edb395631+ed72d204/model.log
new file mode 100644
index 0000000000000000000000000000000000000000..fb6909878ac92c31f16cf89f79ed0f4a8ba5563b
--- /dev/null
+++ b/neuronxcc-2.21.18209.0+043b1bf7/MODULE_ff564f7b9f2edb395631+ed72d204/model.log
@@ -0,0 +1,3 @@
+Failed compilation with ['neuronx-cc', 'compile', '--framework=XLA', '/tmp/nxd_model/context_encoding_model/_tp0_bk0/model.MODULE_ff564f7b9f2edb395631+ed72d204.hlo_module.pb', '--output', '/tmp/nxd_model/context_encoding_model/_tp0_bk0/model.MODULE_ff564f7b9f2edb395631+ed72d204.neff', '--target=trn1', '--auto-cast=none', '--model-type=transformer', '--tensorizer-options=--enable-ccop-compute-overlap --cc-pipeline-tiling-factor=2 --vectorize-strided-dma ', '-O2', '--lnc=1', '--logfile=/tmp/nxd_model/context_encoding_model/_tp0_bk0/log-neuron-cc.txt', '--verbose=35']: [XCG815]  Estimated peak HBM usage (21.348GB) exceeds 16GB. Neff might be unable to load on chip. If you believe this estimation to be inaccurate, you can disable the check using: `--internal-backend-options=' --disable-hbm-usage-check '` - Please open a support ticket at https://github.com/aws-neuron/aws-neuron-sdk/issues/new. You may also be able to obtain more information using the 'XLA_IR_DEBUG' and 'XLA_HLO_DEBUG' environment variables.
+2025-10-13T12:10:29Z Non-signal exit. Backend exited with code 1 and stderr: [XCG815]  Estimated peak HBM usage (21.348GB) exceeds 16GB. Neff might be unable to load on chip. If you believe this estimation to be inaccurate, you can disable the check using: `--internal-backend-options=' --disable-hbm-usage-check '` - Please open a support ticket at https://github.com/aws-neuron/aws-neuron-sdk/issues/new. You may also be able to obtain more information using the 'XLA_IR_DEBUG' and 'XLA_HLO_DEBUG' environment variables.
+