diff --git a/.gitattributes b/.gitattributes
index bd1ce75d55cb7108ea0ce21860905d3faddc666b..aa22120df15195a3d3894a1a7ab52df470b99fd7 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -15032,3 +15032,19 @@ neuronxcc-2.21.33363.0+82129205/MODULE_d7f7c69ad6f63a27e1a5+a02c3a36/model.neff
 neuronxcc-2.21.33363.0+82129205/MODULE_d7f7c69ad6f63a27e1a5+a02c3a36/wrapped_neff.hlo filter=lfs diff=lfs merge=lfs -text
 neuronxcc-2.21.33363.0+82129205/MODULE_09e98045a01eb4a75c24+a02c3a36/model.neff filter=lfs diff=lfs merge=lfs -text
 neuronxcc-2.21.33363.0+82129205/MODULE_09e98045a01eb4a75c24+a02c3a36/wrapped_neff.hlo filter=lfs diff=lfs merge=lfs -text
+neuronxcc-2.21.33363.0+82129205/MODULE_110fc80e89006393f738+24129607/model.neff filter=lfs diff=lfs merge=lfs -text
+neuronxcc-2.21.33363.0+82129205/MODULE_196d1b2148ed8629b154+24129607/model.neff filter=lfs diff=lfs merge=lfs -text
+neuronxcc-2.21.33363.0+82129205/MODULE_1aa5f4baa9354745d6a6+24129607/model.neff filter=lfs diff=lfs merge=lfs -text
+neuronxcc-2.21.33363.0+82129205/MODULE_37d321becc90cb687039+24129607/model.neff filter=lfs diff=lfs merge=lfs -text
+neuronxcc-2.21.33363.0+82129205/MODULE_39061f1efbca2332dc73+24129607/model.neff filter=lfs diff=lfs merge=lfs -text
+neuronxcc-2.21.33363.0+82129205/MODULE_5cfe268b844f7d2286a5+a02c3a36/model.neff filter=lfs diff=lfs merge=lfs -text
+neuronxcc-2.21.33363.0+82129205/MODULE_5cfe268b844f7d2286a5+a02c3a36/wrapped_neff.hlo filter=lfs diff=lfs merge=lfs -text
+neuronxcc-2.21.33363.0+82129205/MODULE_7c1ab0225123c184780f+24129607/model.neff filter=lfs diff=lfs merge=lfs -text
+neuronxcc-2.21.33363.0+82129205/MODULE_88399bbf2a34b1e28eee+24129607/model.neff filter=lfs diff=lfs merge=lfs -text
+neuronxcc-2.21.33363.0+82129205/MODULE_9b8221eb10f6b4eb2f68+a02c3a36/model.neff filter=lfs diff=lfs merge=lfs -text
+neuronxcc-2.21.33363.0+82129205/MODULE_9b8221eb10f6b4eb2f68+a02c3a36/wrapped_neff.hlo filter=lfs diff=lfs merge=lfs -text
+neuronxcc-2.21.33363.0+82129205/MODULE_a6d912262b31e81edfe6+24129607/model.neff filter=lfs diff=lfs merge=lfs -text
+neuronxcc-2.21.33363.0+82129205/MODULE_cfd70138eb9722ac2255+a02c3a36/model.neff filter=lfs diff=lfs merge=lfs -text
+neuronxcc-2.21.33363.0+82129205/MODULE_cfd70138eb9722ac2255+a02c3a36/wrapped_neff.hlo filter=lfs diff=lfs merge=lfs -text
+neuronxcc-2.21.33363.0+82129205/MODULE_dd366124647aeec64074+a02c3a36/model.neff filter=lfs diff=lfs merge=lfs -text
+neuronxcc-2.21.33363.0+82129205/MODULE_dd366124647aeec64074+a02c3a36/wrapped_neff.hlo filter=lfs diff=lfs merge=lfs -text
diff --git a/neuronxcc-2.21.33363.0+82129205/0_REGISTRY/0.4.4.dev2/granite/ibm-granite/granite-3.1-2b-instruct/0f6c4fdc5392f85cc1a9.json b/neuronxcc-2.21.33363.0+82129205/0_REGISTRY/0.4.4.dev2/granite/ibm-granite/granite-3.1-2b-instruct/0f6c4fdc5392f85cc1a9.json
new file mode 100644
index 0000000000000000000000000000000000000000..b830d33c584d5aea28e7dc6294e5c8ddf53618b0
--- /dev/null
+++ b/neuronxcc-2.21.33363.0+82129205/0_REGISTRY/0.4.4.dev2/granite/ibm-granite/granite-3.1-2b-instruct/0f6c4fdc5392f85cc1a9.json
@@ -0,0 +1,59 @@
+{
+  "_entry_class": "SingleModelCacheEntry",
+  "_model_id": "ibm-granite/granite-3.1-2b-instruct",
+  "_task": "text-generation",
+  "architectures": [
+    "GraniteForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.1,
+  "attention_multiplier": 0.015625,
+  "dtype": "bfloat16",
+  "embedding_multiplier": 12.0,
+  "hidden_act": "silu",
+  "hidden_size": 2048,
+  "initializer_range": 0.02,
+  "intermediate_size": 8192,
+  "logits_scaling": 8.0,
+  "max_position_embeddings": 131072,
+  "mlp_bias": false,
+  "model_type": "granite",
+  "neuron": {
+    "_serialized_key": "NxDNeuronConfig",
+    "batch_size": 1,
+    "capacity_factor": null,
+    "checkpoint_id": "ibm-granite/granite-3.1-2b-instruct",
+    "checkpoint_revision": "bbc2aed595bd38bd770263dc3ab831db9794441d",
+    "continuous_batching": false,
+    "ep_degree": 1,
+    "fused_qkv": true,
+    "glu_mlp": true,
+    "local_ranks_size": 2,
+    "max_batch_size": 1,
+    "max_context_length": 8192,
+    "max_topk": 256,
+    "n_active_tokens": 8192,
+    "neuronxcc_version": "2.21.33363.0+82129205",
+    "on_device_sampling": true,
+    "optimum_neuron_version": "0.4.4.dev2",
+    "output_logits": false,
+    "pp_degree": 1,
+    "sequence_length": 8192,
+    "sequence_parallel_enabled": true,
+    "speculation_length": 0,
+    "start_rank_id": 0,
+    "target": "trn1",
+    "torch_dtype": "bfloat16",
+    "tp_degree": 2
+  },
+  "num_attention_heads": 32,
+  "num_hidden_layers": 40,
+  "num_key_value_heads": 8,
+  "residual_multiplier": 0.22,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": null,
+  "rope_theta": 5000000.0,
+  "tie_word_embeddings": true,
+  "use_cache": true,
+  "vocab_size": 49155
+}
\ No newline at end of file
diff --git a/neuronxcc-2.21.33363.0+82129205/0_REGISTRY/0.4.4.dev2/granite/ibm-granite/granite-3.1-2b-instruct/776cfdcbfedab12abaef.json b/neuronxcc-2.21.33363.0+82129205/0_REGISTRY/0.4.4.dev2/granite/ibm-granite/granite-3.1-2b-instruct/776cfdcbfedab12abaef.json
new file mode 100644
index 0000000000000000000000000000000000000000..7171868c43f26f7a64b8f624c6fc2dfe19abcda5
--- /dev/null
+++ b/neuronxcc-2.21.33363.0+82129205/0_REGISTRY/0.4.4.dev2/granite/ibm-granite/granite-3.1-2b-instruct/776cfdcbfedab12abaef.json
@@ -0,0 +1,59 @@
+{
+  "_entry_class": "SingleModelCacheEntry",
+  "_model_id": "ibm-granite/granite-3.1-2b-instruct",
+  "_task": "text-generation",
+  "architectures": [
+    "GraniteForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.1,
+  "attention_multiplier": 0.015625,
+  "dtype": "bfloat16",
+  "embedding_multiplier": 12.0,
+  "hidden_act": "silu",
+  "hidden_size": 2048,
+  "initializer_range": 0.02,
+  "intermediate_size": 8192,
+  "logits_scaling": 8.0,
+  "max_position_embeddings": 131072,
+  "mlp_bias": false,
+  "model_type": "granite",
+  "neuron": {
+    "_serialized_key": "NxDNeuronConfig",
+    "batch_size": 4,
+    "capacity_factor": null,
+    "checkpoint_id": "ibm-granite/granite-3.1-2b-instruct",
+    "checkpoint_revision": "bbc2aed595bd38bd770263dc3ab831db9794441d",
+    "continuous_batching": true,
+    "ep_degree": 1,
+    "fused_qkv": true,
+    "glu_mlp": true,
+    "local_ranks_size": 2,
+    "max_batch_size": 4,
+    "max_context_length": 4096,
+    "max_topk": 256,
+    "n_active_tokens": 4096,
+    "neuronxcc_version": "2.21.33363.0+82129205",
+    "on_device_sampling": true,
+    "optimum_neuron_version": "0.4.4.dev2",
+    "output_logits": false,
+    "pp_degree": 1,
+    "sequence_length": 4096,
+    "sequence_parallel_enabled": false,
+    "speculation_length": 0,
+    "start_rank_id": 0,
+    "target": "trn1",
+    "torch_dtype": "bfloat16",
+    "tp_degree": 2
+  },
+  "num_attention_heads": 32,
+  "num_hidden_layers": 40,
+  "num_key_value_heads": 8,
+  "residual_multiplier": 0.22,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": null,
+  "rope_theta": 5000000.0,
+  "tie_word_embeddings": true,
+  "use_cache": true,
+  "vocab_size": 49155
+}
\ No newline at end of file
diff --git a/neuronxcc-2.21.33363.0+82129205/0_REGISTRY/0.4.4.dev2/llama/meta-llama/Llama-3.1-8B-Instruct/52a3b7d021f51c90337f.json b/neuronxcc-2.21.33363.0+82129205/0_REGISTRY/0.4.4.dev2/llama/meta-llama/Llama-3.1-8B-Instruct/52a3b7d021f51c90337f.json
new file mode 100644
index 0000000000000000000000000000000000000000..25dc89b9850622ce214252c5c202f13cc7990091
--- /dev/null
+++ b/neuronxcc-2.21.33363.0+82129205/0_REGISTRY/0.4.4.dev2/llama/meta-llama/Llama-3.1-8B-Instruct/52a3b7d021f51c90337f.json
@@ -0,0 +1,63 @@
+{
+  "_entry_class": "SingleModelCacheEntry",
+  "_model_id": "meta-llama/Llama-3.1-8B-Instruct",
+  "_task": "text-generation",
+  "architectures": [
+    "LlamaForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "dtype": "bfloat16",
+  "head_dim": 128,
+  "hidden_act": "silu",
+  "hidden_size": 4096,
+  "initializer_range": 0.02,
+  "intermediate_size": 14336,
+  "max_position_embeddings": 131072,
+  "mlp_bias": false,
+  "model_type": "llama",
+  "neuron": {
+    "_serialized_key": "NxDNeuronConfig",
+    "batch_size": 32,
+    "capacity_factor": null,
+    "checkpoint_id": "meta-llama/Llama-3.1-8B-Instruct",
+    "checkpoint_revision": "0e9e39f249a16976918f6564b8830bc894c89659",
+    "continuous_batching": true,
+    "ep_degree": 1,
+    "fused_qkv": true,
+    "glu_mlp": true,
+    "local_ranks_size": 2,
+    "max_batch_size": 32,
+    "max_context_length": 4096,
+    "max_topk": 256,
+    "n_active_tokens": 4096,
+    "neuronxcc_version": "2.21.33363.0+82129205",
+    "on_device_sampling": true,
+    "optimum_neuron_version": "0.4.4.dev2",
+    "output_logits": false,
+    "pp_degree": 1,
+    "sequence_length": 4096,
+    "sequence_parallel_enabled": false,
+    "speculation_length": 0,
+    "start_rank_id": 0,
+    "target": "trn1",
+    "torch_dtype": "bfloat16",
+    "tp_degree": 2
+  },
+  "num_attention_heads": 32,
+  "num_hidden_layers": 32,
+  "num_key_value_heads": 8,
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": {
+    "factor": 8.0,
+    "high_freq_factor": 4.0,
+    "low_freq_factor": 1.0,
+    "original_max_position_embeddings": 8192,
+    "rope_type": "llama3"
+  },
+  "rope_theta": 500000.0,
+  "tie_word_embeddings": false,
+  "use_cache": true,
+  "vocab_size": 128256
+}
\ No newline at end of file
diff --git a/neuronxcc-2.21.33363.0+82129205/0_REGISTRY/0.4.4.dev2/llama/meta-llama/Llama-3.1-8B-Instruct/5fdba651620df09da93d.json b/neuronxcc-2.21.33363.0+82129205/0_REGISTRY/0.4.4.dev2/llama/meta-llama/Llama-3.1-8B-Instruct/5fdba651620df09da93d.json
new file mode 100644
index 0000000000000000000000000000000000000000..de9077b6d9f87d01975e4bec75ee52ed988f84f9
--- /dev/null
+++ b/neuronxcc-2.21.33363.0+82129205/0_REGISTRY/0.4.4.dev2/llama/meta-llama/Llama-3.1-8B-Instruct/5fdba651620df09da93d.json
@@ -0,0 +1,63 @@
+{
+  "_entry_class": "SingleModelCacheEntry",
+  "_model_id": "meta-llama/Llama-3.1-8B-Instruct",
+  "_task": "text-generation",
+  "architectures": [
+    "LlamaForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "dtype": "bfloat16",
+  "head_dim": 128,
+  "hidden_act": "silu",
+  "hidden_size": 4096,
+  "initializer_range": 0.02,
+  "intermediate_size": 14336,
+  "max_position_embeddings": 131072,
+  "mlp_bias": false,
+  "model_type": "llama",
+  "neuron": {
+    "_serialized_key": "NxDNeuronConfig",
+    "batch_size": 32,
+    "capacity_factor": null,
+    "checkpoint_id": "meta-llama/Llama-3.1-8B-Instruct",
+    "checkpoint_revision": "0e9e39f249a16976918f6564b8830bc894c89659",
+    "continuous_batching": true,
+    "ep_degree": 1,
+    "fused_qkv": true,
+    "glu_mlp": true,
+    "local_ranks_size": 8,
+    "max_batch_size": 32,
+    "max_context_length": 4096,
+    "max_topk": 256,
+    "n_active_tokens": 4096,
+    "neuronxcc_version": "2.21.33363.0+82129205",
+    "on_device_sampling": true,
+    "optimum_neuron_version": "0.4.4.dev2",
+    "output_logits": false,
+    "pp_degree": 1,
+    "sequence_length": 4096,
+    "sequence_parallel_enabled": false,
+    "speculation_length": 0,
+    "start_rank_id": 0,
+    "target": "trn1",
+    "torch_dtype": "bfloat16",
+    "tp_degree": 8
+  },
+  "num_attention_heads": 32,
+  "num_hidden_layers": 32,
+  "num_key_value_heads": 8,
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": {
+    "factor": 8.0,
+    "high_freq_factor": 4.0,
+    "low_freq_factor": 1.0,
+    "original_max_position_embeddings": 8192,
+    "rope_type": "llama3"
+  },
+  "rope_theta": 500000.0,
+  "tie_word_embeddings": false,
+  "use_cache": true,
+  "vocab_size": 128256
+}
\ No newline at end of file
diff --git a/neuronxcc-2.21.33363.0+82129205/0_REGISTRY/0.4.4.dev2/llama/meta-llama/Llama-3.1-8B-Instruct/daa276345bb9b68e9be5.json b/neuronxcc-2.21.33363.0+82129205/0_REGISTRY/0.4.4.dev2/llama/meta-llama/Llama-3.1-8B-Instruct/daa276345bb9b68e9be5.json
new file mode 100644
index 0000000000000000000000000000000000000000..0a26ed4319453dd8c2bbef2e758fc22415255e90
--- /dev/null
+++ b/neuronxcc-2.21.33363.0+82129205/0_REGISTRY/0.4.4.dev2/llama/meta-llama/Llama-3.1-8B-Instruct/daa276345bb9b68e9be5.json
@@ -0,0 +1,63 @@
+{
+  "_entry_class": "SingleModelCacheEntry",
+  "_model_id": "meta-llama/Llama-3.1-8B-Instruct",
+  "_task": "text-generation",
+  "architectures": [
+    "LlamaForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "dtype": "bfloat16",
+  "head_dim": 128,
+  "hidden_act": "silu",
+  "hidden_size": 4096,
+  "initializer_range": 0.02,
+  "intermediate_size": 14336,
+  "max_position_embeddings": 131072,
+  "mlp_bias": false,
+  "model_type": "llama",
+  "neuron": {
+    "_serialized_key": "NxDNeuronConfig",
+    "batch_size": 32,
+    "capacity_factor": null,
+    "checkpoint_id": "meta-llama/Llama-3.1-8B-Instruct",
+    "checkpoint_revision": "0e9e39f249a16976918f6564b8830bc894c89659",
+    "continuous_batching": true,
+    "ep_degree": 1,
+    "fused_qkv": true,
+    "glu_mlp": true,
+    "local_ranks_size": 8,
+    "max_batch_size": 32,
+    "max_context_length": 4096,
+    "max_topk": 256,
+    "n_active_tokens": 4096,
+    "neuronxcc_version": "2.21.33363.0+82129205",
+    "on_device_sampling": false,
+    "optimum_neuron_version": "0.4.4.dev2",
+    "output_logits": false,
+    "pp_degree": 1,
+    "sequence_length": 4096,
+    "sequence_parallel_enabled": false,
+    "speculation_length": 0,
+    "start_rank_id": 0,
+    "target": "trn1",
+    "torch_dtype": "bfloat16",
+    "tp_degree": 8
+  },
+  "num_attention_heads": 32,
+  "num_hidden_layers": 32,
+  "num_key_value_heads": 8,
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": {
+    "factor": 8.0,
+    "high_freq_factor": 4.0,
+    "low_freq_factor": 1.0,
+    "original_max_position_embeddings": 8192,
+    "rope_type": "llama3"
+  },
+  "rope_theta": 500000.0,
+  "tie_word_embeddings": false,
+  "use_cache": true,
+  "vocab_size": 128256
+}
\ No newline at end of file
diff --git a/neuronxcc-2.21.33363.0+82129205/0_REGISTRY/0.4.4.dev2/llama/meta-llama/Llama-3.1-8B-Instruct/e4b573e1a33bbda76243.json b/neuronxcc-2.21.33363.0+82129205/0_REGISTRY/0.4.4.dev2/llama/meta-llama/Llama-3.1-8B-Instruct/e4b573e1a33bbda76243.json
new file mode 100644
index 0000000000000000000000000000000000000000..adb8195a4f2b3b60362a2b3cf977a1524504e4c4
--- /dev/null
+++ b/neuronxcc-2.21.33363.0+82129205/0_REGISTRY/0.4.4.dev2/llama/meta-llama/Llama-3.1-8B-Instruct/e4b573e1a33bbda76243.json
@@ -0,0 +1,63 @@
+{
+  "_entry_class": "SingleModelCacheEntry",
+  "_model_id": "meta-llama/Llama-3.1-8B-Instruct",
+  "_task": "text-generation",
+  "architectures": [
+    "LlamaForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "dtype": "bfloat16",
+  "head_dim": 128,
+  "hidden_act": "silu",
+  "hidden_size": 4096,
+  "initializer_range": 0.02,
+  "intermediate_size": 14336,
+  "max_position_embeddings": 131072,
+  "mlp_bias": false,
+  "model_type": "llama",
+  "neuron": {
+    "_serialized_key": "NxDNeuronConfig",
+    "batch_size": 32,
+    "capacity_factor": null,
+    "checkpoint_id": "meta-llama/Llama-3.1-8B-Instruct",
+    "checkpoint_revision": "0e9e39f249a16976918f6564b8830bc894c89659",
+    "continuous_batching": true,
+    "ep_degree": 1,
+    "fused_qkv": true,
+    "glu_mlp": true,
+    "local_ranks_size": 2,
+    "max_batch_size": 32,
+    "max_context_length": 4096,
+    "max_topk": 256,
+    "n_active_tokens": 4096,
+    "neuronxcc_version": "2.21.33363.0+82129205",
+    "on_device_sampling": false,
+    "optimum_neuron_version": "0.4.4.dev2",
+    "output_logits": false,
+    "pp_degree": 1,
+    "sequence_length": 4096,
+    "sequence_parallel_enabled": false,
+    "speculation_length": 0,
+    "start_rank_id": 0,
+    "target": "trn1",
+    "torch_dtype": "bfloat16",
+    "tp_degree": 2
+  },
+  "num_attention_heads": 32,
+  "num_hidden_layers": 32,
+  "num_key_value_heads": 8,
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": {
+    "factor": 8.0,
+    "high_freq_factor": 4.0,
+    "low_freq_factor": 1.0,
+    "original_max_position_embeddings": 8192,
+    "rope_type": "llama3"
+  },
+  "rope_theta": 500000.0,
+  "tie_word_embeddings": false,
+  "use_cache": true,
+  "vocab_size": 128256
+}
\ No newline at end of file
diff --git a/neuronxcc-2.21.33363.0+82129205/0_REGISTRY/0.4.4.dev2/llama/meta-llama/Llama-3.2-1B-Instruct/ae1848094edb282bfdf1.json b/neuronxcc-2.21.33363.0+82129205/0_REGISTRY/0.4.4.dev2/llama/meta-llama/Llama-3.2-1B-Instruct/ae1848094edb282bfdf1.json
new file mode 100644
index 0000000000000000000000000000000000000000..77d61bcfb25131fa6726bc2a2076d2ccb0296d8d
--- /dev/null
+++ b/neuronxcc-2.21.33363.0+82129205/0_REGISTRY/0.4.4.dev2/llama/meta-llama/Llama-3.2-1B-Instruct/ae1848094edb282bfdf1.json
@@ -0,0 +1,63 @@
+{
+  "_entry_class": "SingleModelCacheEntry",
+  "_model_id": "meta-llama/Llama-3.2-1B-Instruct",
+  "_task": "text-generation",
+  "architectures": [
+    "LlamaForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "dtype": "bfloat16",
+  "head_dim": 64,
+  "hidden_act": "silu",
+  "hidden_size": 2048,
+  "initializer_range": 0.02,
+  "intermediate_size": 8192,
+  "max_position_embeddings": 131072,
+  "mlp_bias": false,
+  "model_type": "llama",
+  "neuron": {
+    "_serialized_key": "NxDNeuronConfig",
+    "batch_size": 1,
+    "capacity_factor": null,
+    "checkpoint_id": "meta-llama/Llama-3.2-1B-Instruct",
+    "checkpoint_revision": "9213176726f574b556790deb65791e0c5aa438b6",
+    "continuous_batching": false,
+    "ep_degree": 1,
+    "fused_qkv": true,
+    "glu_mlp": true,
+    "local_ranks_size": 2,
+    "max_batch_size": 1,
+    "max_context_length": 8192,
+    "max_topk": 256,
+    "n_active_tokens": 8192,
+    "neuronxcc_version": "2.21.33363.0+82129205",
+    "on_device_sampling": true,
+    "optimum_neuron_version": "0.4.4.dev2",
+    "output_logits": false,
+    "pp_degree": 1,
+    "sequence_length": 8192,
+    "sequence_parallel_enabled": true,
+    "speculation_length": 0,
+    "start_rank_id": 0,
+    "target": "trn1",
+    "torch_dtype": "bfloat16",
+    "tp_degree": 2
+  },
+  "num_attention_heads": 32,
+  "num_hidden_layers": 16,
+  "num_key_value_heads": 8,
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": {
+    "factor": 32.0,
+    "high_freq_factor": 4.0,
+    "low_freq_factor": 1.0,
+    "original_max_position_embeddings": 8192,
+    "rope_type": "llama3"
+  },
+  "rope_theta": 500000.0,
+  "tie_word_embeddings": true,
+  "use_cache": true,
+  "vocab_size": 128256
+}
\ No newline at end of file
diff --git a/neuronxcc-2.21.33363.0+82129205/0_REGISTRY/0.4.4.dev2/llama/meta-llama/Meta-Llama-3.1-8B-Instruct/c0cef3ac9ffe05625f39.json b/neuronxcc-2.21.33363.0+82129205/0_REGISTRY/0.4.4.dev2/llama/meta-llama/Meta-Llama-3.1-8B-Instruct/c0cef3ac9ffe05625f39.json
new file mode 100644
index 0000000000000000000000000000000000000000..72a9b21b7fcf05cb5604bdf118efb17bfeb1d4a7
--- /dev/null
+++ b/neuronxcc-2.21.33363.0+82129205/0_REGISTRY/0.4.4.dev2/llama/meta-llama/Meta-Llama-3.1-8B-Instruct/c0cef3ac9ffe05625f39.json
@@ -0,0 +1,63 @@
+{
+  "_entry_class": "SingleModelCacheEntry",
+  "_model_id": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+  "_task": "text-generation",
+  "architectures": [
+    "LlamaForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "dtype": "bfloat16",
+  "head_dim": 128,
+  "hidden_act": "silu",
+  "hidden_size": 4096,
+  "initializer_range": 0.02,
+  "intermediate_size": 14336,
+  "max_position_embeddings": 131072,
+  "mlp_bias": false,
+  "model_type": "llama",
+  "neuron": {
+    "_serialized_key": "NxDNeuronConfig",
+    "batch_size": 32,
+    "capacity_factor": null,
+    "checkpoint_id": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+    "checkpoint_revision": "0e9e39f249a16976918f6564b8830bc894c89659",
+    "continuous_batching": true,
+    "ep_degree": 1,
+    "fused_qkv": true,
+    "glu_mlp": true,
+    "local_ranks_size": 8,
+    "max_batch_size": 32,
+    "max_context_length": 4096,
+    "max_topk": 256,
+    "n_active_tokens": 4096,
+    "neuronxcc_version": "2.21.33363.0+82129205",
+    "on_device_sampling": true,
+    "optimum_neuron_version": "0.4.4.dev2",
+    "output_logits": false,
+    "pp_degree": 1,
+    "sequence_length": 4096,
+    "sequence_parallel_enabled": false,
+    "speculation_length": 0,
+    "start_rank_id": 0,
+    "target": "trn1",
+    "torch_dtype": "bfloat16",
+    "tp_degree": 8
+  },
+  "num_attention_heads": 32,
+  "num_hidden_layers": 32,
+  "num_key_value_heads": 8,
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": {
+    "factor": 8.0,
+    "high_freq_factor": 4.0,
+    "low_freq_factor": 1.0,
+    "original_max_position_embeddings": 8192,
+    "rope_type": "llama3"
+  },
+  "rope_theta": 500000.0,
+  "tie_word_embeddings": false,
+  "use_cache": true,
+  "vocab_size": 128256
+}
\ No newline at end of file
diff --git a/neuronxcc-2.21.33363.0+82129205/0_REGISTRY/0.4.4.dev2/llama/unsloth/Llama-3.2-1B-Instruct/683e082396a4bd7ab4cb.json b/neuronxcc-2.21.33363.0+82129205/0_REGISTRY/0.4.4.dev2/llama/unsloth/Llama-3.2-1B-Instruct/683e082396a4bd7ab4cb.json
new file mode 100644
index 0000000000000000000000000000000000000000..19b723b2e1c85b685156064a9631a856601258b9
--- /dev/null
+++ b/neuronxcc-2.21.33363.0+82129205/0_REGISTRY/0.4.4.dev2/llama/unsloth/Llama-3.2-1B-Instruct/683e082396a4bd7ab4cb.json
@@ -0,0 +1,64 @@
+{
+  "_entry_class": "SingleModelCacheEntry",
+  "_model_id": "unsloth/Llama-3.2-1B-Instruct",
+  "_task": "text-generation",
+  "architectures": [
+    "LlamaForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "dtype": "bfloat16",
+  "head_dim": 64,
+  "hidden_act": "silu",
+  "hidden_size": 2048,
+  "initializer_range": 0.02,
+  "intermediate_size": 8192,
+  "max_position_embeddings": 131072,
+  "mlp_bias": false,
+  "model_type": "llama",
+  "neuron": {
+    "_serialized_key": "NxDNeuronConfig",
+    "batch_size": 4,
+    "capacity_factor": null,
+    "checkpoint_id": "unsloth/Llama-3.2-1B-Instruct",
+    "checkpoint_revision": "5a8abab4a5d6f164389b1079fb721cfab8d7126c",
+    "continuous_batching": true,
+    "ep_degree": 1,
+    "fused_qkv": true,
+    "glu_mlp": true,
+    "local_ranks_size": 2,
+    "max_batch_size": 4,
+    "max_context_length": 4096,
+    "max_topk": 256,
+    "n_active_tokens": 4096,
+    "neuronxcc_version": "2.21.33363.0+82129205",
+    "on_device_sampling": true,
+    "optimum_neuron_version": "0.4.4.dev2",
+    "output_logits": false,
+    "pp_degree": 1,
+    "sequence_length": 4096,
+    "sequence_parallel_enabled": false,
+    "speculation_length": 0,
+    "start_rank_id": 0,
+    "target": "trn1",
+    "torch_dtype": "bfloat16",
+    "tp_degree": 2
+  },
+  "num_attention_heads": 32,
+  "num_hidden_layers": 16,
+  "num_key_value_heads": 8,
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": {
+    "factor": 32.0,
+    "high_freq_factor": 4.0,
+    "low_freq_factor": 1.0,
+    "original_max_position_embeddings": 8192,
+    "rope_type": "llama3"
+  },
+  "rope_theta": 500000.0,
+  "tie_word_embeddings": true,
+  "unsloth_fixed": true,
+  "use_cache": true,
+  "vocab_size": 128256
+}
\ No newline at end of file
diff --git a/neuronxcc-2.21.33363.0+82129205/0_REGISTRY/0.4.4.dev2/llama/unsloth/Llama-3.2-1B-Instruct/79f88038b5962d921f3d.json b/neuronxcc-2.21.33363.0+82129205/0_REGISTRY/0.4.4.dev2/llama/unsloth/Llama-3.2-1B-Instruct/79f88038b5962d921f3d.json
new file mode 100644
index 0000000000000000000000000000000000000000..785a9e2a711815916abfb6b200c0c4b993ac91f4
--- /dev/null
+++ b/neuronxcc-2.21.33363.0+82129205/0_REGISTRY/0.4.4.dev2/llama/unsloth/Llama-3.2-1B-Instruct/79f88038b5962d921f3d.json
@@ -0,0 +1,64 @@
+{
+  "_entry_class": "SingleModelCacheEntry",
+  "_model_id": "unsloth/Llama-3.2-1B-Instruct",
+  "_task": "text-generation",
+  "architectures": [
+    "LlamaForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "dtype": "bfloat16",
+  "head_dim": 64,
+  "hidden_act": "silu",
+  "hidden_size": 2048,
+  "initializer_range": 0.02,
+  "intermediate_size": 8192,
+  "max_position_embeddings": 131072,
+  "mlp_bias": false,
+  "model_type": "llama",
+  "neuron": {
+    "_serialized_key": "NxDNeuronConfig",
+    "batch_size": 1,
+    "capacity_factor": null,
+    "checkpoint_id": "unsloth/Llama-3.2-1B-Instruct",
+    "checkpoint_revision": "5a8abab4a5d6f164389b1079fb721cfab8d7126c",
+    "continuous_batching": false,
+    "ep_degree": 1,
+    "fused_qkv": true,
+    "glu_mlp": true,
+    "local_ranks_size": 2,
+    "max_batch_size": 1,
+    "max_context_length": 8192,
+    "max_topk": 256,
+    "n_active_tokens": 8192,
+    "neuronxcc_version": "2.21.33363.0+82129205",
+    "on_device_sampling": true,
+    "optimum_neuron_version": "0.4.4.dev2",
+    "output_logits": false,
+    "pp_degree": 1,
+    "sequence_length": 8192,
+    "sequence_parallel_enabled": true,
+    "speculation_length": 0,
+    "start_rank_id": 0,
+    "target": "trn1",
+    "torch_dtype": "bfloat16",
+    "tp_degree": 2
+  },
+  "num_attention_heads": 32,
+  "num_hidden_layers": 16,
+  "num_key_value_heads": 8,
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": {
+    "factor": 32.0,
+    "high_freq_factor": 4.0,
+    "low_freq_factor": 1.0,
+    "original_max_position_embeddings": 8192,
+    "rope_type": "llama3"
+  },
+  "rope_theta": 500000.0,
+  "tie_word_embeddings": true,
+  "unsloth_fixed": true,
+  "use_cache": true,
+  "vocab_size": 128256
+}
\ No newline at end of file
diff --git a/neuronxcc-2.21.33363.0+82129205/0_REGISTRY/0.4.4.dev2/llama/unsloth/Llama-3.2-1B-Instruct/8b175859f95b9fa5a1db.json b/neuronxcc-2.21.33363.0+82129205/0_REGISTRY/0.4.4.dev2/llama/unsloth/Llama-3.2-1B-Instruct/8b175859f95b9fa5a1db.json
new file mode 100644
index 0000000000000000000000000000000000000000..519ce3d65d6de50d7a26f29e354c330f23f3bd79
--- /dev/null
+++ b/neuronxcc-2.21.33363.0+82129205/0_REGISTRY/0.4.4.dev2/llama/unsloth/Llama-3.2-1B-Instruct/8b175859f95b9fa5a1db.json
@@ -0,0 +1,63 @@
+{
+  "_entry_class": "SingleModelCacheEntry",
+  "_model_id": "unsloth/Llama-3.2-1B-Instruct",
+  "_task": "text-generation",
+  "architectures": [
+    "LlamaForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "dtype": "bfloat16",
+  "head_dim": 64,
+  "hidden_act": "silu",
+  "hidden_size": 2048,
+  "initializer_range": 0.02,
+  "intermediate_size": 8192,
+  "max_position_embeddings": 131072,
+  "mlp_bias": false,
+  "model_type": "llama",
+  "neuron": {
+    "_serialized_key": "NxDNeuronConfig",
+    "batch_size": 4,
+    "capacity_factor": null,
+    "checkpoint_id": "unsloth/Llama-3.2-1B-Instruct",
+    "checkpoint_revision": "5a8abab4a5d6f164389b1079fb721cfab8d7126c",
+    "continuous_batching": true,
+    "ep_degree": 1,
+    "fused_qkv": true,
+    "glu_mlp": true,
+    "local_ranks_size": 2,
+    "max_batch_size": 4,
+    "max_context_length": 4096,
+    "max_topk": 256,
+    "n_active_tokens": 4096,
+    "neuronxcc_version": "2.21.33363.0+82129205",
+    "on_device_sampling": true,
+    "optimum_neuron_version": "0.4.4.dev2",
+    "output_logits": false,
+    "pp_degree": 1,
+    "sequence_length": 4096,
+    "speculation_length": 0,
+    "start_rank_id": 0,
+    "target": "trn1",
+    "torch_dtype": "bfloat16",
+    "tp_degree": 2
+  },
+  "num_attention_heads": 32,
+  "num_hidden_layers": 16,
+  "num_key_value_heads": 8,
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": {
+    "factor": 32.0,
+    "high_freq_factor": 4.0,
+    "low_freq_factor": 1.0,
+    "original_max_position_embeddings": 8192,
+    "rope_type": "llama3"
+  },
+  "rope_theta": 500000.0,
+  "tie_word_embeddings": true,
+  "unsloth_fixed": true,
+  "use_cache": true,
+  "vocab_size": 128256
+}
\ No newline at end of file
diff --git a/neuronxcc-2.21.33363.0+82129205/0_REGISTRY/0.4.4.dev2/llama/unsloth/Llama-3.2-1B-Instruct/93fa413575d5ccc52f58.json b/neuronxcc-2.21.33363.0+82129205/0_REGISTRY/0.4.4.dev2/llama/unsloth/Llama-3.2-1B-Instruct/93fa413575d5ccc52f58.json
new file mode 100644
index 0000000000000000000000000000000000000000..1bdb8ae36f525bb1d7152dc523bc2eadd2de1549
--- /dev/null
+++ b/neuronxcc-2.21.33363.0+82129205/0_REGISTRY/0.4.4.dev2/llama/unsloth/Llama-3.2-1B-Instruct/93fa413575d5ccc52f58.json
@@ -0,0 +1,63 @@
+{
+  "_entry_class": "SingleModelCacheEntry",
+  "_model_id": "unsloth/Llama-3.2-1B-Instruct",
+  "_task": "text-generation",
+  "architectures": [
+    "LlamaForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "dtype": "bfloat16",
+  "head_dim": 64,
+  "hidden_act": "silu",
+  "hidden_size": 2048,
+  "initializer_range": 0.02,
+  "intermediate_size": 8192,
+  "max_position_embeddings": 131072,
+  "mlp_bias": false,
+  "model_type": "llama",
+  "neuron": {
+    "_serialized_key": "NxDNeuronConfig",
+    "batch_size": 1,
+    "capacity_factor": null,
+    "checkpoint_id": "unsloth/Llama-3.2-1B-Instruct",
+    "checkpoint_revision": "5a8abab4a5d6f164389b1079fb721cfab8d7126c",
+    "continuous_batching": false,
+    "ep_degree": 1,
+    "fused_qkv": true,
+    "glu_mlp": true,
+    "local_ranks_size": 2,
+    "max_batch_size": 1,
+    "max_context_length": 8192,
+    "max_topk": 256,
+    "n_active_tokens": 8192,
+    "neuronxcc_version": "2.21.33363.0+82129205",
+    "on_device_sampling": true,
+    "optimum_neuron_version": "0.4.4.dev2",
+    "output_logits": false,
+    "pp_degree": 1,
+    "sequence_length": 8192,
+    "speculation_length": 0,
+    "start_rank_id": 0,
+    "target": "trn1",
+    "torch_dtype": "bfloat16",
+    "tp_degree": 2
+  },
+  "num_attention_heads": 32,
+  "num_hidden_layers": 16,
+  "num_key_value_heads": 8,
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": {
+    "factor": 32.0,
+    "high_freq_factor": 4.0,
+    "low_freq_factor": 1.0,
+    "original_max_position_embeddings": 8192,
+    "rope_type": "llama3"
+  },
+  "rope_theta": 500000.0,
+  "tie_word_embeddings": true,
+  "unsloth_fixed": true,
+  "use_cache": true,
+  "vocab_size": 128256
+}
\ No newline at end of file
diff --git a/neuronxcc-2.21.33363.0+82129205/0_REGISTRY/0.4.4.dev2/llama/unsloth/Llama-3.2-1B-Instruct/f453e7e94ffe46947fc0.json b/neuronxcc-2.21.33363.0+82129205/0_REGISTRY/0.4.4.dev2/llama/unsloth/Llama-3.2-1B-Instruct/f453e7e94ffe46947fc0.json
new file mode 100644
index 0000000000000000000000000000000000000000..8f2fe3703a549b8955a21cc2e54eb3a508f6406d
--- /dev/null
+++ b/neuronxcc-2.21.33363.0+82129205/0_REGISTRY/0.4.4.dev2/llama/unsloth/Llama-3.2-1B-Instruct/f453e7e94ffe46947fc0.json
@@ -0,0 +1,64 @@
+{
+  "_entry_class": "SingleModelCacheEntry",
+  "_model_id": "unsloth/Llama-3.2-1B-Instruct",
+  "_task": "text-generation",
+  "architectures": [
+    "LlamaForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "dtype": "bfloat16",
+  "head_dim": 64,
+  "hidden_act": "silu",
+  "hidden_size": 2048,
+  "initializer_range": 0.02,
+  "intermediate_size": 8192,
+  "max_position_embeddings": 131072,
+  "mlp_bias": false,
+  "model_type": "llama",
+  "neuron": {
+    "_serialized_key": "NxDNeuronConfig",
+    "batch_size": 1,
+    "capacity_factor": null,
+    "checkpoint_id": "unsloth/Llama-3.2-1B-Instruct",
+    "checkpoint_revision": "5a8abab4a5d6f164389b1079fb721cfab8d7126c",
+    "continuous_batching": false,
+    "ep_degree": 1,
+    "fused_qkv": true,
+    "glu_mlp": true,
+    "local_ranks_size": 2,
+    "max_batch_size": 1,
+    "max_context_length": 8192,
+    "max_topk": 256,
+    "n_active_tokens": 8192,
+    "neuronxcc_version": "2.21.33363.0+82129205",
+    "on_device_sampling": false,
+    "optimum_neuron_version": "0.4.4.dev2",
+    "output_logits": false,
+    "pp_degree": 1,
+    "sequence_length": 8192,
+    "sequence_parallel_enabled": true,
+    "speculation_length": 0,
+    "start_rank_id": 0,
+    "target": "trn1",
+    "torch_dtype": "bfloat16",
+    "tp_degree": 2
+  },
+  "num_attention_heads": 32,
+  "num_hidden_layers": 16,
+  "num_key_value_heads": 8,
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": {
+    "factor": 32.0,
+    "high_freq_factor": 4.0,
+    "low_freq_factor": 1.0,
+    "original_max_position_embeddings": 8192,
+    "rope_type": "llama3"
+  },
+  "rope_theta": 500000.0,
+  "tie_word_embeddings": true,
+  "unsloth_fixed": true,
+  "use_cache": true,
+  "vocab_size": 128256
+}
\ No newline at end of file
diff --git a/neuronxcc-2.21.33363.0+82129205/0_REGISTRY/0.4.4.dev2/llama/unsloth/llama-3.2-1B-Instruct/c093ba8faccfd3d9f2c2.json b/neuronxcc-2.21.33363.0+82129205/0_REGISTRY/0.4.4.dev2/llama/unsloth/llama-3.2-1B-Instruct/c093ba8faccfd3d9f2c2.json
new file mode 100644
index 0000000000000000000000000000000000000000..c67f9544ca0b119477840bd1ff5a7cc1855b3a86
--- /dev/null
+++ b/neuronxcc-2.21.33363.0+82129205/0_REGISTRY/0.4.4.dev2/llama/unsloth/llama-3.2-1B-Instruct/c093ba8faccfd3d9f2c2.json
@@ -0,0 +1,64 @@
+{
+  "_entry_class": "SingleModelCacheEntry",
+  "_model_id": "unsloth/llama-3.2-1B-Instruct",
+  "_task": "text-generation",
+  "architectures": [
+    "LlamaForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "dtype": "bfloat16",
+  "head_dim": 64,
+  "hidden_act": "silu",
+  "hidden_size": 2048,
+  "initializer_range": 0.02,
+  "intermediate_size": 8192,
+  "max_position_embeddings": 131072,
+  "mlp_bias": false,
+  "model_type": "llama",
+  "neuron": {
+    "_serialized_key": "NxDNeuronConfig",
+    "batch_size": 4,
+    "capacity_factor": null,
+    "checkpoint_id": "unsloth/llama-3.2-1B-Instruct",
+    "checkpoint_revision": "5a8abab4a5d6f164389b1079fb721cfab8d7126c",
+    "continuous_batching": true,
+    "ep_degree": 1,
+    "fused_qkv": true,
+    "glu_mlp": true,
+    "local_ranks_size": 2,
+    "max_batch_size": 4,
+    "max_context_length": 4096,
+    "max_topk": 256,
+    "n_active_tokens": 4096,
+    "neuronxcc_version": "2.21.33363.0+82129205",
+    "on_device_sampling": false,
+    "optimum_neuron_version": "0.4.4.dev2",
+    "output_logits": false,
+    "pp_degree": 1,
+    "sequence_length": 4096,
+    "sequence_parallel_enabled": false,
+    "speculation_length": 0,
+    "start_rank_id": 0,
+    "target": "trn1",
+    "torch_dtype": "bfloat16",
+    "tp_degree": 2
+  },
+  "num_attention_heads": 32,
+  "num_hidden_layers": 16,
+  "num_key_value_heads": 8,
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": {
+    "factor": 32.0,
+    "high_freq_factor": 4.0,
+    "low_freq_factor": 1.0,
+    "original_max_position_embeddings": 8192,
+    "rope_type": "llama3"
+  },
+  "rope_theta": 500000.0,
+  "tie_word_embeddings": true,
+  "unsloth_fixed": true,
+  "use_cache": true,
+  "vocab_size": 128256
+}
\ No newline at end of file
diff --git a/neuronxcc-2.21.33363.0+82129205/0_REGISTRY/0.4.4.dev2/phi3/microsoft/Phi-3.5-mini-instruct/86c6ae54af67ddf4b3b8.json b/neuronxcc-2.21.33363.0+82129205/0_REGISTRY/0.4.4.dev2/phi3/microsoft/Phi-3.5-mini-instruct/86c6ae54af67ddf4b3b8.json
new file mode 100644
index 0000000000000000000000000000000000000000..20874db847486c5c5c502b480575f24444324c65
--- /dev/null
+++ b/neuronxcc-2.21.33363.0+82129205/0_REGISTRY/0.4.4.dev2/phi3/microsoft/Phi-3.5-mini-instruct/86c6ae54af67ddf4b3b8.json
@@ -0,0 +1,165 @@
+{
+  "_entry_class": "SingleModelCacheEntry",
+  "_model_id": "microsoft/Phi-3.5-mini-instruct",
+  "_task": "text-generation",
+  "architectures": [
+    "Phi3ForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "auto_map": {
+    "AutoConfig": "configuration_phi3.Phi3Config",
+    "AutoModelForCausalLM": "modeling_phi3.Phi3ForCausalLM"
+  },
+  "dtype": "bfloat16",
+  "embd_pdrop": 0.0,
+  "hidden_act": "silu",
+  "hidden_size": 3072,
+  "initializer_range": 0.02,
+  "intermediate_size": 8192,
+  "max_position_embeddings": 131072,
+  "model_type": "phi3",
+  "neuron": {
+    "_serialized_key": "NxDNeuronConfig",
+    "batch_size": 1,
+    "capacity_factor": null,
+    "checkpoint_id": "microsoft/Phi-3.5-mini-instruct",
+    "checkpoint_revision": "2fe192450127e6a83f7441aef6e3ca586c338b77",
+    "continuous_batching": false,
+    "ep_degree": 1,
+    "fused_qkv": true,
+    "glu_mlp": true,
+    "local_ranks_size": 2,
+    "max_batch_size": 1,
+    "max_context_length": 8192,
+    "max_topk": 256,
+    "n_active_tokens": 8192,
+    "neuronxcc_version": "2.21.33363.0+82129205",
+    "on_device_sampling": true,
+    "optimum_neuron_version": "0.4.4.dev2",
+    "output_logits": false,
+    "pp_degree": 1,
+    "sequence_length": 8192,
+    "sequence_parallel_enabled": true,
+    "speculation_length": 0,
+    "start_rank_id": 0,
+    "target": "trn1",
+    "torch_dtype": "bfloat16",
+    "tp_degree": 2
+  },
+  "num_attention_heads": 32,
+  "num_hidden_layers": 32,
+  "num_key_value_heads": 32,
+  "original_max_position_embeddings": 4096,
+  "partial_rotary_factor": 1.0,
+  "resid_pdrop": 0.0,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": {
+    "long_factor": [
+      1.0800000429153442,
+      1.1100000143051147,
+      1.1399999856948853,
+      1.340000033378601,
+      1.5899999141693115,
+      1.600000023841858,
+      1.6200000047683716,
+      2.620000123977661,
+      3.2300000190734863,
+      3.2300000190734863,
+      4.789999961853027,
+      7.400000095367432,
+      7.700000286102295,
+      9.09000015258789,
+      12.199999809265137,
+      17.670000076293945,
+      24.46000099182129,
+      28.57000160217285,
+      30.420001983642578,
+      30.840002059936523,
+      32.590003967285156,
+      32.93000411987305,
+      42.320003509521484,
+      44.96000289916992,
+      50.340003967285156,
+      50.45000457763672,
+      57.55000305175781,
+      57.93000411987305,
+      58.21000289916992,
+      60.1400032043457,
+      62.61000442504883,
+      62.62000274658203,
+      62.71000289916992,
+      63.1400032043457,
+      63.1400032043457,
+      63.77000427246094,
+      63.93000411987305,
+      63.96000289916992,
+      63.970001220703125,
+      64.02999877929688,
+      64.06999969482422,
+      64.08000183105469,
+      64.12000274658203,
+      64.41000366210938,
+      64.4800033569336,
+      64.51000213623047,
+      64.52999877929688,
+      64.83999633789062
+    ],
+    "short_factor": [
+      1.0,
+      1.0199999809265137,
+      1.0299999713897705,
+      1.0299999713897705,
+      1.0499999523162842,
+      1.0499999523162842,
+      1.0499999523162842,
+      1.0499999523162842,
+      1.0499999523162842,
+      1.0699999332427979,
+      1.0999999046325684,
+      1.1099998950958252,
+      1.1599998474121094,
+      1.1599998474121094,
+      1.1699998378753662,
+      1.2899998426437378,
+      1.339999794960022,
+      1.679999828338623,
+      1.7899998426437378,
+      1.8199998140335083,
+      1.8499997854232788,
+      1.8799997568130493,
+      1.9099997282028198,
+      1.9399996995925903,
+      1.9899996519088745,
+      2.0199997425079346,
+      2.0199997425079346,
+      2.0199997425079346,
+      2.0199997425079346,
+      2.0199997425079346,
+      2.0199997425079346,
+      2.0299997329711914,
+      2.0299997329711914,
+      2.0299997329711914,
+      2.0299997329711914,
+      2.0299997329711914,
+      2.0299997329711914,
+      2.0299997329711914,
+      2.0299997329711914,
+      2.0299997329711914,
+      2.0799996852874756,
+      2.0899996757507324,
+      2.189999580383301,
+      2.2199995517730713,
+      2.5899994373321533,
+      2.729999542236328,
+      2.749999523162842,
+      2.8399994373321533
+    ],
+    "type": "longrope"
+  },
+  "rope_theta": 10000.0,
+  "sliding_window": 262144,
+  "tie_word_embeddings": false,
+  "use_cache": true,
+  "vocab_size": 32064
+}
\ No newline at end of file
diff --git a/neuronxcc-2.21.33363.0+82129205/0_REGISTRY/0.4.4.dev2/phi3/microsoft/Phi-3.5-mini-instruct/bb3ccea738a2a8e75fe8.json b/neuronxcc-2.21.33363.0+82129205/0_REGISTRY/0.4.4.dev2/phi3/microsoft/Phi-3.5-mini-instruct/bb3ccea738a2a8e75fe8.json
new file mode 100644
index 0000000000000000000000000000000000000000..16da7d1cce4549f120e66756284a4ccade63c069
--- /dev/null
+++ b/neuronxcc-2.21.33363.0+82129205/0_REGISTRY/0.4.4.dev2/phi3/microsoft/Phi-3.5-mini-instruct/bb3ccea738a2a8e75fe8.json
@@ -0,0 +1,165 @@
+{
+  "_entry_class": "SingleModelCacheEntry",
+  "_model_id": "microsoft/Phi-3.5-mini-instruct",
+  "_task": "text-generation",
+  "architectures": [
+    "Phi3ForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "auto_map": {
+    "AutoConfig": "configuration_phi3.Phi3Config",
+    "AutoModelForCausalLM": "modeling_phi3.Phi3ForCausalLM"
+  },
+  "dtype": "bfloat16",
+  "embd_pdrop": 0.0,
+  "hidden_act": "silu",
+  "hidden_size": 3072,
+  "initializer_range": 0.02,
+  "intermediate_size": 8192,
+  "max_position_embeddings": 131072,
+  "model_type": "phi3",
+  "neuron": {
+    "_serialized_key": "NxDNeuronConfig",
+    "batch_size": 4,
+    "capacity_factor": null,
+    "checkpoint_id": "microsoft/Phi-3.5-mini-instruct",
+    "checkpoint_revision": "2fe192450127e6a83f7441aef6e3ca586c338b77",
+    "continuous_batching": true,
+    "ep_degree": 1,
+    "fused_qkv": true,
+    "glu_mlp": true,
+    "local_ranks_size": 2,
+    "max_batch_size": 4,
+    "max_context_length": 4096,
+    "max_topk": 256,
+    "n_active_tokens": 4096,
+    "neuronxcc_version": "2.21.33363.0+82129205",
+    "on_device_sampling": true,
+    "optimum_neuron_version": "0.4.4.dev2",
+    "output_logits": false,
+    "pp_degree": 1,
+    "sequence_length": 4096,
+    "sequence_parallel_enabled": false,
+    "speculation_length": 0,
+    "start_rank_id": 0,
+    "target": "trn1",
+    "torch_dtype": "bfloat16",
+    "tp_degree": 2
+  },
+  "num_attention_heads": 32,
+  "num_hidden_layers": 32,
+  "num_key_value_heads": 32,
+  "original_max_position_embeddings": 4096,
+  "partial_rotary_factor": 1.0,
+  "resid_pdrop": 0.0,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": {
+    "long_factor": [
+      1.0800000429153442,
+      1.1100000143051147,
+      1.1399999856948853,
+      1.340000033378601,
+      1.5899999141693115,
+      1.600000023841858,
+      1.6200000047683716,
+      2.620000123977661,
+      3.2300000190734863,
+      3.2300000190734863,
+      4.789999961853027,
+      7.400000095367432,
+      7.700000286102295,
+      9.09000015258789,
+      12.199999809265137,
+      17.670000076293945,
+      24.46000099182129,
+      28.57000160217285,
+      30.420001983642578,
+      30.840002059936523,
+      32.590003967285156,
+      32.93000411987305,
+      42.320003509521484,
+      44.96000289916992,
+      50.340003967285156,
+      50.45000457763672,
+      57.55000305175781,
+      57.93000411987305,
+      58.21000289916992,
+      60.1400032043457,
+      62.61000442504883,
+      62.62000274658203,
+      62.71000289916992,
+      63.1400032043457,
+      63.1400032043457,
+      63.77000427246094,
+      63.93000411987305,
+      63.96000289916992,
+      63.970001220703125,
+      64.02999877929688,
+      64.06999969482422,
+      64.08000183105469,
+      64.12000274658203,
+      64.41000366210938,
+      64.4800033569336,
+      64.51000213623047,
+      64.52999877929688,
+      64.83999633789062
+    ],
+    "short_factor": [
+      1.0,
+      1.0199999809265137,
+      1.0299999713897705,
+      1.0299999713897705,
+      1.0499999523162842,
+      1.0499999523162842,
+      1.0499999523162842,
+      1.0499999523162842,
+      1.0499999523162842,
+      1.0699999332427979,
+      1.0999999046325684,
+      1.1099998950958252,
+      1.1599998474121094,
+      1.1599998474121094,
+      1.1699998378753662,
+      1.2899998426437378,
+      1.339999794960022,
+      1.679999828338623,
+      1.7899998426437378,
+      1.8199998140335083,
+      1.8499997854232788,
+      1.8799997568130493,
+      1.9099997282028198,
+      1.9399996995925903,
+      1.9899996519088745,
+      2.0199997425079346,
+      2.0199997425079346,
+      2.0199997425079346,
+      2.0199997425079346,
+      2.0199997425079346,
+      2.0199997425079346,
+      2.0299997329711914,
+      2.0299997329711914,
+      2.0299997329711914,
+      2.0299997329711914,
+      2.0299997329711914,
+      2.0299997329711914,
+      2.0299997329711914,
+      2.0299997329711914,
+      2.0299997329711914,
+      2.0799996852874756,
+      2.0899996757507324,
+      2.189999580383301,
+      2.2199995517730713,
+      2.5899994373321533,
+      2.729999542236328,
+      2.749999523162842,
+      2.8399994373321533
+    ],
+    "type": "longrope"
+  },
+  "rope_theta": 10000.0,
+  "sliding_window": 262144,
+  "tie_word_embeddings": false,
+  "use_cache": true,
+  "vocab_size": 32064
+}
\ No newline at end of file
diff --git a/neuronxcc-2.21.33363.0+82129205/0_REGISTRY/0.4.4.dev2/qwen2/Qwen/Qwen2.5-0.5B/53917a90164d81443a08.json b/neuronxcc-2.21.33363.0+82129205/0_REGISTRY/0.4.4.dev2/qwen2/Qwen/Qwen2.5-0.5B/53917a90164d81443a08.json
new file mode 100644
index 0000000000000000000000000000000000000000..d5f5b06b4b1607e9a1c7213d5a9c1c63276383d7
--- /dev/null
+++ b/neuronxcc-2.21.33363.0+82129205/0_REGISTRY/0.4.4.dev2/qwen2/Qwen/Qwen2.5-0.5B/53917a90164d81443a08.json
@@ -0,0 +1,83 @@
+{
+  "_entry_class": "SingleModelCacheEntry",
+  "_model_id": "Qwen/Qwen2.5-0.5B",
+  "_task": "text-generation",
+  "architectures": [
+    "Qwen2ForCausalLM"
+  ],
+  "attention_dropout": 0.0,
+  "dtype": "bfloat16",
+  "hidden_act": "silu",
+  "hidden_size": 896,
+  "initializer_range": 0.02,
+  "intermediate_size": 4864,
+  "layer_types": [
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention"
+  ],
+  "max_position_embeddings": 32768,
+  "max_window_layers": 24,
+  "model_type": "qwen2",
+  "neuron": {
+    "_serialized_key": "NxDNeuronConfig",
+    "batch_size": 4,
+    "capacity_factor": null,
+    "checkpoint_id": "Qwen/Qwen2.5-0.5B",
+    "checkpoint_revision": "060db6499f32faf8b98477b0a26969ef7d8b9987",
+    "continuous_batching": true,
+    "ep_degree": 1,
+    "fused_qkv": false,
+    "glu_mlp": true,
+    "local_ranks_size": 2,
+    "max_batch_size": 4,
+    "max_context_length": 4096,
+    "max_topk": 256,
+    "n_active_tokens": 4096,
+    "neuronxcc_version": "2.21.33363.0+82129205",
+    "on_device_sampling": false,
+    "optimum_neuron_version": "0.4.4.dev2",
+    "output_logits": false,
+    "pp_degree": 1,
+    "sequence_length": 4096,
+    "sequence_parallel_enabled": false,
+    "speculation_length": 0,
+    "start_rank_id": 0,
+    "target": "trn1",
+    "torch_dtype": "bfloat16",
+    "tp_degree": 2
+  },
+  "num_attention_heads": 14,
+  "num_hidden_layers": 24,
+  "num_key_value_heads": 2,
+  "rms_norm_eps": 1e-06,
+  "rope_scaling": null,
+  "rope_theta": 1000000.0,
+  "sliding_window": null,
+  "tie_word_embeddings": true,
+  "use_cache": true,
+  "use_mrope": false,
+  "use_sliding_window": false,
+  "vocab_size": 151936
+}
\ No newline at end of file
diff --git a/neuronxcc-2.21.33363.0+82129205/0_REGISTRY/0.4.4.dev2/qwen2/Qwen/Qwen2.5-0.5B/b558da325694139bbaaa.json b/neuronxcc-2.21.33363.0+82129205/0_REGISTRY/0.4.4.dev2/qwen2/Qwen/Qwen2.5-0.5B/b558da325694139bbaaa.json
new file mode 100644
index 0000000000000000000000000000000000000000..aa529396bfbf431cb4b76123897a0150ca3a594c
--- /dev/null
+++ b/neuronxcc-2.21.33363.0+82129205/0_REGISTRY/0.4.4.dev2/qwen2/Qwen/Qwen2.5-0.5B/b558da325694139bbaaa.json
@@ -0,0 +1,83 @@
+{
+  "_entry_class": "SingleModelCacheEntry",
+  "_model_id": "Qwen/Qwen2.5-0.5B",
+  "_task": "text-generation",
+  "architectures": [
+    "Qwen2ForCausalLM"
+  ],
+  "attention_dropout": 0.0,
+  "dtype": "bfloat16",
+  "hidden_act": "silu",
+  "hidden_size": 896,
+  "initializer_range": 0.02,
+  "intermediate_size": 4864,
+  "layer_types": [
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention"
+  ],
+  "max_position_embeddings": 32768,
+  "max_window_layers": 24,
+  "model_type": "qwen2",
+  "neuron": {
+    "_serialized_key": "NxDNeuronConfig",
+    "batch_size": 1,
+    "capacity_factor": null,
+    "checkpoint_id": "Qwen/Qwen2.5-0.5B",
+    "checkpoint_revision": "060db6499f32faf8b98477b0a26969ef7d8b9987",
+    "continuous_batching": false,
+    "ep_degree": 1,
+    "fused_qkv": false,
+    "glu_mlp": true,
+    "local_ranks_size": 2,
+    "max_batch_size": 1,
+    "max_context_length": 8192,
+    "max_topk": 256,
+    "n_active_tokens": 8192,
+    "neuronxcc_version": "2.21.33363.0+82129205",
+    "on_device_sampling": true,
+    "optimum_neuron_version": "0.4.4.dev2",
+    "output_logits": false,
+    "pp_degree": 1,
+    "sequence_length": 8192,
+    "sequence_parallel_enabled": false,
+    "speculation_length": 0,
+    "start_rank_id": 0,
+    "target": "trn1",
+    "torch_dtype": "bfloat16",
+    "tp_degree": 2
+  },
+  "num_attention_heads": 14,
+  "num_hidden_layers": 24,
+  "num_key_value_heads": 2,
+  "rms_norm_eps": 1e-06,
+  "rope_scaling": null,
+  "rope_theta": 1000000.0,
+  "sliding_window": null,
+  "tie_word_embeddings": true,
+  "use_cache": true,
+  "use_mrope": false,
+  "use_sliding_window": false,
+  "vocab_size": 151936
+}
\ No newline at end of file
diff --git a/neuronxcc-2.21.33363.0+82129205/0_REGISTRY/0.4.4.dev2/qwen3/Qwen/Qwen3-0.6B/1b21de7b61f3d14bbbff.json b/neuronxcc-2.21.33363.0+82129205/0_REGISTRY/0.4.4.dev2/qwen3/Qwen/Qwen3-0.6B/1b21de7b61f3d14bbbff.json
new file mode 100644
index 0000000000000000000000000000000000000000..15450fbf77dab14801bd8707e93082ef875a2095
--- /dev/null
+++ b/neuronxcc-2.21.33363.0+82129205/0_REGISTRY/0.4.4.dev2/qwen3/Qwen/Qwen3-0.6B/1b21de7b61f3d14bbbff.json
@@ -0,0 +1,88 @@
+{
+  "_entry_class": "SingleModelCacheEntry",
+  "_model_id": "Qwen/Qwen3-0.6B",
+  "_task": "text-generation",
+  "architectures": [
+    "Qwen3ForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "dtype": "bfloat16",
+  "head_dim": 128,
+  "hidden_act": "silu",
+  "hidden_size": 1024,
+  "initializer_range": 0.02,
+  "intermediate_size": 3072,
+  "layer_types": [
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention"
+  ],
+  "max_position_embeddings": 40960,
+  "max_window_layers": 28,
+  "model_type": "qwen3",
+  "neuron": {
+    "_serialized_key": "NxDNeuronConfig",
+    "batch_size": 4,
+    "capacity_factor": null,
+    "checkpoint_id": "Qwen/Qwen3-0.6B",
+    "checkpoint_revision": "c1899de289a04d12100db370d81485cdf75e47ca",
+    "continuous_batching": true,
+    "ep_degree": 1,
+    "fused_qkv": true,
+    "glu_mlp": true,
+    "local_ranks_size": 2,
+    "max_batch_size": 4,
+    "max_context_length": 4096,
+    "max_topk": 256,
+    "n_active_tokens": 4096,
+    "neuronxcc_version": "2.21.33363.0+82129205",
+    "on_device_sampling": false,
+    "optimum_neuron_version": "0.4.4.dev2",
+    "output_logits": false,
+    "pp_degree": 1,
+    "sequence_length": 4096,
+    "sequence_parallel_enabled": false,
+    "speculation_length": 0,
+    "start_rank_id": 0,
+    "target": "trn1",
+    "torch_dtype": "bfloat16",
+    "tp_degree": 2
+  },
+  "num_attention_heads": 16,
+  "num_hidden_layers": 28,
+  "num_key_value_heads": 8,
+  "rms_norm_eps": 1e-06,
+  "rope_scaling": null,
+  "rope_theta": 1000000,
+  "sliding_window": null,
+  "tie_word_embeddings": true,
+  "use_cache": true,
+  "use_sliding_window": false,
+  "vocab_size": 151936
+}
\ No newline at end of file
diff --git a/neuronxcc-2.21.33363.0+82129205/0_REGISTRY/0.4.4.dev2/qwen3/Qwen/Qwen3-0.6B/ad8740a6bddc8f6b87a7.json b/neuronxcc-2.21.33363.0+82129205/0_REGISTRY/0.4.4.dev2/qwen3/Qwen/Qwen3-0.6B/ad8740a6bddc8f6b87a7.json
new file mode 100644
index 0000000000000000000000000000000000000000..ed61ff2eb808ab3770ee5944010f91345ac933a7
--- /dev/null
+++ b/neuronxcc-2.21.33363.0+82129205/0_REGISTRY/0.4.4.dev2/qwen3/Qwen/Qwen3-0.6B/ad8740a6bddc8f6b87a7.json
@@ -0,0 +1,88 @@
+{
+  "_entry_class": "SingleModelCacheEntry",
+  "_model_id": "Qwen/Qwen3-0.6B",
+  "_task": "text-generation",
+  "architectures": [
+    "Qwen3ForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "dtype": "bfloat16",
+  "head_dim": 128,
+  "hidden_act": "silu",
+  "hidden_size": 1024,
+  "initializer_range": 0.02,
+  "intermediate_size": 3072,
+  "layer_types": [
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention"
+  ],
+  "max_position_embeddings": 40960,
+  "max_window_layers": 28,
+  "model_type": "qwen3",
+  "neuron": {
+    "_serialized_key": "NxDNeuronConfig",
+    "batch_size": 1,
+    "capacity_factor": null,
+    "checkpoint_id": "Qwen/Qwen3-0.6B",
+    "checkpoint_revision": "c1899de289a04d12100db370d81485cdf75e47ca",
+    "continuous_batching": false,
+    "ep_degree": 1,
+    "fused_qkv": true,
+    "glu_mlp": true,
+    "local_ranks_size": 2,
+    "max_batch_size": 1,
+    "max_context_length": 8192,
+    "max_topk": 256,
+    "n_active_tokens": 8192,
+    "neuronxcc_version": "2.21.33363.0+82129205",
+    "on_device_sampling": true,
+    "optimum_neuron_version": "0.4.4.dev2",
+    "output_logits": false,
+    "pp_degree": 1,
+    "sequence_length": 8192,
+    "sequence_parallel_enabled": false,
+    "speculation_length": 0,
+    "start_rank_id": 0,
+    "target": "trn1",
+    "torch_dtype": "bfloat16",
+    "tp_degree": 2
+  },
+  "num_attention_heads": 16,
+  "num_hidden_layers": 28,
+  "num_key_value_heads": 8,
+  "rms_norm_eps": 1e-06,
+  "rope_scaling": null,
+  "rope_theta": 1000000,
+  "sliding_window": null,
+  "tie_word_embeddings": true,
+  "use_cache": true,
+  "use_sliding_window": false,
+  "vocab_size": 151936
+}
\ No newline at end of file
diff --git a/neuronxcc-2.21.33363.0+82129205/0_REGISTRY/0.4.4.dev2/smollm3/HuggingFaceTB/SmolLM3-3B/0165b063d3dfd73ec9af.json b/neuronxcc-2.21.33363.0+82129205/0_REGISTRY/0.4.4.dev2/smollm3/HuggingFaceTB/SmolLM3-3B/0165b063d3dfd73ec9af.json
new file mode 100644
index 0000000000000000000000000000000000000000..50e84b7b95401b4b70d20d4e7775e5e8511a5c82
--- /dev/null
+++ b/neuronxcc-2.21.33363.0+82129205/0_REGISTRY/0.4.4.dev2/smollm3/HuggingFaceTB/SmolLM3-3B/0165b063d3dfd73ec9af.json
@@ -0,0 +1,135 @@
+{
+  "_entry_class": "SingleModelCacheEntry",
+  "_model_id": "HuggingFaceTB/SmolLM3-3B",
+  "_task": "text-generation",
+  "architectures": [
+    "SmolLM3ForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "dtype": "bfloat16",
+  "hidden_act": "silu",
+  "hidden_size": 2048,
+  "initializer_range": 0.02,
+  "intermediate_size": 11008,
+  "layer_types": [
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention"
+  ],
+  "max_position_embeddings": 65536,
+  "max_window_layers": 28,
+  "mlp_bias": false,
+  "model_type": "smollm3",
+  "neuron": {
+    "_serialized_key": "NxDNeuronConfig",
+    "batch_size": 4,
+    "capacity_factor": null,
+    "checkpoint_id": "HuggingFaceTB/SmolLM3-3B",
+    "checkpoint_revision": "a07cc9a04f16550a088caea529712d1d335b0ac1",
+    "continuous_batching": true,
+    "ep_degree": 1,
+    "fused_qkv": true,
+    "glu_mlp": true,
+    "local_ranks_size": 2,
+    "max_batch_size": 4,
+    "max_context_length": 4096,
+    "max_topk": 256,
+    "n_active_tokens": 4096,
+    "neuronxcc_version": "2.21.33363.0+82129205",
+    "on_device_sampling": true,
+    "optimum_neuron_version": "0.4.4.dev2",
+    "output_logits": false,
+    "pp_degree": 1,
+    "sequence_length": 4096,
+    "sequence_parallel_enabled": false,
+    "speculation_length": 0,
+    "start_rank_id": 0,
+    "target": "trn1",
+    "torch_dtype": "bfloat16",
+    "tp_degree": 2
+  },
+  "no_rope_layer_interval": 4,
+  "no_rope_layers": [
+    1,
+    1,
+    1,
+    0,
+    1,
+    1,
+    1,
+    0,
+    1,
+    1,
+    1,
+    0,
+    1,
+    1,
+    1,
+    0,
+    1,
+    1,
+    1,
+    0,
+    1,
+    1,
+    1,
+    0,
+    1,
+    1,
+    1,
+    0,
+    1,
+    1,
+    1,
+    0,
+    1,
+    1,
+    1,
+    0
+  ],
+  "num_attention_heads": 16,
+  "num_hidden_layers": 36,
+  "num_key_value_heads": 4,
+  "pretraining_tp": 2,
+  "rms_norm_eps": 1e-06,
+  "rope_scaling": null,
+  "rope_theta": 5000000.0,
+  "sliding_window": null,
+  "use_cache": false,
+  "use_sliding_window": false,
+  "vocab_size": 128256
+}
\ No newline at end of file
diff --git a/neuronxcc-2.21.33363.0+82129205/0_REGISTRY/0.4.4.dev2/smollm3/HuggingFaceTB/SmolLM3-3B/71933d2c6e6099113f93.json b/neuronxcc-2.21.33363.0+82129205/0_REGISTRY/0.4.4.dev2/smollm3/HuggingFaceTB/SmolLM3-3B/71933d2c6e6099113f93.json
new file mode 100644
index 0000000000000000000000000000000000000000..7bc54bdacc72f7f8eb770c1722956ba8c1808960
--- /dev/null
+++ b/neuronxcc-2.21.33363.0+82129205/0_REGISTRY/0.4.4.dev2/smollm3/HuggingFaceTB/SmolLM3-3B/71933d2c6e6099113f93.json
@@ -0,0 +1,135 @@
+{
+  "_entry_class": "SingleModelCacheEntry",
+  "_model_id": "HuggingFaceTB/SmolLM3-3B",
+  "_task": "text-generation",
+  "architectures": [
+    "SmolLM3ForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "dtype": "bfloat16",
+  "hidden_act": "silu",
+  "hidden_size": 2048,
+  "initializer_range": 0.02,
+  "intermediate_size": 11008,
+  "layer_types": [
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention"
+  ],
+  "max_position_embeddings": 65536,
+  "max_window_layers": 28,
+  "mlp_bias": false,
+  "model_type": "smollm3",
+  "neuron": {
+    "_serialized_key": "NxDNeuronConfig",
+    "batch_size": 1,
+    "capacity_factor": null,
+    "checkpoint_id": "HuggingFaceTB/SmolLM3-3B",
+    "checkpoint_revision": "a07cc9a04f16550a088caea529712d1d335b0ac1",
+    "continuous_batching": false,
+    "ep_degree": 1,
+    "fused_qkv": true,
+    "glu_mlp": true,
+    "local_ranks_size": 2,
+    "max_batch_size": 1,
+    "max_context_length": 8192,
+    "max_topk": 256,
+    "n_active_tokens": 8192,
+    "neuronxcc_version": "2.21.33363.0+82129205",
+    "on_device_sampling": true,
+    "optimum_neuron_version": "0.4.4.dev2",
+    "output_logits": false,
+    "pp_degree": 1,
+    "sequence_length": 8192,
+    "sequence_parallel_enabled": true,
+    "speculation_length": 0,
+    "start_rank_id": 0,
+    "target": "trn1",
+    "torch_dtype": "bfloat16",
+    "tp_degree": 2
+  },
+  "no_rope_layer_interval": 4,
+  "no_rope_layers": [
+    1,
+    1,
+    1,
+    0,
+    1,
+    1,
+    1,
+    0,
+    1,
+    1,
+    1,
+    0,
+    1,
+    1,
+    1,
+    0,
+    1,
+    1,
+    1,
+    0,
+    1,
+    1,
+    1,
+    0,
+    1,
+    1,
+    1,
+    0,
+    1,
+    1,
+    1,
+    0,
+    1,
+    1,
+    1,
+    0
+  ],
+  "num_attention_heads": 16,
+  "num_hidden_layers": 36,
+  "num_key_value_heads": 4,
+  "pretraining_tp": 2,
+  "rms_norm_eps": 1e-06,
+  "rope_scaling": null,
+  "rope_theta": 5000000.0,
+  "sliding_window": null,
+  "use_cache": false,
+  "use_sliding_window": false,
+  "vocab_size": 128256
+}
\ No newline at end of file
diff --git a/neuronxcc-2.21.33363.0+82129205/MODULE_110fc80e89006393f738+24129607/compile_flags.json b/neuronxcc-2.21.33363.0+82129205/MODULE_110fc80e89006393f738+24129607/compile_flags.json
new file mode 100644
index 0000000000000000000000000000000000000000..836724f44545ce0dedda1521fd4c623a6ea8ec72
--- /dev/null
+++ b/neuronxcc-2.21.33363.0+82129205/MODULE_110fc80e89006393f738+24129607/compile_flags.json
@@ -0,0 +1 @@
+["--target=trn1", "--auto-cast=none", "--model-type=transformer", "--tensorizer-options=--enable-ccop-compute-overlap --cc-pipeline-tiling-factor=2 --vectorize-strided-dma ", "-O2", "--lnc=1", "--logfile=/tmp/nxd_model/context_encoding/_tp0_bk0/log-neuron-cc.txt"]
\ No newline at end of file
diff --git a/neuronxcc-2.21.33363.0+82129205/MODULE_110fc80e89006393f738+24129607/model.done b/neuronxcc-2.21.33363.0+82129205/MODULE_110fc80e89006393f738+24129607/model.done
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/neuronxcc-2.21.33363.0+82129205/MODULE_110fc80e89006393f738+24129607/model.hlo_module.pb b/neuronxcc-2.21.33363.0+82129205/MODULE_110fc80e89006393f738+24129607/model.hlo_module.pb
new file mode 100644
index 0000000000000000000000000000000000000000..5ff1cca947e85af8100f02a971940353805dacbd
--- /dev/null
+++ b/neuronxcc-2.21.33363.0+82129205/MODULE_110fc80e89006393f738+24129607/model.hlo_module.pb
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:60325dbe4e341d0836f3b68cedc6beaa300459af2c947445da4a545a3f703c10
+size 939272
diff --git a/neuronxcc-2.21.33363.0+82129205/MODULE_110fc80e89006393f738+24129607/model.neff b/neuronxcc-2.21.33363.0+82129205/MODULE_110fc80e89006393f738+24129607/model.neff
new file mode 100644
index 0000000000000000000000000000000000000000..9cb4e7221016e53bbc4717c93186930ab43c066b
--- /dev/null
+++ b/neuronxcc-2.21.33363.0+82129205/MODULE_110fc80e89006393f738+24129607/model.neff
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e189bba6f572c67c0bd6e9bba05b58d4283f34c50588dd6aa97a34007311ba28
+size 6667264
diff --git a/neuronxcc-2.21.33363.0+82129205/MODULE_196d1b2148ed8629b154+24129607/compile_flags.json b/neuronxcc-2.21.33363.0+82129205/MODULE_196d1b2148ed8629b154+24129607/compile_flags.json
new file mode 100644
index 0000000000000000000000000000000000000000..836724f44545ce0dedda1521fd4c623a6ea8ec72
--- /dev/null
+++ b/neuronxcc-2.21.33363.0+82129205/MODULE_196d1b2148ed8629b154+24129607/compile_flags.json
@@ -0,0 +1 @@
+["--target=trn1", "--auto-cast=none", "--model-type=transformer", "--tensorizer-options=--enable-ccop-compute-overlap --cc-pipeline-tiling-factor=2 --vectorize-strided-dma ", "-O2", "--lnc=1", "--logfile=/tmp/nxd_model/context_encoding/_tp0_bk0/log-neuron-cc.txt"]
\ No newline at end of file
diff --git a/neuronxcc-2.21.33363.0+82129205/MODULE_196d1b2148ed8629b154+24129607/model.done b/neuronxcc-2.21.33363.0+82129205/MODULE_196d1b2148ed8629b154+24129607/model.done
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/neuronxcc-2.21.33363.0+82129205/MODULE_196d1b2148ed8629b154+24129607/model.hlo_module.pb b/neuronxcc-2.21.33363.0+82129205/MODULE_196d1b2148ed8629b154+24129607/model.hlo_module.pb
new file mode 100644
index 0000000000000000000000000000000000000000..caa215de9ecc819b26539ea04f6d7b719b525bba
--- /dev/null
+++ b/neuronxcc-2.21.33363.0+82129205/MODULE_196d1b2148ed8629b154+24129607/model.hlo_module.pb
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d18c81c6ab7ffca4593ffd12280271d86ca9a2700a2770da93fb314a9109ff67
+size 509380
diff --git a/neuronxcc-2.21.33363.0+82129205/MODULE_196d1b2148ed8629b154+24129607/model.neff b/neuronxcc-2.21.33363.0+82129205/MODULE_196d1b2148ed8629b154+24129607/model.neff
new file mode 100644
index 0000000000000000000000000000000000000000..52e5de160ae6eb6f0bbcc4df34d8809b957ea935
--- /dev/null
+++ b/neuronxcc-2.21.33363.0+82129205/MODULE_196d1b2148ed8629b154+24129607/model.neff
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:45d13b5a6f47ee0e1c7c9b783470687dfcad53929b4035be0b071bbacd718948
+size 41585664
diff --git a/neuronxcc-2.21.33363.0+82129205/MODULE_1aa5f4baa9354745d6a6+24129607/compile_flags.json b/neuronxcc-2.21.33363.0+82129205/MODULE_1aa5f4baa9354745d6a6+24129607/compile_flags.json
new file mode 100644
index 0000000000000000000000000000000000000000..836724f44545ce0dedda1521fd4c623a6ea8ec72
--- /dev/null
+++ b/neuronxcc-2.21.33363.0+82129205/MODULE_1aa5f4baa9354745d6a6+24129607/compile_flags.json
@@ -0,0 +1 @@
+["--target=trn1", "--auto-cast=none", "--model-type=transformer", "--tensorizer-options=--enable-ccop-compute-overlap --cc-pipeline-tiling-factor=2 --vectorize-strided-dma ", "-O2", "--lnc=1", "--logfile=/tmp/nxd_model/context_encoding/_tp0_bk0/log-neuron-cc.txt"]
\ No newline at end of file
diff --git a/neuronxcc-2.21.33363.0+82129205/MODULE_1aa5f4baa9354745d6a6+24129607/model.done b/neuronxcc-2.21.33363.0+82129205/MODULE_1aa5f4baa9354745d6a6+24129607/model.done
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/neuronxcc-2.21.33363.0+82129205/MODULE_1aa5f4baa9354745d6a6+24129607/model.hlo_module.pb b/neuronxcc-2.21.33363.0+82129205/MODULE_1aa5f4baa9354745d6a6+24129607/model.hlo_module.pb
new file mode 100644
index 0000000000000000000000000000000000000000..be719c4eee874db0bca312c82d2be934c3caba4e
--- /dev/null
+++ b/neuronxcc-2.21.33363.0+82129205/MODULE_1aa5f4baa9354745d6a6+24129607/model.hlo_module.pb
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3ed4ca4ead0b4a889a31f35680291eb4368e838532e40337ccf0a72e56de3aa1
+size 914383
diff --git a/neuronxcc-2.21.33363.0+82129205/MODULE_1aa5f4baa9354745d6a6+24129607/model.neff b/neuronxcc-2.21.33363.0+82129205/MODULE_1aa5f4baa9354745d6a6+24129607/model.neff
new file mode 100644
index 0000000000000000000000000000000000000000..5ff399909d9f0520b83d7ac0d957b9989dda8ffb
--- /dev/null
+++ b/neuronxcc-2.21.33363.0+82129205/MODULE_1aa5f4baa9354745d6a6+24129607/model.neff
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b4a8ac13c543cfd26597bdef4bb0dc1c5fe6dafe9046db1c62b7d4e1fe1eb4bd
+size 12606464
diff --git a/neuronxcc-2.21.33363.0+82129205/MODULE_37d321becc90cb687039+24129607/compile_flags.json b/neuronxcc-2.21.33363.0+82129205/MODULE_37d321becc90cb687039+24129607/compile_flags.json
new file mode 100644
index 0000000000000000000000000000000000000000..836724f44545ce0dedda1521fd4c623a6ea8ec72
--- /dev/null
+++ b/neuronxcc-2.21.33363.0+82129205/MODULE_37d321becc90cb687039+24129607/compile_flags.json
@@ -0,0 +1 @@
+["--target=trn1", "--auto-cast=none", "--model-type=transformer", "--tensorizer-options=--enable-ccop-compute-overlap --cc-pipeline-tiling-factor=2 --vectorize-strided-dma ", "-O2", "--lnc=1", "--logfile=/tmp/nxd_model/context_encoding/_tp0_bk0/log-neuron-cc.txt"]
\ No newline at end of file
diff --git a/neuronxcc-2.21.33363.0+82129205/MODULE_37d321becc90cb687039+24129607/model.done b/neuronxcc-2.21.33363.0+82129205/MODULE_37d321becc90cb687039+24129607/model.done
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/neuronxcc-2.21.33363.0+82129205/MODULE_37d321becc90cb687039+24129607/model.hlo_module.pb b/neuronxcc-2.21.33363.0+82129205/MODULE_37d321becc90cb687039+24129607/model.hlo_module.pb
new file mode 100644
index 0000000000000000000000000000000000000000..d0b81853a05cdcc68ffd73e6a3e9d08e6e78a9bf
--- /dev/null
+++ b/neuronxcc-2.21.33363.0+82129205/MODULE_37d321becc90cb687039+24129607/model.hlo_module.pb
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b254a5de285d4202654c3861698c1a55558bbae98505f678539066015a5d803a
+size 474402
diff --git a/neuronxcc-2.21.33363.0+82129205/MODULE_37d321becc90cb687039+24129607/model.neff b/neuronxcc-2.21.33363.0+82129205/MODULE_37d321becc90cb687039+24129607/model.neff
new file mode 100644
index 0000000000000000000000000000000000000000..4514fda120c3b85899add1e7948dbdd5a4edfa3e
--- /dev/null
+++ b/neuronxcc-2.21.33363.0+82129205/MODULE_37d321becc90cb687039+24129607/model.neff
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:02b5479a35276997f52e0427e90139634e919e308bc9747f3ca820dec15d0382
+size 115057664
diff --git a/neuronxcc-2.21.33363.0+82129205/MODULE_39061f1efbca2332dc73+24129607/compile_flags.json b/neuronxcc-2.21.33363.0+82129205/MODULE_39061f1efbca2332dc73+24129607/compile_flags.json
new file mode 100644
index 0000000000000000000000000000000000000000..836724f44545ce0dedda1521fd4c623a6ea8ec72
--- /dev/null
+++ b/neuronxcc-2.21.33363.0+82129205/MODULE_39061f1efbca2332dc73+24129607/compile_flags.json
@@ -0,0 +1 @@
+["--target=trn1", "--auto-cast=none", "--model-type=transformer", "--tensorizer-options=--enable-ccop-compute-overlap --cc-pipeline-tiling-factor=2 --vectorize-strided-dma ", "-O2", "--lnc=1", "--logfile=/tmp/nxd_model/context_encoding/_tp0_bk0/log-neuron-cc.txt"]
\ No newline at end of file
diff --git a/neuronxcc-2.21.33363.0+82129205/MODULE_39061f1efbca2332dc73+24129607/model.done b/neuronxcc-2.21.33363.0+82129205/MODULE_39061f1efbca2332dc73+24129607/model.done
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/neuronxcc-2.21.33363.0+82129205/MODULE_39061f1efbca2332dc73+24129607/model.hlo_module.pb b/neuronxcc-2.21.33363.0+82129205/MODULE_39061f1efbca2332dc73+24129607/model.hlo_module.pb
new file mode 100644
index 0000000000000000000000000000000000000000..74a51ce539ef7bf362be1fd8562d4d8c44949119
--- /dev/null
+++ b/neuronxcc-2.21.33363.0+82129205/MODULE_39061f1efbca2332dc73+24129607/model.hlo_module.pb
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2bcd5dc785b88fb271b709071dcba90051a1052d0548e3eb0dfd4d88940f24d8
+size 1036312
diff --git a/neuronxcc-2.21.33363.0+82129205/MODULE_39061f1efbca2332dc73+24129607/model.neff b/neuronxcc-2.21.33363.0+82129205/MODULE_39061f1efbca2332dc73+24129607/model.neff
new file mode 100644
index 0000000000000000000000000000000000000000..450ecc8be80d5bbb411270715c328a3e681110e2
--- /dev/null
+++ b/neuronxcc-2.21.33363.0+82129205/MODULE_39061f1efbca2332dc73+24129607/model.neff
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:61e6e0d539f585138ff57fe8510d76ba99dc0f679c7886ffa295bb8396c1bd9f
+size 15893504
diff --git a/neuronxcc-2.21.33363.0+82129205/MODULE_5cfe268b844f7d2286a5+a02c3a36/compile_flags.json b/neuronxcc-2.21.33363.0+82129205/MODULE_5cfe268b844f7d2286a5+a02c3a36/compile_flags.json
new file mode 100644
index 0000000000000000000000000000000000000000..5726abc7d1d8c52fa95bc7919439a23a23fe3b9a
--- /dev/null
+++ b/neuronxcc-2.21.33363.0+82129205/MODULE_5cfe268b844f7d2286a5+a02c3a36/compile_flags.json
@@ -0,0 +1 @@
+["--target=trn1", "--auto-cast=none", "--model-type=transformer", "--tensorizer-options=--enable-ccop-compute-overlap --cc-pipeline-tiling-factor=2 --vectorize-strided-dma ", "-O2", "--lnc=1", "--logfile=/tmp/nxd_model/token_generation/_tp0_bk0/log-neuron-cc.txt", "--enable-internal-neff-wrapper"]
\ No newline at end of file
diff --git a/neuronxcc-2.21.33363.0+82129205/MODULE_5cfe268b844f7d2286a5+a02c3a36/model.done b/neuronxcc-2.21.33363.0+82129205/MODULE_5cfe268b844f7d2286a5+a02c3a36/model.done
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/neuronxcc-2.21.33363.0+82129205/MODULE_5cfe268b844f7d2286a5+a02c3a36/model.hlo_module.pb b/neuronxcc-2.21.33363.0+82129205/MODULE_5cfe268b844f7d2286a5+a02c3a36/model.hlo_module.pb
new file mode 100644
index 0000000000000000000000000000000000000000..0f9d614d6fbfc4fedae88878100cd8c56775eb34
--- /dev/null
+++ b/neuronxcc-2.21.33363.0+82129205/MODULE_5cfe268b844f7d2286a5+a02c3a36/model.hlo_module.pb
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:92413edeab8bdd1ff79c68aeeb0633faf78e28df1df24ce6e6685975d7156ff1
+size 426769
diff --git a/neuronxcc-2.21.33363.0+82129205/MODULE_5cfe268b844f7d2286a5+a02c3a36/model.neff b/neuronxcc-2.21.33363.0+82129205/MODULE_5cfe268b844f7d2286a5+a02c3a36/model.neff
new file mode 100644
index 0000000000000000000000000000000000000000..b0c5bb7044ce4121848c7cce0cd1761f202131a0
--- /dev/null
+++ b/neuronxcc-2.21.33363.0+82129205/MODULE_5cfe268b844f7d2286a5+a02c3a36/model.neff
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3c238600da6a768547204d176f9f7adf55db06bcaf6e1cc9a978896b92c42e21
+size 3073024
diff --git a/neuronxcc-2.21.33363.0+82129205/MODULE_5cfe268b844f7d2286a5+a02c3a36/wrapped_neff.hlo b/neuronxcc-2.21.33363.0+82129205/MODULE_5cfe268b844f7d2286a5+a02c3a36/wrapped_neff.hlo
new file mode 100644
index 0000000000000000000000000000000000000000..e507f94b047ecc7bc3a351e6f361a5c450ee27eb
--- /dev/null
+++ b/neuronxcc-2.21.33363.0+82129205/MODULE_5cfe268b844f7d2286a5+a02c3a36/wrapped_neff.hlo
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:54e033c437c96db2869f8b8620b5b2eb24b6fad3e04fd4f0b417f6df1c3558ef
+size 3147125
diff --git a/neuronxcc-2.21.33363.0+82129205/MODULE_7c1ab0225123c184780f+24129607/compile_flags.json b/neuronxcc-2.21.33363.0+82129205/MODULE_7c1ab0225123c184780f+24129607/compile_flags.json
new file mode 100644
index 0000000000000000000000000000000000000000..836724f44545ce0dedda1521fd4c623a6ea8ec72
--- /dev/null
+++ b/neuronxcc-2.21.33363.0+82129205/MODULE_7c1ab0225123c184780f+24129607/compile_flags.json
@@ -0,0 +1 @@
+["--target=trn1", "--auto-cast=none", "--model-type=transformer", "--tensorizer-options=--enable-ccop-compute-overlap --cc-pipeline-tiling-factor=2 --vectorize-strided-dma ", "-O2", "--lnc=1", "--logfile=/tmp/nxd_model/context_encoding/_tp0_bk0/log-neuron-cc.txt"]
\ No newline at end of file
diff --git a/neuronxcc-2.21.33363.0+82129205/MODULE_7c1ab0225123c184780f+24129607/model.done b/neuronxcc-2.21.33363.0+82129205/MODULE_7c1ab0225123c184780f+24129607/model.done
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/neuronxcc-2.21.33363.0+82129205/MODULE_7c1ab0225123c184780f+24129607/model.hlo_module.pb b/neuronxcc-2.21.33363.0+82129205/MODULE_7c1ab0225123c184780f+24129607/model.hlo_module.pb
new file mode 100644
index 0000000000000000000000000000000000000000..cc31afd593bab43dcdb6fcf84b0b77140dc48669
--- /dev/null
+++ b/neuronxcc-2.21.33363.0+82129205/MODULE_7c1ab0225123c184780f+24129607/model.hlo_module.pb
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:afbf1c2497a38503bf2c1c9685d3d551b5461b11c9eedd45100bee098d75959d
+size 978844
diff --git a/neuronxcc-2.21.33363.0+82129205/MODULE_7c1ab0225123c184780f+24129607/model.neff b/neuronxcc-2.21.33363.0+82129205/MODULE_7c1ab0225123c184780f+24129607/model.neff
new file mode 100644
index 0000000000000000000000000000000000000000..fa0e1b0c0391ef035fef648a0597fbf582e0a8d4
--- /dev/null
+++ b/neuronxcc-2.21.33363.0+82129205/MODULE_7c1ab0225123c184780f+24129607/model.neff
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8741039102da2965de796bf40e6538a49f53e0cebf245a4a5d30530136794056
+size 6718464
diff --git a/neuronxcc-2.21.33363.0+82129205/MODULE_88399bbf2a34b1e28eee+24129607/compile_flags.json b/neuronxcc-2.21.33363.0+82129205/MODULE_88399bbf2a34b1e28eee+24129607/compile_flags.json
new file mode 100644
index 0000000000000000000000000000000000000000..836724f44545ce0dedda1521fd4c623a6ea8ec72
--- /dev/null
+++ b/neuronxcc-2.21.33363.0+82129205/MODULE_88399bbf2a34b1e28eee+24129607/compile_flags.json
@@ -0,0 +1 @@
+["--target=trn1", "--auto-cast=none", "--model-type=transformer", "--tensorizer-options=--enable-ccop-compute-overlap --cc-pipeline-tiling-factor=2 --vectorize-strided-dma ", "-O2", "--lnc=1", "--logfile=/tmp/nxd_model/context_encoding/_tp0_bk0/log-neuron-cc.txt"]
\ No newline at end of file
diff --git a/neuronxcc-2.21.33363.0+82129205/MODULE_88399bbf2a34b1e28eee+24129607/model.done b/neuronxcc-2.21.33363.0+82129205/MODULE_88399bbf2a34b1e28eee+24129607/model.done
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/neuronxcc-2.21.33363.0+82129205/MODULE_88399bbf2a34b1e28eee+24129607/model.hlo_module.pb b/neuronxcc-2.21.33363.0+82129205/MODULE_88399bbf2a34b1e28eee+24129607/model.hlo_module.pb
new file mode 100644
index 0000000000000000000000000000000000000000..e638b5c7cceb8e7260606f893839002ebcc6cdd1
--- /dev/null
+++ b/neuronxcc-2.21.33363.0+82129205/MODULE_88399bbf2a34b1e28eee+24129607/model.hlo_module.pb
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a524a8593eaa11d4c64b9633ad252e08702bde8a7a43768a07e75baf9423c7cb
+size 1049377
diff --git a/neuronxcc-2.21.33363.0+82129205/MODULE_88399bbf2a34b1e28eee+24129607/model.neff b/neuronxcc-2.21.33363.0+82129205/MODULE_88399bbf2a34b1e28eee+24129607/model.neff
new file mode 100644
index 0000000000000000000000000000000000000000..e3bd3ef7a392bb9414cf9adef02bd9f42bd18f86
--- /dev/null
+++ b/neuronxcc-2.21.33363.0+82129205/MODULE_88399bbf2a34b1e28eee+24129607/model.neff
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:db85008650868e4f75f4a65d80144ea6b7e547796f3ec4fe4e3b666c0457657a
+size 23000064
diff --git a/neuronxcc-2.21.33363.0+82129205/MODULE_9b8221eb10f6b4eb2f68+a02c3a36/compile_flags.json b/neuronxcc-2.21.33363.0+82129205/MODULE_9b8221eb10f6b4eb2f68+a02c3a36/compile_flags.json
new file mode 100644
index 0000000000000000000000000000000000000000..5726abc7d1d8c52fa95bc7919439a23a23fe3b9a
--- /dev/null
+++ b/neuronxcc-2.21.33363.0+82129205/MODULE_9b8221eb10f6b4eb2f68+a02c3a36/compile_flags.json
@@ -0,0 +1 @@
+["--target=trn1", "--auto-cast=none", "--model-type=transformer", "--tensorizer-options=--enable-ccop-compute-overlap --cc-pipeline-tiling-factor=2 --vectorize-strided-dma ", "-O2", "--lnc=1", "--logfile=/tmp/nxd_model/token_generation/_tp0_bk0/log-neuron-cc.txt", "--enable-internal-neff-wrapper"]
\ No newline at end of file
diff --git a/neuronxcc-2.21.33363.0+82129205/MODULE_9b8221eb10f6b4eb2f68+a02c3a36/model.done b/neuronxcc-2.21.33363.0+82129205/MODULE_9b8221eb10f6b4eb2f68+a02c3a36/model.done
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/neuronxcc-2.21.33363.0+82129205/MODULE_9b8221eb10f6b4eb2f68+a02c3a36/model.hlo_module.pb b/neuronxcc-2.21.33363.0+82129205/MODULE_9b8221eb10f6b4eb2f68+a02c3a36/model.hlo_module.pb
new file mode 100644
index 0000000000000000000000000000000000000000..7d4ffbf8209549efc7988afdcab7f584f5903097
--- /dev/null
+++ b/neuronxcc-2.21.33363.0+82129205/MODULE_9b8221eb10f6b4eb2f68+a02c3a36/model.hlo_module.pb
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2560de39994ba4784626257a4845c5ad7af11cf2ce79e7df7ac1e29ba15f33ec
+size 434264
diff --git a/neuronxcc-2.21.33363.0+82129205/MODULE_9b8221eb10f6b4eb2f68+a02c3a36/model.neff b/neuronxcc-2.21.33363.0+82129205/MODULE_9b8221eb10f6b4eb2f68+a02c3a36/model.neff
new file mode 100644
index 0000000000000000000000000000000000000000..6a694d2da58b8a6f6c259b838b5084093ca8f877
--- /dev/null
+++ b/neuronxcc-2.21.33363.0+82129205/MODULE_9b8221eb10f6b4eb2f68+a02c3a36/model.neff
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4c2586694f84bb7527a209850d0dd49d48990faacd1d4e4c72602d5e32573172
+size 2366464
diff --git a/neuronxcc-2.21.33363.0+82129205/MODULE_9b8221eb10f6b4eb2f68+a02c3a36/wrapped_neff.hlo b/neuronxcc-2.21.33363.0+82129205/MODULE_9b8221eb10f6b4eb2f68+a02c3a36/wrapped_neff.hlo
new file mode 100644
index 0000000000000000000000000000000000000000..44df764c07e874c668ad74d61b57ec1545660506
--- /dev/null
+++ b/neuronxcc-2.21.33363.0+82129205/MODULE_9b8221eb10f6b4eb2f68+a02c3a36/wrapped_neff.hlo
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ce92ab94bac6c24d2adf4d57bc2afbacfc4ec63abecfe9b3a06f395d6077f1ed
+size 2440551
diff --git a/neuronxcc-2.21.33363.0+82129205/MODULE_a6d912262b31e81edfe6+24129607/compile_flags.json b/neuronxcc-2.21.33363.0+82129205/MODULE_a6d912262b31e81edfe6+24129607/compile_flags.json
new file mode 100644
index 0000000000000000000000000000000000000000..836724f44545ce0dedda1521fd4c623a6ea8ec72
--- /dev/null
+++ b/neuronxcc-2.21.33363.0+82129205/MODULE_a6d912262b31e81edfe6+24129607/compile_flags.json
@@ -0,0 +1 @@
+["--target=trn1", "--auto-cast=none", "--model-type=transformer", "--tensorizer-options=--enable-ccop-compute-overlap --cc-pipeline-tiling-factor=2 --vectorize-strided-dma ", "-O2", "--lnc=1", "--logfile=/tmp/nxd_model/context_encoding/_tp0_bk0/log-neuron-cc.txt"]
\ No newline at end of file
diff --git a/neuronxcc-2.21.33363.0+82129205/MODULE_a6d912262b31e81edfe6+24129607/model.done b/neuronxcc-2.21.33363.0+82129205/MODULE_a6d912262b31e81edfe6+24129607/model.done
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/neuronxcc-2.21.33363.0+82129205/MODULE_a6d912262b31e81edfe6+24129607/model.hlo_module.pb b/neuronxcc-2.21.33363.0+82129205/MODULE_a6d912262b31e81edfe6+24129607/model.hlo_module.pb
new file mode 100644
index 0000000000000000000000000000000000000000..fb2212bee3cacbebd86a3a4b2b98d2e9ecb64f89
--- /dev/null
+++ b/neuronxcc-2.21.33363.0+82129205/MODULE_a6d912262b31e81edfe6+24129607/model.hlo_module.pb
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c21388ebcbb2ad22728e2aad4f2f7eb57d3c8a04c59a81df97f9e39aa18f57b5
+size 507614
diff --git a/neuronxcc-2.21.33363.0+82129205/MODULE_a6d912262b31e81edfe6+24129607/model.neff b/neuronxcc-2.21.33363.0+82129205/MODULE_a6d912262b31e81edfe6+24129607/model.neff
new file mode 100644
index 0000000000000000000000000000000000000000..77e864ed0e4d66df411cb76423dd652185d8629e
--- /dev/null
+++ b/neuronxcc-2.21.33363.0+82129205/MODULE_a6d912262b31e81edfe6+24129607/model.neff
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a0ebe7aa5e1ba33d7e2cb909b8df9ee8d25f1f1878a4dde47c74622a01bd3d37
+size 114013184
diff --git a/neuronxcc-2.21.33363.0+82129205/MODULE_cfd70138eb9722ac2255+a02c3a36/compile_flags.json b/neuronxcc-2.21.33363.0+82129205/MODULE_cfd70138eb9722ac2255+a02c3a36/compile_flags.json
new file mode 100644
index 0000000000000000000000000000000000000000..5726abc7d1d8c52fa95bc7919439a23a23fe3b9a
--- /dev/null
+++ b/neuronxcc-2.21.33363.0+82129205/MODULE_cfd70138eb9722ac2255+a02c3a36/compile_flags.json
@@ -0,0 +1 @@
+["--target=trn1", "--auto-cast=none", "--model-type=transformer", "--tensorizer-options=--enable-ccop-compute-overlap --cc-pipeline-tiling-factor=2 --vectorize-strided-dma ", "-O2", "--lnc=1", "--logfile=/tmp/nxd_model/token_generation/_tp0_bk0/log-neuron-cc.txt", "--enable-internal-neff-wrapper"]
\ No newline at end of file
diff --git a/neuronxcc-2.21.33363.0+82129205/MODULE_cfd70138eb9722ac2255+a02c3a36/model.done b/neuronxcc-2.21.33363.0+82129205/MODULE_cfd70138eb9722ac2255+a02c3a36/model.done
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/neuronxcc-2.21.33363.0+82129205/MODULE_cfd70138eb9722ac2255+a02c3a36/model.hlo_module.pb b/neuronxcc-2.21.33363.0+82129205/MODULE_cfd70138eb9722ac2255+a02c3a36/model.hlo_module.pb
new file mode 100644
index 0000000000000000000000000000000000000000..26150d061de2289f7d9c44e931b05a75bb2d1b55
--- /dev/null
+++ b/neuronxcc-2.21.33363.0+82129205/MODULE_cfd70138eb9722ac2255+a02c3a36/model.hlo_module.pb
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8406378e6e18cc6ac3eda37f281d580f7b4bbec015dc27ac8369fc44b705526f
+size 809145
diff --git a/neuronxcc-2.21.33363.0+82129205/MODULE_cfd70138eb9722ac2255+a02c3a36/model.neff b/neuronxcc-2.21.33363.0+82129205/MODULE_cfd70138eb9722ac2255+a02c3a36/model.neff
new file mode 100644
index 0000000000000000000000000000000000000000..1ec6b53b83b27906e17be8e1914e4985afe90286
--- /dev/null
+++ b/neuronxcc-2.21.33363.0+82129205/MODULE_cfd70138eb9722ac2255+a02c3a36/model.neff
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c8a3a82e0d0626a80a2c5e6898d41e9ca22a72fea4b945f1555508c37787c263
+size 21863424
diff --git a/neuronxcc-2.21.33363.0+82129205/MODULE_cfd70138eb9722ac2255+a02c3a36/wrapped_neff.hlo b/neuronxcc-2.21.33363.0+82129205/MODULE_cfd70138eb9722ac2255+a02c3a36/wrapped_neff.hlo
new file mode 100644
index 0000000000000000000000000000000000000000..94896c041d86842ff8fc1a2c3731c13ac84c8953
--- /dev/null
+++ b/neuronxcc-2.21.33363.0+82129205/MODULE_cfd70138eb9722ac2255+a02c3a36/wrapped_neff.hlo
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4ec03e8b7d7deca9b162d16f1d57b4954f12796e59c0cd1e9bd7edbca49eff15
+size 22011354
diff --git a/neuronxcc-2.21.33363.0+82129205/MODULE_dd366124647aeec64074+a02c3a36/compile_flags.json b/neuronxcc-2.21.33363.0+82129205/MODULE_dd366124647aeec64074+a02c3a36/compile_flags.json
new file mode 100644
index 0000000000000000000000000000000000000000..5726abc7d1d8c52fa95bc7919439a23a23fe3b9a
--- /dev/null
+++ b/neuronxcc-2.21.33363.0+82129205/MODULE_dd366124647aeec64074+a02c3a36/compile_flags.json
@@ -0,0 +1 @@
+["--target=trn1", "--auto-cast=none", "--model-type=transformer", "--tensorizer-options=--enable-ccop-compute-overlap --cc-pipeline-tiling-factor=2 --vectorize-strided-dma ", "-O2", "--lnc=1", "--logfile=/tmp/nxd_model/token_generation/_tp0_bk0/log-neuron-cc.txt", "--enable-internal-neff-wrapper"]
\ No newline at end of file
diff --git a/neuronxcc-2.21.33363.0+82129205/MODULE_dd366124647aeec64074+a02c3a36/model.done b/neuronxcc-2.21.33363.0+82129205/MODULE_dd366124647aeec64074+a02c3a36/model.done
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/neuronxcc-2.21.33363.0+82129205/MODULE_dd366124647aeec64074+a02c3a36/model.hlo_module.pb b/neuronxcc-2.21.33363.0+82129205/MODULE_dd366124647aeec64074+a02c3a36/model.hlo_module.pb
new file mode 100644
index 0000000000000000000000000000000000000000..c1bf707e116369c98d18265e77ad5c232115e427
--- /dev/null
+++ b/neuronxcc-2.21.33363.0+82129205/MODULE_dd366124647aeec64074+a02c3a36/model.hlo_module.pb
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0f057068b35f8aeb0d565f76649c32016184243a96809f860114d903a6770d7f
+size 770051
diff --git a/neuronxcc-2.21.33363.0+82129205/MODULE_dd366124647aeec64074+a02c3a36/model.neff b/neuronxcc-2.21.33363.0+82129205/MODULE_dd366124647aeec64074+a02c3a36/model.neff
new file mode 100644
index 0000000000000000000000000000000000000000..c2a4624ec46d0a3da5610fef4aa61252cdc1d794
--- /dev/null
+++ b/neuronxcc-2.21.33363.0+82129205/MODULE_dd366124647aeec64074+a02c3a36/model.neff
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:695538f706036d1034f9ea8fb7269338dba1a78a37fc131646f03c7496f1239f
+size 21801984
diff --git a/neuronxcc-2.21.33363.0+82129205/MODULE_dd366124647aeec64074+a02c3a36/wrapped_neff.hlo b/neuronxcc-2.21.33363.0+82129205/MODULE_dd366124647aeec64074+a02c3a36/wrapped_neff.hlo
new file mode 100644
index 0000000000000000000000000000000000000000..bbc75ed359ab77dd0d8fd84ec0a78b27d2ae7821
--- /dev/null
+++ b/neuronxcc-2.21.33363.0+82129205/MODULE_dd366124647aeec64074+a02c3a36/wrapped_neff.hlo
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0dca44e3cebd5178ce615300b84bd8ad2e980993e5fc672a7bb53b6576a4e981
+size 21949799