jburtoft commited on
Commit
0312d1a
·
verified ·
1 Parent(s): 7538c5c

Trinity-Nano compiled for Neuron TP=1 BS=1 seq_len=2048 (SDK 2.28, pre-sharded weights)

Browse files
model.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8151b534bf4a6983ada3193285236c6e00698f33a847e72cc3b755694a5191ad
3
+ size 50660107
neuron_config.json ADDED
@@ -0,0 +1,312 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "attention_bias": false,
3
+ "attention_dropout": 0.0,
4
+ "bos_token_id": null,
5
+ "dense_intermediate_size": 3072,
6
+ "eos_token_id": null,
7
+ "fused_spec_config": null,
8
+ "global_attn_every_n_layers": 4,
9
+ "head_dim": 128,
10
+ "hidden_act": "silu",
11
+ "hidden_size": 1024,
12
+ "initializer_range": 0.02,
13
+ "intermediate_size": 256,
14
+ "layer_types": [
15
+ "sliding_attention",
16
+ "sliding_attention",
17
+ "sliding_attention",
18
+ "full_attention",
19
+ "sliding_attention",
20
+ "sliding_attention",
21
+ "sliding_attention",
22
+ "full_attention",
23
+ "sliding_attention",
24
+ "sliding_attention",
25
+ "sliding_attention",
26
+ "full_attention",
27
+ "sliding_attention",
28
+ "sliding_attention",
29
+ "sliding_attention",
30
+ "full_attention",
31
+ "sliding_attention",
32
+ "sliding_attention",
33
+ "sliding_attention",
34
+ "full_attention",
35
+ "sliding_attention",
36
+ "sliding_attention",
37
+ "sliding_attention",
38
+ "full_attention",
39
+ "sliding_attention",
40
+ "sliding_attention",
41
+ "sliding_attention",
42
+ "full_attention",
43
+ "sliding_attention",
44
+ "sliding_attention",
45
+ "sliding_attention",
46
+ "full_attention",
47
+ "sliding_attention",
48
+ "sliding_attention",
49
+ "sliding_attention",
50
+ "full_attention",
51
+ "sliding_attention",
52
+ "sliding_attention",
53
+ "sliding_attention",
54
+ "full_attention",
55
+ "sliding_attention",
56
+ "sliding_attention",
57
+ "sliding_attention",
58
+ "full_attention",
59
+ "sliding_attention",
60
+ "sliding_attention",
61
+ "sliding_attention",
62
+ "full_attention",
63
+ "sliding_attention",
64
+ "sliding_attention",
65
+ "sliding_attention",
66
+ "full_attention",
67
+ "sliding_attention",
68
+ "sliding_attention",
69
+ "sliding_attention",
70
+ "full_attention"
71
+ ],
72
+ "load_balance_coeff": 0.001,
73
+ "max_position_embeddings": 131072,
74
+ "metadata": null,
75
+ "moe_intermediate_size": 256,
76
+ "mup_enabled": true,
77
+ "n_group": 1,
78
+ "n_shared_experts": 0,
79
+ "neuron_config": {
80
+ "activation_quantization_type": null,
81
+ "allow_input_truncation": false,
82
+ "apply_seq_ids_mask": false,
83
+ "async_mode": false,
84
+ "attention_dp_degree": 1,
85
+ "attention_dtype": null,
86
+ "attn_block_cte_nki_kernel_enabled": false,
87
+ "attn_block_tkg_nki_kernel_cache_update": false,
88
+ "attn_block_tkg_nki_kernel_cascaded_attention": false,
89
+ "attn_block_tkg_nki_kernel_disable_gpsimd_sb2sb": false,
90
+ "attn_block_tkg_nki_kernel_enabled": false,
91
+ "attn_block_tkg_nki_kernel_use_online_softmax": true,
92
+ "attn_cls": "NeuronLlamaAttention",
93
+ "attn_kernel_enabled": null,
94
+ "attn_tkg_builtin_kernel_enabled": false,
95
+ "attn_tkg_nki_kernel_enabled": false,
96
+ "batch_size": 1,
97
+ "blockwise_matmul_config": {
98
+ "always_augment_inputs_for_blockwise_matmul": false,
99
+ "block_sharding_strategy": {
100
+ "__objclass__": {
101
+ "__module__": "neuronxcc.nki._pre_prod_kernels.blockwise_mm",
102
+ "__name__": "BlockShardStrategy"
103
+ },
104
+ "_name_": "HI_LO",
105
+ "_sort_order_": 0,
106
+ "_value_": 0
107
+ },
108
+ "block_size": 512,
109
+ "blockwise_nki_autograd_cls": null,
110
+ "logical_nc_config": {
111
+ "__objclass__": {
112
+ "__module__": "neuronx_distributed.utils.model_utils",
113
+ "__name__": "LogicalNCConfig"
114
+ },
115
+ "_name_": "LNC_1",
116
+ "_sort_order_": 0,
117
+ "_value_": 1
118
+ },
119
+ "num_static_blocks": null,
120
+ "optimized_block_to_token_mapping": true,
121
+ "pad_num_blocks_to_even": false,
122
+ "parallelize_token_to_block_mapping": true,
123
+ "skip_dma_token": false,
124
+ "skip_dma_weight": false,
125
+ "use_block_parallel": false,
126
+ "use_shard_on_block_dynamic_while": false,
127
+ "use_shard_on_intermediate_dynamic_while": false,
128
+ "use_torch_block_wise": false
129
+ },
130
+ "bucket_n_active_tokens": false,
131
+ "buckets": [
132
+ 2048
133
+ ],
134
+ "capacity_factor": null,
135
+ "cast_type": "config",
136
+ "cc_pipeline_tiling_factor": 2,
137
+ "chunked_prefill_config": null,
138
+ "context_encoding_buckets": null,
139
+ "cp_degree": 1,
140
+ "ctx_batch_size": 1,
141
+ "disable_argmax_kernel": false,
142
+ "disable_kv_cache_tiling": false,
143
+ "disable_numeric_cc_token": false,
144
+ "dma_order_config": null,
145
+ "draft_model_modules_to_not_convert": null,
146
+ "eagle_rolling_buffer_kernel_enabled": false,
147
+ "early_expert_affinity_modulation": false,
148
+ "enable_bucketing": false,
149
+ "enable_cte_modular_flow": false,
150
+ "enable_eagle_draft_input_norm": false,
151
+ "enable_eagle_speculation": false,
152
+ "enable_fused_speculation": false,
153
+ "enable_long_context_mode": false,
154
+ "enable_output_completion_notifications": false,
155
+ "enable_spill_reload_dge": false,
156
+ "enable_token_tree": false,
157
+ "ep_degree": 1,
158
+ "ep_dispatch_cc_option": "AR_AG",
159
+ "expert_mlp_nki_kernel_enabled": null,
160
+ "flash_decoding_enabled": false,
161
+ "fused_qkv": false,
162
+ "fused_rmsnorm_skip_gamma": false,
163
+ "fused_shared_experts": false,
164
+ "gate_clamp_lower_limit": null,
165
+ "gate_clamp_upper_limit": null,
166
+ "glu_mlp": true,
167
+ "glu_type": "glu",
168
+ "hidden_act_bias": 0.0,
169
+ "hidden_act_scaling_factor": 1.0,
170
+ "hybrid_sharding_config": null,
171
+ "is_block_kv_layout": false,
172
+ "is_chunked_prefill": false,
173
+ "is_continuous_batching": false,
174
+ "is_eagle3": false,
175
+ "is_eagle_draft": false,
176
+ "is_full_model_shuffled": false,
177
+ "is_hidden_dim_shuffled": false,
178
+ "is_intermediate_dim_shuffled": false,
179
+ "is_medusa": false,
180
+ "is_mxfp4_compute": false,
181
+ "is_prefill_stage": null,
182
+ "is_prefix_caching": false,
183
+ "k_cache_transposed": false,
184
+ "kv_cache_batch_size": 1,
185
+ "kv_cache_padding_size": 0,
186
+ "kv_cache_quant": false,
187
+ "kv_cache_tiling": false,
188
+ "kv_cache_update_with_kernel": false,
189
+ "kv_quant_config": null,
190
+ "layer_boundary_markers": false,
191
+ "lm_head_pad": false,
192
+ "lm_head_pad_alignment_size": 1,
193
+ "local_ranks_size": 1,
194
+ "logical_nc_config": 1,
195
+ "lora_config": null,
196
+ "max_batch_size": 1,
197
+ "max_context_length": 2048,
198
+ "max_length": 2048,
199
+ "max_new_tokens": null,
200
+ "medusa_speculation_length": 0,
201
+ "medusa_tree": null,
202
+ "mlp_cp_degree": 1,
203
+ "mlp_kernel_enabled": false,
204
+ "mlp_kernel_fuse_residual_add": false,
205
+ "mlp_tkg_nki_kernel_enabled": false,
206
+ "modules_to_not_convert": null,
207
+ "moe_ep_degree": 1,
208
+ "moe_fused_nki_kernel_enabled": null,
209
+ "moe_mask_padded_tokens": false,
210
+ "moe_tp_degree": 1,
211
+ "n_active_tokens": 2048,
212
+ "n_positions": 2048,
213
+ "normalize_top_k_affinities": true,
214
+ "num_medusa_heads": 0,
215
+ "on_cpu": false,
216
+ "on_device_sampling_config": null,
217
+ "out_proj_kernel_enabled": false,
218
+ "output_logits": false,
219
+ "overrides_torch_dtype": true,
220
+ "pa_block_size": 2048,
221
+ "pa_num_blocks": 1,
222
+ "padded_hidden_size": null,
223
+ "padded_intermediate_size": null,
224
+ "padding_side": "right",
225
+ "pp_degree": 1,
226
+ "pre_rope_rmsnorm": false,
227
+ "prefix_buckets": null,
228
+ "qk_layernorm": false,
229
+ "qkv_cte_nki_kernel_fuse_rope": false,
230
+ "qkv_kernel_enabled": false,
231
+ "qkv_kernel_fuse_residual_add": false,
232
+ "qkv_kernel_nbsd_layout": false,
233
+ "qkv_nki_kernel_enabled": false,
234
+ "quantization_block_axis": null,
235
+ "quantization_block_size": null,
236
+ "quantization_dtype": "int8",
237
+ "quantization_scale_dtype": "f32",
238
+ "quantization_type": "per_tensor_symmetric",
239
+ "quantize_clamp_bound": Infinity,
240
+ "quantized": false,
241
+ "quantized_checkpoints_path": null,
242
+ "quantized_mlp_kernel_enabled": false,
243
+ "return_expert_index": false,
244
+ "return_router_logits": false,
245
+ "rmsnorm_quantize_kernel_enabled": false,
246
+ "router_config": {
247
+ "act_fn": "sigmoid",
248
+ "dtype": "float32"
249
+ },
250
+ "router_topk_nki_kernel_enabled": null,
251
+ "rpl_reduce_dtype": null,
252
+ "save_sharded_checkpoint": true,
253
+ "scratchpad_page_size": null,
254
+ "seq_len": 2048,
255
+ "seq_len_threshold_for_cc_tiling": 16384,
256
+ "sequence_parallel_enabled": false,
257
+ "shared_experts_sequence_parallel_enabled": false,
258
+ "shared_mlp_nki_kernel_enabled": null,
259
+ "skip_sharding": false,
260
+ "skip_warmup": false,
261
+ "spec_batch_size": 1,
262
+ "speculation_length": 0,
263
+ "start_rank_id": 0,
264
+ "strided_context_parallel_kernel_enabled": false,
265
+ "switch_cc": false,
266
+ "target": null,
267
+ "tensor_capture_config": null,
268
+ "tensor_replacement_config": null,
269
+ "tile_cc": false,
270
+ "tkg_batch_size": 1,
271
+ "token_generation_batches": null,
272
+ "token_generation_buckets": null,
273
+ "token_tree_config": null,
274
+ "torch_dtype": "bfloat16",
275
+ "tp_degree": 1,
276
+ "transpose_shared_experts_weights": false,
277
+ "up_clamp_lower_limit": null,
278
+ "up_clamp_upper_limit": null,
279
+ "use_index_calc_kernel": false,
280
+ "vocab_parallel": false,
281
+ "weight_gather_seq_len_threshold": 32768,
282
+ "weights_to_skip_layout_optimization": [],
283
+ "windowed_context_encoding_size": null,
284
+ "world_size": 1
285
+ },
286
+ "num_attention_heads": 8,
287
+ "num_cores_per_group": 1,
288
+ "num_dense_layers": 2,
289
+ "num_expert_groups": 1,
290
+ "num_experts": 128,
291
+ "num_experts_per_tok": 8,
292
+ "num_hidden_layers": 56,
293
+ "num_key_value_heads": 2,
294
+ "num_limited_groups": 1,
295
+ "num_local_experts": 128,
296
+ "num_shared_experts": 1,
297
+ "output_attentions": false,
298
+ "output_hidden_states": false,
299
+ "pad_token_id": null,
300
+ "rms_norm_eps": 1e-05,
301
+ "rope_scaling": null,
302
+ "rope_theta": 10000,
303
+ "route_norm": true,
304
+ "route_scale": 2.826,
305
+ "score_func": "sigmoid",
306
+ "sliding_window": 2048,
307
+ "tie_word_embeddings": false,
308
+ "topk_group": 1,
309
+ "torch_dtype": "bfloat16",
310
+ "use_cache": true,
311
+ "vocab_size": 200192
312
+ }
weights/tp0_sharded_checkpoint.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:113b1eca4ea47fced2e333b79daa470b01e12fc4c64820cb977d5e6473275bc1
3
+ size 12240134252