diff --git "a/qwen3_tts/multi_code_decoder/12hz-1.7b-customvoice/W8A16/MultiCodeDecoder.mlmodelc/model.mil" "b/qwen3_tts/multi_code_decoder/12hz-1.7b-customvoice/W8A16/MultiCodeDecoder.mlmodelc/model.mil"
new file mode 100644--- /dev/null
+++ "b/qwen3_tts/multi_code_decoder/12hz-1.7b-customvoice/W8A16/MultiCodeDecoder.mlmodelc/model.mil"
@@ -0,0 +1,1377 @@
+program(1.3)
+[buildInfo = dict<string, string>({{"coremlc-component-MIL", "3510.2.1"}, {"coremlc-version", "3500.32.1"}})]
+{
+    func main<ios18>(tensor<int32, [1]> cache_length, tensor<fp16, [1, 2048, 1, 1]> input_embeds, tensor<fp16, [1, 5120, 1, 16]> key_cache, tensor<fp16, [1, 16]> key_padding_mask, tensor<fp16, [1, 16]> kv_cache_update_mask, tensor<fp16, [1, 5120, 1, 16]> value_cache) {
+            string inputs_1_pad_type_0 = const()[name = string("inputs_1_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> inputs_1_strides_0 = const()[name = string("inputs_1_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> inputs_1_pad_0 = const()[name = string("inputs_1_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> inputs_1_dilations_0 = const()[name = string("inputs_1_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 inputs_1_groups_0 = const()[name = string("inputs_1_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 2048, 1, 1]> input_projection_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(64))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(2097280))))[name = string("input_projection_weight_to_fp16_palettized")];
+            tensor<fp16, [1024]> input_projection_bias_to_fp16 = const()[name = string("input_projection_bias_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(2097856)))];
+            tensor<fp16, [1, 1024, 1, 1]> inputs_1_cast_fp16 = conv(bias = input_projection_bias_to_fp16, dilations = inputs_1_dilations_0, groups = inputs_1_groups_0, pad = inputs_1_pad_0, pad_type = inputs_1_pad_type_0, strides = inputs_1_strides_0, weight = input_projection_weight_to_fp16_palettized, x = input_embeds)[name = string("inputs_1_cast_fp16")];
+            int32 pos_cos_batch_dims_0 = const()[name = string("pos_cos_batch_dims_0"), val = int32(0)];
+            bool pos_cos_validate_indices_0 = const()[name = string("pos_cos_validate_indices_0"), val = bool(false)];
+            tensor<fp16, [16, 128]> position_embeddings_cos_weight_to_fp16 = const()[name = string("position_embeddings_cos_weight_to_fp16"), val = tensor<fp16, [16, 128]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(2099968)))];
+            string cache_length_to_int16_dtype_0 = const()[name = string("cache_length_to_int16_dtype_0"), val = string("int16")];
+            string cast_111_dtype_0 = const()[name = string("cast_111_dtype_0"), val = string("int32")];
+            int32 greater_equal_0_y_0 = const()[name = string("greater_equal_0_y_0"), val = int32(0)];
+            tensor<int16, [1]> cache_length_to_int16 = cast(dtype = cache_length_to_int16_dtype_0, x = cache_length)[name = string("cast_5")];
+            tensor<int32, [1]> cast_111 = cast(dtype = cast_111_dtype_0, x = cache_length_to_int16)[name = string("cast_4")];
+            tensor<bool, [1]> greater_equal_0 = greater_equal(x = cast_111, y = greater_equal_0_y_0)[name = string("greater_equal_0")];
+            int32 slice_by_index_0 = const()[name = string("slice_by_index_0"), val = int32(16)];
+            tensor<int32, [1]> add_0 = add(x = cast_111, y = slice_by_index_0)[name = string("add_0")];
+            tensor<int32, [1]> select_0 = select(a = cast_111, b = add_0, cond = greater_equal_0)[name = string("select_0")];
+            string select_0_to_int16_dtype_0 = const()[name = string("select_0_to_int16_dtype_0"), val = string("int16")];
+            string cast_0_dtype_0 = const()[name = string("cast_0_dtype_0"), val = string("int32")];
+            int32 greater_equal_0_y_0_1 = const()[name = string("greater_equal_0_y_0_1"), val = int32(0)];
+            tensor<int16, [1]> select_0_to_int16 = cast(dtype = select_0_to_int16_dtype_0, x = select_0)[name = string("cast_3")];
+            tensor<int32, [1]> cast_0 = cast(dtype = cast_0_dtype_0, x = select_0_to_int16)[name = string("cast_2")];
+            tensor<bool, [1]> greater_equal_0_1 = greater_equal(x = cast_0, y = greater_equal_0_y_0_1)[name = string("greater_equal_0_1")];
+            int32 slice_by_index_0_1 = const()[name = string("slice_by_index_0_1"), val = int32(16)];
+            tensor<int32, [1]> add_0_1 = add(x = cast_0, y = slice_by_index_0_1)[name = string("add_0_1")];
+            tensor<int32, [1]> select_0_1 = select(a = cast_0, b = add_0_1, cond = greater_equal_0_1)[name = string("select_0_1")];
+            int32 pos_cos_cast_fp16_cast_uint16_cast_uint16_axis_0 = const()[name = string("pos_cos_cast_fp16_cast_uint16_cast_uint16_axis_0"), val = int32(0)];
+            tensor<fp16, [1, 128]> pos_cos_cast_fp16_cast_uint16_cast_uint16 = gather(axis = pos_cos_cast_fp16_cast_uint16_cast_uint16_axis_0, batch_dims = pos_cos_batch_dims_0, indices = select_0_1, validate_indices = pos_cos_validate_indices_0, x = position_embeddings_cos_weight_to_fp16)[name = string("pos_cos_cast_fp16_cast_uint16_cast_uint16")];
+            tensor<int32, [1]> obj_7_axes_0 = const()[name = string("obj_7_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 128, 1]> obj_7_cast_fp16 = expand_dims(axes = obj_7_axes_0, x = pos_cos_cast_fp16_cast_uint16_cast_uint16)[name = string("obj_7_cast_fp16")];
+            int32 pos_sin_axis_0 = const()[name = string("pos_sin_axis_0"), val = int32(0)];
+            int32 pos_sin_batch_dims_0 = const()[name = string("pos_sin_batch_dims_0"), val = int32(0)];
+            bool pos_sin_validate_indices_0 = const()[name = string("pos_sin_validate_indices_0"), val = bool(false)];
+            tensor<fp16, [16, 128]> position_embeddings_sin_weight_to_fp16 = const()[name = string("position_embeddings_sin_weight_to_fp16"), val = tensor<fp16, [16, 128]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(2104128)))];
+            string cache_length_to_uint16_dtype_0 = const()[name = string("cache_length_to_uint16_dtype_0"), val = string("uint16")];
+            tensor<uint16, [1]> cache_length_to_uint16 = cast(dtype = cache_length_to_uint16_dtype_0, x = cache_length)[name = string("cast_1")];
+            tensor<fp16, [1, 128]> pos_sin_cast_fp16_cast_uint16 = gather(axis = pos_sin_axis_0, batch_dims = pos_sin_batch_dims_0, indices = cache_length_to_uint16, validate_indices = pos_sin_validate_indices_0, x = position_embeddings_sin_weight_to_fp16)[name = string("pos_sin_cast_fp16_cast_uint16")];
+            tensor<int32, [1]> obj_9_axes_0 = const()[name = string("obj_9_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 128, 1]> obj_9_cast_fp16 = expand_dims(axes = obj_9_axes_0, x = pos_sin_cast_fp16_cast_uint16)[name = string("obj_9_cast_fp16")];
+            tensor<int32, [5]> tile_0 = const()[name = string("tile_0"), val = tensor<int32, [5]>([1024, 1024, 1024, 1024, 1024])];
+            int32 var_96_axis_0 = const()[name = string("op_96_axis_0"), val = int32(1)];
+            tensor<fp16, [1, 1024, 1, 16]> var_96_cast_fp16_0, tensor<fp16, [1, 1024, 1, 16]> var_96_cast_fp16_1, tensor<fp16, [1, 1024, 1, 16]> var_96_cast_fp16_2, tensor<fp16, [1, 1024, 1, 16]> var_96_cast_fp16_3, tensor<fp16, [1, 1024, 1, 16]> var_96_cast_fp16_4 = split(axis = var_96_axis_0, split_sizes = tile_0, x = key_cache)[name = string("op_96_cast_fp16")];
+            tensor<int32, [5]> tile_1 = const()[name = string("tile_1"), val = tensor<int32, [5]>([1024, 1024, 1024, 1024, 1024])];
+            int32 var_104_axis_0 = const()[name = string("op_104_axis_0"), val = int32(1)];
+            tensor<fp16, [1, 1024, 1, 16]> var_104_cast_fp16_0, tensor<fp16, [1, 1024, 1, 16]> var_104_cast_fp16_1, tensor<fp16, [1, 1024, 1, 16]> var_104_cast_fp16_2, tensor<fp16, [1, 1024, 1, 16]> var_104_cast_fp16_3, tensor<fp16, [1, 1024, 1, 16]> var_104_cast_fp16_4 = split(axis = var_104_axis_0, split_sizes = tile_1, x = value_cache)[name = string("op_104_cast_fp16")];
+            int32 var_111 = const()[name = string("op_111"), val = int32(3)];
+            int32 var_121 = const()[name = string("op_121"), val = int32(-2)];
+            int32 var_129 = const()[name = string("op_129"), val = int32(1)];
+            tensor<fp16, [1, 1024, 1, 1]> inputs_sq_1_cast_fp16 = mul(x = inputs_1_cast_fp16, y = inputs_1_cast_fp16)[name = string("inputs_sq_1_cast_fp16")];
+            tensor<int32, [1]> variance_1_axes_0 = const()[name = string("variance_1_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_1_keep_dims_0 = const()[name = string("variance_1_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 1, 1, 1]> variance_1_cast_fp16 = reduce_mean(axes = variance_1_axes_0, keep_dims = variance_1_keep_dims_0, x = inputs_sq_1_cast_fp16)[name = string("variance_1_cast_fp16")];
+            fp16 var_141_to_fp16 = const()[name = string("op_141_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 1, 1]> var_142_cast_fp16 = add(x = variance_1_cast_fp16, y = var_141_to_fp16)[name = string("op_142_cast_fp16")];
+            fp32 var_143_epsilon_0 = const()[name = string("op_143_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 1, 1, 1]> var_143_cast_fp16 = rsqrt(epsilon = var_143_epsilon_0, x = var_142_cast_fp16)[name = string("op_143_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> hidden_states_1_cast_fp16 = mul(x = inputs_1_cast_fp16, y = var_143_cast_fp16)[name = string("hidden_states_1_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> w_1_to_fp16 = const()[name = string("w_1_to_fp16"), val = tensor<fp16, [1, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(2108288)))];
+            tensor<fp16, [1, 1024, 1, 1]> obj_1_cast_fp16 = mul(x = w_1_to_fp16, y = hidden_states_1_cast_fp16)[name = string("obj_1_cast_fp16")];
+            string query_1_pad_type_0 = const()[name = string("query_1_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> query_1_strides_0 = const()[name = string("query_1_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> query_1_pad_0 = const()[name = string("query_1_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> query_1_dilations_0 = const()[name = string("query_1_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 query_1_groups_0 = const()[name = string("query_1_groups_0"), val = int32(1)];
+            tensor<fp16, [2048, 1024, 1, 1]> layers_0_self_attn_q_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [2048, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(2110400))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(4207616))))[name = string("layers_0_self_attn_q_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [2048]> layers_0_self_attn_q_proj_bias_to_fp16 = const()[name = string("layers_0_self_attn_q_proj_bias_to_fp16"), val = tensor<fp16, [2048]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(4208192)))];
+            tensor<fp16, [1, 2048, 1, 1]> query_1_cast_fp16 = conv(bias = layers_0_self_attn_q_proj_bias_to_fp16, dilations = query_1_dilations_0, groups = query_1_groups_0, pad = query_1_pad_0, pad_type = query_1_pad_type_0, strides = query_1_strides_0, weight = layers_0_self_attn_q_proj_weight_to_fp16_palettized, x = obj_1_cast_fp16)[name = string("query_1_cast_fp16")];
+            string current_key_1_pad_type_0 = const()[name = string("current_key_1_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> current_key_1_strides_0 = const()[name = string("current_key_1_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> current_key_1_pad_0 = const()[name = string("current_key_1_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> current_key_1_dilations_0 = const()[name = string("current_key_1_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 current_key_1_groups_0 = const()[name = string("current_key_1_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_0_self_attn_k_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(4212352))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(5260992))))[name = string("layers_0_self_attn_k_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> current_key_1_cast_fp16 = conv(dilations = current_key_1_dilations_0, groups = current_key_1_groups_0, pad = current_key_1_pad_0, pad_type = current_key_1_pad_type_0, strides = current_key_1_strides_0, weight = layers_0_self_attn_k_proj_weight_to_fp16_palettized, x = obj_1_cast_fp16)[name = string("current_key_1_cast_fp16")];
+            string current_value_1_pad_type_0 = const()[name = string("current_value_1_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> current_value_1_strides_0 = const()[name = string("current_value_1_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> current_value_1_pad_0 = const()[name = string("current_value_1_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> current_value_1_dilations_0 = const()[name = string("current_value_1_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 current_value_1_groups_0 = const()[name = string("current_value_1_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_0_self_attn_v_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(5261568))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(6310208))))[name = string("layers_0_self_attn_v_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1024]> layers_0_self_attn_v_proj_bias_to_fp16 = const()[name = string("layers_0_self_attn_v_proj_bias_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(6310784)))];
+            tensor<fp16, [1, 1024, 1, 1]> current_value_1_cast_fp16 = conv(bias = layers_0_self_attn_v_proj_bias_to_fp16, dilations = current_value_1_dilations_0, groups = current_value_1_groups_0, pad = current_value_1_pad_0, pad_type = current_value_1_pad_type_0, strides = current_value_1_strides_0, weight = layers_0_self_attn_v_proj_weight_to_fp16_palettized, x = obj_1_cast_fp16)[name = string("current_value_1_cast_fp16")];
+            tensor<int32, [4]> var_180 = const()[name = string("op_180"), val = tensor<int32, [4]>([16, 128, 1, 1])];
+            tensor<fp16, [16, 128, 1, 1]> inputs_3_cast_fp16 = reshape(shape = var_180, x = query_1_cast_fp16)[name = string("inputs_3_cast_fp16")];
+            tensor<fp16, [16, 128, 1, 1]> inputs_sq_3_cast_fp16 = mul(x = inputs_3_cast_fp16, y = inputs_3_cast_fp16)[name = string("inputs_sq_3_cast_fp16")];
+            tensor<int32, [1]> variance_3_axes_0 = const()[name = string("variance_3_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_3_keep_dims_0 = const()[name = string("variance_3_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [16, 1, 1, 1]> variance_3_cast_fp16 = reduce_mean(axes = variance_3_axes_0, keep_dims = variance_3_keep_dims_0, x = inputs_sq_3_cast_fp16)[name = string("variance_3_cast_fp16")];
+            fp16 var_186_to_fp16 = const()[name = string("op_186_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [16, 1, 1, 1]> var_187_cast_fp16 = add(x = variance_3_cast_fp16, y = var_186_to_fp16)[name = string("op_187_cast_fp16")];
+            fp32 var_188_epsilon_0 = const()[name = string("op_188_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [16, 1, 1, 1]> var_188_cast_fp16 = rsqrt(epsilon = var_188_epsilon_0, x = var_187_cast_fp16)[name = string("op_188_cast_fp16")];
+            tensor<fp16, [16, 128, 1, 1]> hidden_states_3_cast_fp16 = mul(x = inputs_3_cast_fp16, y = var_188_cast_fp16)[name = string("hidden_states_3_cast_fp16")];
+            tensor<fp16, [1, 128, 1, 1]> w_3_to_fp16 = const()[name = string("w_3_to_fp16"), val = tensor<fp16, [1, 128, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(6312896)))];
+            tensor<fp16, [16, 128, 1, 1]> query_normed_1_cast_fp16 = mul(x = w_3_to_fp16, y = hidden_states_3_cast_fp16)[name = string("query_normed_1_cast_fp16")];
+            tensor<int32, [4]> var_196 = const()[name = string("op_196"), val = tensor<int32, [4]>([8, 128, 1, 1])];
+            tensor<fp16, [8, 128, 1, 1]> inputs_5_cast_fp16 = reshape(shape = var_196, x = current_key_1_cast_fp16)[name = string("inputs_5_cast_fp16")];
+            tensor<fp16, [8, 128, 1, 1]> inputs_sq_5_cast_fp16 = mul(x = inputs_5_cast_fp16, y = inputs_5_cast_fp16)[name = string("inputs_sq_5_cast_fp16")];
+            tensor<int32, [1]> variance_5_axes_0 = const()[name = string("variance_5_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_5_keep_dims_0 = const()[name = string("variance_5_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [8, 1, 1, 1]> variance_5_cast_fp16 = reduce_mean(axes = variance_5_axes_0, keep_dims = variance_5_keep_dims_0, x = inputs_sq_5_cast_fp16)[name = string("variance_5_cast_fp16")];
+            fp16 var_202_to_fp16 = const()[name = string("op_202_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [8, 1, 1, 1]> var_203_cast_fp16 = add(x = variance_5_cast_fp16, y = var_202_to_fp16)[name = string("op_203_cast_fp16")];
+            fp32 var_204_epsilon_0 = const()[name = string("op_204_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [8, 1, 1, 1]> var_204_cast_fp16 = rsqrt(epsilon = var_204_epsilon_0, x = var_203_cast_fp16)[name = string("op_204_cast_fp16")];
+            tensor<fp16, [8, 128, 1, 1]> hidden_states_5_cast_fp16 = mul(x = inputs_5_cast_fp16, y = var_204_cast_fp16)[name = string("hidden_states_5_cast_fp16")];
+            tensor<fp16, [1, 128, 1, 1]> w_5_to_fp16 = const()[name = string("w_5_to_fp16"), val = tensor<fp16, [1, 128, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(6313216)))];
+            tensor<fp16, [8, 128, 1, 1]> current_key_normed_1_cast_fp16 = mul(x = w_5_to_fp16, y = hidden_states_5_cast_fp16)[name = string("current_key_normed_1_cast_fp16")];
+            tensor<int32, [4]> var_222 = const()[name = string("op_222"), val = tensor<int32, [4]>([1, 16, 128, -1])];
+            tensor<fp16, [1, 16, 128, 1]> mh_q_1_cast_fp16 = reshape(shape = var_222, x = query_normed_1_cast_fp16)[name = string("mh_q_1_cast_fp16")];
+            tensor<int32, [4]> var_224 = const()[name = string("op_224"), val = tensor<int32, [4]>([1, 8, 128, -1])];
+            tensor<fp16, [1, 8, 128, 1]> mh_k_1_cast_fp16 = reshape(shape = var_224, x = current_key_normed_1_cast_fp16)[name = string("mh_k_1_cast_fp16")];
+            tensor<int32, [1]> cos_1_axes_0 = const()[name = string("cos_1_axes_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [1, 1, 128, 1]> cos_1_cast_fp16 = expand_dims(axes = cos_1_axes_0, x = obj_7_cast_fp16)[name = string("cos_1_cast_fp16")];
+            tensor<int32, [1]> sin_1_axes_0 = const()[name = string("sin_1_axes_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [1, 1, 128, 1]> sin_1_cast_fp16 = expand_dims(axes = sin_1_axes_0, x = obj_9_cast_fp16)[name = string("sin_1_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> var_228_cast_fp16 = mul(x = mh_q_1_cast_fp16, y = cos_1_cast_fp16)[name = string("op_228_cast_fp16")];
+            tensor<int32, [4]> var_233_begin_0 = const()[name = string("op_233_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_233_end_0 = const()[name = string("op_233_end_0"), val = tensor<int32, [4]>([1, 16, 64, 1])];
+            tensor<bool, [4]> var_233_end_mask_0 = const()[name = string("op_233_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 16, 64, 1]> var_233_cast_fp16 = slice_by_index(begin = var_233_begin_0, end = var_233_end_0, end_mask = var_233_end_mask_0, x = mh_q_1_cast_fp16)[name = string("op_233_cast_fp16")];
+            tensor<int32, [4]> var_239_begin_0 = const()[name = string("op_239_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_239_end_0 = const()[name = string("op_239_end_0"), val = tensor<int32, [4]>([1, 16, 128, 1])];
+            tensor<bool, [4]> var_239_end_mask_0 = const()[name = string("op_239_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 16, 64, 1]> var_239_cast_fp16 = slice_by_index(begin = var_239_begin_0, end = var_239_end_0, end_mask = var_239_end_mask_0, x = mh_q_1_cast_fp16)[name = string("op_239_cast_fp16")];
+            fp16 const_17_promoted_to_fp16 = const()[name = string("const_17_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 16, 64, 1]> var_241_cast_fp16 = mul(x = var_239_cast_fp16, y = const_17_promoted_to_fp16)[name = string("op_241_cast_fp16")];
+            bool var_243_interleave_0 = const()[name = string("op_243_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 1]> var_243_cast_fp16 = concat(axis = var_121, interleave = var_243_interleave_0, values = (var_241_cast_fp16, var_233_cast_fp16))[name = string("op_243_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> var_244_cast_fp16 = mul(x = var_243_cast_fp16, y = sin_1_cast_fp16)[name = string("op_244_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> mh_q_3_cast_fp16 = add(x = var_228_cast_fp16, y = var_244_cast_fp16)[name = string("mh_q_3_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> var_246_cast_fp16 = mul(x = mh_k_1_cast_fp16, y = cos_1_cast_fp16)[name = string("op_246_cast_fp16")];
+            tensor<int32, [4]> var_251_begin_0 = const()[name = string("op_251_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_251_end_0 = const()[name = string("op_251_end_0"), val = tensor<int32, [4]>([1, 8, 64, 1])];
+            tensor<bool, [4]> var_251_end_mask_0 = const()[name = string("op_251_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 8, 64, 1]> var_251_cast_fp16 = slice_by_index(begin = var_251_begin_0, end = var_251_end_0, end_mask = var_251_end_mask_0, x = mh_k_1_cast_fp16)[name = string("op_251_cast_fp16")];
+            tensor<int32, [4]> var_257_begin_0 = const()[name = string("op_257_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_257_end_0 = const()[name = string("op_257_end_0"), val = tensor<int32, [4]>([1, 8, 128, 1])];
+            tensor<bool, [4]> var_257_end_mask_0 = const()[name = string("op_257_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 8, 64, 1]> var_257_cast_fp16 = slice_by_index(begin = var_257_begin_0, end = var_257_end_0, end_mask = var_257_end_mask_0, x = mh_k_1_cast_fp16)[name = string("op_257_cast_fp16")];
+            fp16 const_20_promoted_to_fp16 = const()[name = string("const_20_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 64, 1]> var_259_cast_fp16 = mul(x = var_257_cast_fp16, y = const_20_promoted_to_fp16)[name = string("op_259_cast_fp16")];
+            bool var_261_interleave_0 = const()[name = string("op_261_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 128, 1]> var_261_cast_fp16 = concat(axis = var_121, interleave = var_261_interleave_0, values = (var_259_cast_fp16, var_251_cast_fp16))[name = string("op_261_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> var_262_cast_fp16 = mul(x = var_261_cast_fp16, y = sin_1_cast_fp16)[name = string("op_262_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> mh_k_3_cast_fp16 = add(x = var_246_cast_fp16, y = var_262_cast_fp16)[name = string("mh_k_3_cast_fp16")];
+            tensor<int32, [4]> var_266 = const()[name = string("op_266"), val = tensor<int32, [4]>([1, 1024, 1, 1])];
+            tensor<fp16, [1, 1024, 1, 1]> current_key_3_cast_fp16 = reshape(shape = var_266, x = mh_k_3_cast_fp16)[name = string("current_key_3_cast_fp16")];
+            tensor<int32, [1]> var_269_axes_0 = const()[name = string("op_269_axes_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [1, 1, 16]> var_269_cast_fp16 = expand_dims(axes = var_269_axes_0, x = kv_cache_update_mask)[name = string("op_269_cast_fp16")];
+            tensor<int32, [1]> var_270_axes_0 = const()[name = string("op_270_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 1, 1, 16]> var_270_cast_fp16 = expand_dims(axes = var_270_axes_0, x = var_269_cast_fp16)[name = string("op_270_cast_fp16")];
+            fp16 var_122_to_fp16 = const()[name = string("op_122_to_fp16"), val = fp16(0x1p+0)];
+            tensor<fp16, [1, 1, 1, 16]> var_272_cast_fp16 = sub(x = var_122_to_fp16, y = var_270_cast_fp16)[name = string("op_272_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 16]> var_273_cast_fp16 = mul(x = var_96_cast_fp16_0, y = var_272_cast_fp16)[name = string("op_273_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 16]> var_274_cast_fp16 = mul(x = current_key_3_cast_fp16, y = var_270_cast_fp16)[name = string("op_274_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 16]> key_3_cast_fp16 = add(x = var_273_cast_fp16, y = var_274_cast_fp16)[name = string("key_3_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 16]> var_277_cast_fp16 = mul(x = var_104_cast_fp16_0, y = var_272_cast_fp16)[name = string("op_277_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 16]> var_278_cast_fp16 = mul(x = current_value_1_cast_fp16, y = var_270_cast_fp16)[name = string("op_278_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 16]> value_1_cast_fp16 = add(x = var_277_cast_fp16, y = var_278_cast_fp16)[name = string("value_1_cast_fp16")];
+            tensor<int32, [4]> var_282 = const()[name = string("op_282"), val = tensor<int32, [4]>([1, 8, 128, 16])];
+            tensor<fp16, [1, 8, 128, 16]> key_heads_1_cast_fp16 = reshape(shape = var_282, x = key_3_cast_fp16)[name = string("key_heads_1_cast_fp16")];
+            tensor<int32, [4]> var_284 = const()[name = string("op_284"), val = tensor<int32, [4]>([1, 8, 128, 16])];
+            tensor<fp16, [1, 8, 128, 16]> value_heads_1_cast_fp16 = reshape(shape = var_284, x = value_1_cast_fp16)[name = string("value_heads_1_cast_fp16")];
+            tensor<int32, [4]> var_287_begin_0 = const()[name = string("op_287_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_287_end_0 = const()[name = string("op_287_end_0"), val = tensor<int32, [4]>([1, 1, 128, 16])];
+            tensor<bool, [4]> var_287_end_mask_0 = const()[name = string("op_287_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_287_cast_fp16 = slice_by_index(begin = var_287_begin_0, end = var_287_end_0, end_mask = var_287_end_mask_0, x = key_heads_1_cast_fp16)[name = string("op_287_cast_fp16")];
+            tensor<int32, [4]> var_291_begin_0 = const()[name = string("op_291_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_291_end_0 = const()[name = string("op_291_end_0"), val = tensor<int32, [4]>([1, 1, 128, 16])];
+            tensor<bool, [4]> var_291_end_mask_0 = const()[name = string("op_291_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_291_cast_fp16 = slice_by_index(begin = var_291_begin_0, end = var_291_end_0, end_mask = var_291_end_mask_0, x = value_heads_1_cast_fp16)[name = string("op_291_cast_fp16")];
+            tensor<int32, [4]> var_303_begin_0 = const()[name = string("op_303_begin_0"), val = tensor<int32, [4]>([0, 1, 0, 0])];
+            tensor<int32, [4]> var_303_end_0 = const()[name = string("op_303_end_0"), val = tensor<int32, [4]>([1, 2, 128, 16])];
+            tensor<bool, [4]> var_303_end_mask_0 = const()[name = string("op_303_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_303_cast_fp16 = slice_by_index(begin = var_303_begin_0, end = var_303_end_0, end_mask = var_303_end_mask_0, x = key_heads_1_cast_fp16)[name = string("op_303_cast_fp16")];
+            tensor<int32, [4]> var_307_begin_0 = const()[name = string("op_307_begin_0"), val = tensor<int32, [4]>([0, 1, 0, 0])];
+            tensor<int32, [4]> var_307_end_0 = const()[name = string("op_307_end_0"), val = tensor<int32, [4]>([1, 2, 128, 16])];
+            tensor<bool, [4]> var_307_end_mask_0 = const()[name = string("op_307_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_307_cast_fp16 = slice_by_index(begin = var_307_begin_0, end = var_307_end_0, end_mask = var_307_end_mask_0, x = value_heads_1_cast_fp16)[name = string("op_307_cast_fp16")];
+            tensor<int32, [4]> var_319_begin_0 = const()[name = string("op_319_begin_0"), val = tensor<int32, [4]>([0, 2, 0, 0])];
+            tensor<int32, [4]> var_319_end_0 = const()[name = string("op_319_end_0"), val = tensor<int32, [4]>([1, 3, 128, 16])];
+            tensor<bool, [4]> var_319_end_mask_0 = const()[name = string("op_319_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_319_cast_fp16 = slice_by_index(begin = var_319_begin_0, end = var_319_end_0, end_mask = var_319_end_mask_0, x = key_heads_1_cast_fp16)[name = string("op_319_cast_fp16")];
+            tensor<int32, [4]> var_323_begin_0 = const()[name = string("op_323_begin_0"), val = tensor<int32, [4]>([0, 2, 0, 0])];
+            tensor<int32, [4]> var_323_end_0 = const()[name = string("op_323_end_0"), val = tensor<int32, [4]>([1, 3, 128, 16])];
+            tensor<bool, [4]> var_323_end_mask_0 = const()[name = string("op_323_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_323_cast_fp16 = slice_by_index(begin = var_323_begin_0, end = var_323_end_0, end_mask = var_323_end_mask_0, x = value_heads_1_cast_fp16)[name = string("op_323_cast_fp16")];
+            tensor<int32, [4]> var_335_begin_0 = const()[name = string("op_335_begin_0"), val = tensor<int32, [4]>([0, 3, 0, 0])];
+            tensor<int32, [4]> var_335_end_0 = const()[name = string("op_335_end_0"), val = tensor<int32, [4]>([1, 4, 128, 16])];
+            tensor<bool, [4]> var_335_end_mask_0 = const()[name = string("op_335_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_335_cast_fp16 = slice_by_index(begin = var_335_begin_0, end = var_335_end_0, end_mask = var_335_end_mask_0, x = key_heads_1_cast_fp16)[name = string("op_335_cast_fp16")];
+            tensor<int32, [4]> var_339_begin_0 = const()[name = string("op_339_begin_0"), val = tensor<int32, [4]>([0, 3, 0, 0])];
+            tensor<int32, [4]> var_339_end_0 = const()[name = string("op_339_end_0"), val = tensor<int32, [4]>([1, 4, 128, 16])];
+            tensor<bool, [4]> var_339_end_mask_0 = const()[name = string("op_339_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_339_cast_fp16 = slice_by_index(begin = var_339_begin_0, end = var_339_end_0, end_mask = var_339_end_mask_0, x = value_heads_1_cast_fp16)[name = string("op_339_cast_fp16")];
+            tensor<int32, [4]> var_351_begin_0 = const()[name = string("op_351_begin_0"), val = tensor<int32, [4]>([0, 4, 0, 0])];
+            tensor<int32, [4]> var_351_end_0 = const()[name = string("op_351_end_0"), val = tensor<int32, [4]>([1, 5, 128, 16])];
+            tensor<bool, [4]> var_351_end_mask_0 = const()[name = string("op_351_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_351_cast_fp16 = slice_by_index(begin = var_351_begin_0, end = var_351_end_0, end_mask = var_351_end_mask_0, x = key_heads_1_cast_fp16)[name = string("op_351_cast_fp16")];
+            tensor<int32, [4]> var_355_begin_0 = const()[name = string("op_355_begin_0"), val = tensor<int32, [4]>([0, 4, 0, 0])];
+            tensor<int32, [4]> var_355_end_0 = const()[name = string("op_355_end_0"), val = tensor<int32, [4]>([1, 5, 128, 16])];
+            tensor<bool, [4]> var_355_end_mask_0 = const()[name = string("op_355_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_355_cast_fp16 = slice_by_index(begin = var_355_begin_0, end = var_355_end_0, end_mask = var_355_end_mask_0, x = value_heads_1_cast_fp16)[name = string("op_355_cast_fp16")];
+            tensor<int32, [4]> var_367_begin_0 = const()[name = string("op_367_begin_0"), val = tensor<int32, [4]>([0, 5, 0, 0])];
+            tensor<int32, [4]> var_367_end_0 = const()[name = string("op_367_end_0"), val = tensor<int32, [4]>([1, 6, 128, 16])];
+            tensor<bool, [4]> var_367_end_mask_0 = const()[name = string("op_367_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_367_cast_fp16 = slice_by_index(begin = var_367_begin_0, end = var_367_end_0, end_mask = var_367_end_mask_0, x = key_heads_1_cast_fp16)[name = string("op_367_cast_fp16")];
+            tensor<int32, [4]> var_371_begin_0 = const()[name = string("op_371_begin_0"), val = tensor<int32, [4]>([0, 5, 0, 0])];
+            tensor<int32, [4]> var_371_end_0 = const()[name = string("op_371_end_0"), val = tensor<int32, [4]>([1, 6, 128, 16])];
+            tensor<bool, [4]> var_371_end_mask_0 = const()[name = string("op_371_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_371_cast_fp16 = slice_by_index(begin = var_371_begin_0, end = var_371_end_0, end_mask = var_371_end_mask_0, x = value_heads_1_cast_fp16)[name = string("op_371_cast_fp16")];
+            tensor<int32, [4]> var_383_begin_0 = const()[name = string("op_383_begin_0"), val = tensor<int32, [4]>([0, 6, 0, 0])];
+            tensor<int32, [4]> var_383_end_0 = const()[name = string("op_383_end_0"), val = tensor<int32, [4]>([1, 7, 128, 16])];
+            tensor<bool, [4]> var_383_end_mask_0 = const()[name = string("op_383_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_383_cast_fp16 = slice_by_index(begin = var_383_begin_0, end = var_383_end_0, end_mask = var_383_end_mask_0, x = key_heads_1_cast_fp16)[name = string("op_383_cast_fp16")];
+            tensor<int32, [4]> var_387_begin_0 = const()[name = string("op_387_begin_0"), val = tensor<int32, [4]>([0, 6, 0, 0])];
+            tensor<int32, [4]> var_387_end_0 = const()[name = string("op_387_end_0"), val = tensor<int32, [4]>([1, 7, 128, 16])];
+            tensor<bool, [4]> var_387_end_mask_0 = const()[name = string("op_387_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_387_cast_fp16 = slice_by_index(begin = var_387_begin_0, end = var_387_end_0, end_mask = var_387_end_mask_0, x = value_heads_1_cast_fp16)[name = string("op_387_cast_fp16")];
+            tensor<int32, [4]> var_399_begin_0 = const()[name = string("op_399_begin_0"), val = tensor<int32, [4]>([0, 7, 0, 0])];
+            tensor<int32, [4]> var_399_end_0 = const()[name = string("op_399_end_0"), val = tensor<int32, [4]>([1, 1, 128, 16])];
+            tensor<bool, [4]> var_399_end_mask_0 = const()[name = string("op_399_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_399_cast_fp16 = slice_by_index(begin = var_399_begin_0, end = var_399_end_0, end_mask = var_399_end_mask_0, x = key_heads_1_cast_fp16)[name = string("op_399_cast_fp16")];
+            tensor<int32, [4]> var_403_begin_0 = const()[name = string("op_403_begin_0"), val = tensor<int32, [4]>([0, 7, 0, 0])];
+            tensor<int32, [4]> var_403_end_0 = const()[name = string("op_403_end_0"), val = tensor<int32, [4]>([1, 1, 128, 16])];
+            tensor<bool, [4]> var_403_end_mask_0 = const()[name = string("op_403_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_403_cast_fp16 = slice_by_index(begin = var_403_begin_0, end = var_403_end_0, end_mask = var_403_end_mask_0, x = value_heads_1_cast_fp16)[name = string("op_403_cast_fp16")];
+            bool key_heads_3_interleave_0 = const()[name = string("key_heads_3_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 16]> key_heads_3_cast_fp16 = concat(axis = var_129, interleave = key_heads_3_interleave_0, values = (var_287_cast_fp16, var_287_cast_fp16, var_303_cast_fp16, var_303_cast_fp16, var_319_cast_fp16, var_319_cast_fp16, var_335_cast_fp16, var_335_cast_fp16, var_351_cast_fp16, var_351_cast_fp16, var_367_cast_fp16, var_367_cast_fp16, var_383_cast_fp16, var_383_cast_fp16, var_399_cast_fp16, var_399_cast_fp16))[name = string("key_heads_3_cast_fp16")];
+            bool value_heads_3_interleave_0 = const()[name = string("value_heads_3_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 16]> value_heads_3_cast_fp16 = concat(axis = var_129, interleave = value_heads_3_interleave_0, values = (var_291_cast_fp16, var_291_cast_fp16, var_307_cast_fp16, var_307_cast_fp16, var_323_cast_fp16, var_323_cast_fp16, var_339_cast_fp16, var_339_cast_fp16, var_355_cast_fp16, var_355_cast_fp16, var_371_cast_fp16, var_371_cast_fp16, var_387_cast_fp16, var_387_cast_fp16, var_403_cast_fp16, var_403_cast_fp16))[name = string("value_heads_3_cast_fp16")];
+            fp16 var_426_to_fp16 = const()[name = string("op_426_to_fp16"), val = fp16(0x1.6ap-4)];
+            tensor<fp16, [1, 16, 128, 1]> var_427_cast_fp16 = mul(x = mh_q_3_cast_fp16, y = var_426_to_fp16)[name = string("op_427_cast_fp16")];
+            bool mh_w_1_transpose_x_0 = const()[name = string("mh_w_1_transpose_x_0"), val = bool(true)];
+            bool mh_w_1_transpose_y_0 = const()[name = string("mh_w_1_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 1, 16]> mh_w_1_cast_fp16 = matmul(transpose_x = mh_w_1_transpose_x_0, transpose_y = mh_w_1_transpose_y_0, x = var_427_cast_fp16, y = key_heads_3_cast_fp16)[name = string("mh_w_1_cast_fp16")];
+            tensor<int32, [1]> var_435_axes_0 = const()[name = string("op_435_axes_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [1, 1, 16]> var_435_cast_fp16 = expand_dims(axes = var_435_axes_0, x = key_padding_mask)[name = string("op_435_cast_fp16")];
+            tensor<int32, [1]> var_436_axes_0 = const()[name = string("op_436_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 1, 1, 16]> var_436_cast_fp16 = expand_dims(axes = var_436_axes_0, x = var_435_cast_fp16)[name = string("op_436_cast_fp16")];
+            tensor<fp16, [1, 16, 1, 16]> mh_w_3_cast_fp16 = add(x = mh_w_1_cast_fp16, y = var_436_cast_fp16)[name = string("mh_w_3_cast_fp16")];
+            tensor<fp16, [1, 16, 1, 16]> var_439_cast_fp16 = softmax(axis = var_111, x = mh_w_3_cast_fp16)[name = string("op_439_cast_fp16")];
+            bool attn_1_transpose_x_0 = const()[name = string("attn_1_transpose_x_0"), val = bool(false)];
+            bool attn_1_transpose_y_0 = const()[name = string("attn_1_transpose_y_0"), val = bool(true)];
+            tensor<fp16, [1, 16, 128, 1]> attn_1_cast_fp16 = matmul(transpose_x = attn_1_transpose_x_0, transpose_y = attn_1_transpose_y_0, x = value_heads_3_cast_fp16, y = var_439_cast_fp16)[name = string("attn_1_cast_fp16")];
+            tensor<int32, [4]> var_444 = const()[name = string("op_444"), val = tensor<int32, [4]>([1, -1, 1, 1])];
+            tensor<fp16, [1, 2048, 1, 1]> input_1_cast_fp16 = reshape(shape = var_444, x = attn_1_cast_fp16)[name = string("input_1_cast_fp16")];
+            string obj_11_pad_type_0 = const()[name = string("obj_11_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> obj_11_strides_0 = const()[name = string("obj_11_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> obj_11_pad_0 = const()[name = string("obj_11_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> obj_11_dilations_0 = const()[name = string("obj_11_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 obj_11_groups_0 = const()[name = string("obj_11_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 2048, 1, 1]> layers_0_self_attn_o_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(6313536))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(8410752))))[name = string("layers_0_self_attn_o_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> obj_11_cast_fp16 = conv(bias = layers_0_self_attn_v_proj_bias_to_fp16, dilations = obj_11_dilations_0, groups = obj_11_groups_0, pad = obj_11_pad_0, pad_type = obj_11_pad_type_0, strides = obj_11_strides_0, weight = layers_0_self_attn_o_proj_weight_to_fp16_palettized, x = input_1_cast_fp16)[name = string("obj_11_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> inputs_7_cast_fp16 = add(x = inputs_1_cast_fp16, y = obj_11_cast_fp16)[name = string("inputs_7_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> inputs_sq_7_cast_fp16 = mul(x = inputs_7_cast_fp16, y = inputs_7_cast_fp16)[name = string("inputs_sq_7_cast_fp16")];
+            tensor<int32, [1]> variance_7_axes_0 = const()[name = string("variance_7_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_7_keep_dims_0 = const()[name = string("variance_7_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 1, 1, 1]> variance_7_cast_fp16 = reduce_mean(axes = variance_7_axes_0, keep_dims = variance_7_keep_dims_0, x = inputs_sq_7_cast_fp16)[name = string("variance_7_cast_fp16")];
+            fp16 var_462_to_fp16 = const()[name = string("op_462_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 1, 1]> var_463_cast_fp16 = add(x = variance_7_cast_fp16, y = var_462_to_fp16)[name = string("op_463_cast_fp16")];
+            fp32 var_464_epsilon_0 = const()[name = string("op_464_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 1, 1, 1]> var_464_cast_fp16 = rsqrt(epsilon = var_464_epsilon_0, x = var_463_cast_fp16)[name = string("op_464_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> hidden_states_7_cast_fp16 = mul(x = inputs_7_cast_fp16, y = var_464_cast_fp16)[name = string("hidden_states_7_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> w_7_to_fp16 = const()[name = string("w_7_to_fp16"), val = tensor<fp16, [1, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(8411328)))];
+            tensor<fp16, [1, 1024, 1, 1]> input_3_cast_fp16 = mul(x = w_7_to_fp16, y = hidden_states_7_cast_fp16)[name = string("input_3_cast_fp16")];
+            string input_5_pad_type_0 = const()[name = string("input_5_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> input_5_strides_0 = const()[name = string("input_5_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> input_5_pad_0 = const()[name = string("input_5_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> input_5_dilations_0 = const()[name = string("input_5_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 input_5_groups_0 = const()[name = string("input_5_groups_0"), val = int32(1)];
+            tensor<fp16, [3072, 1024, 1, 1]> layers_0_mlp_gate_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [3072, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(8413440))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(11559232))))[name = string("layers_0_mlp_gate_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 3072, 1, 1]> input_5_cast_fp16 = conv(dilations = input_5_dilations_0, groups = input_5_groups_0, pad = input_5_pad_0, pad_type = input_5_pad_type_0, strides = input_5_strides_0, weight = layers_0_mlp_gate_proj_weight_to_fp16_palettized, x = input_3_cast_fp16)[name = string("input_5_cast_fp16")];
+            tensor<fp16, [1, 3072, 1, 1]> var_478_cast_fp16 = silu(x = input_5_cast_fp16)[name = string("op_478_cast_fp16")];
+            string var_484_pad_type_0 = const()[name = string("op_484_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_484_strides_0 = const()[name = string("op_484_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_484_pad_0 = const()[name = string("op_484_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_484_dilations_0 = const()[name = string("op_484_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_484_groups_0 = const()[name = string("op_484_groups_0"), val = int32(1)];
+            tensor<fp16, [3072, 1024, 1, 1]> layers_0_mlp_up_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [3072, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(11559808))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(14705600))))[name = string("layers_0_mlp_up_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 3072, 1, 1]> var_484_cast_fp16 = conv(dilations = var_484_dilations_0, groups = var_484_groups_0, pad = var_484_pad_0, pad_type = var_484_pad_type_0, strides = var_484_strides_0, weight = layers_0_mlp_up_proj_weight_to_fp16_palettized, x = input_3_cast_fp16)[name = string("op_484_cast_fp16")];
+            tensor<fp16, [1, 3072, 1, 1]> input_7_cast_fp16 = mul(x = var_478_cast_fp16, y = var_484_cast_fp16)[name = string("input_7_cast_fp16")];
+            string hidden_states_9_pad_type_0 = const()[name = string("hidden_states_9_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> hidden_states_9_strides_0 = const()[name = string("hidden_states_9_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> hidden_states_9_pad_0 = const()[name = string("hidden_states_9_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> hidden_states_9_dilations_0 = const()[name = string("hidden_states_9_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 hidden_states_9_groups_0 = const()[name = string("hidden_states_9_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 3072, 1, 1]> layers_0_mlp_down_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 3072, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(14706176))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(17851968))))[name = string("layers_0_mlp_down_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> hidden_states_9_cast_fp16 = conv(dilations = hidden_states_9_dilations_0, groups = hidden_states_9_groups_0, pad = hidden_states_9_pad_0, pad_type = hidden_states_9_pad_type_0, strides = hidden_states_9_strides_0, weight = layers_0_mlp_down_proj_weight_to_fp16_palettized, x = input_7_cast_fp16)[name = string("hidden_states_9_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> inputs_9_cast_fp16 = add(x = inputs_7_cast_fp16, y = hidden_states_9_cast_fp16)[name = string("inputs_9_cast_fp16")];
+            int32 var_498 = const()[name = string("op_498"), val = int32(3)];
+            int32 var_508 = const()[name = string("op_508"), val = int32(-2)];
+            int32 var_516 = const()[name = string("op_516"), val = int32(1)];
+            tensor<fp16, [1, 1024, 1, 1]> inputs_sq_9_cast_fp16 = mul(x = inputs_9_cast_fp16, y = inputs_9_cast_fp16)[name = string("inputs_sq_9_cast_fp16")];
+            tensor<int32, [1]> variance_9_axes_0 = const()[name = string("variance_9_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_9_keep_dims_0 = const()[name = string("variance_9_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 1, 1, 1]> variance_9_cast_fp16 = reduce_mean(axes = variance_9_axes_0, keep_dims = variance_9_keep_dims_0, x = inputs_sq_9_cast_fp16)[name = string("variance_9_cast_fp16")];
+            fp16 var_528_to_fp16 = const()[name = string("op_528_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 1, 1]> var_529_cast_fp16 = add(x = variance_9_cast_fp16, y = var_528_to_fp16)[name = string("op_529_cast_fp16")];
+            fp32 var_530_epsilon_0 = const()[name = string("op_530_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 1, 1, 1]> var_530_cast_fp16 = rsqrt(epsilon = var_530_epsilon_0, x = var_529_cast_fp16)[name = string("op_530_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> hidden_states_11_cast_fp16 = mul(x = inputs_9_cast_fp16, y = var_530_cast_fp16)[name = string("hidden_states_11_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> w_9_to_fp16 = const()[name = string("w_9_to_fp16"), val = tensor<fp16, [1, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(17852544)))];
+            tensor<fp16, [1, 1024, 1, 1]> obj_13_cast_fp16 = mul(x = w_9_to_fp16, y = hidden_states_11_cast_fp16)[name = string("obj_13_cast_fp16")];
+            string query_7_pad_type_0 = const()[name = string("query_7_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> query_7_strides_0 = const()[name = string("query_7_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> query_7_pad_0 = const()[name = string("query_7_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> query_7_dilations_0 = const()[name = string("query_7_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 query_7_groups_0 = const()[name = string("query_7_groups_0"), val = int32(1)];
+            tensor<fp16, [2048, 1024, 1, 1]> layers_1_self_attn_q_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [2048, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(17854656))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(19951872))))[name = string("layers_1_self_attn_q_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1, 1]> query_7_cast_fp16 = conv(bias = layers_0_self_attn_q_proj_bias_to_fp16, dilations = query_7_dilations_0, groups = query_7_groups_0, pad = query_7_pad_0, pad_type = query_7_pad_type_0, strides = query_7_strides_0, weight = layers_1_self_attn_q_proj_weight_to_fp16_palettized, x = obj_13_cast_fp16)[name = string("query_7_cast_fp16")];
+            string current_key_5_pad_type_0 = const()[name = string("current_key_5_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> current_key_5_strides_0 = const()[name = string("current_key_5_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> current_key_5_pad_0 = const()[name = string("current_key_5_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> current_key_5_dilations_0 = const()[name = string("current_key_5_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 current_key_5_groups_0 = const()[name = string("current_key_5_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_1_self_attn_k_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(19952448))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(21001088))))[name = string("layers_1_self_attn_k_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> current_key_5_cast_fp16 = conv(dilations = current_key_5_dilations_0, groups = current_key_5_groups_0, pad = current_key_5_pad_0, pad_type = current_key_5_pad_type_0, strides = current_key_5_strides_0, weight = layers_1_self_attn_k_proj_weight_to_fp16_palettized, x = obj_13_cast_fp16)[name = string("current_key_5_cast_fp16")];
+            string current_value_3_pad_type_0 = const()[name = string("current_value_3_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> current_value_3_strides_0 = const()[name = string("current_value_3_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> current_value_3_pad_0 = const()[name = string("current_value_3_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> current_value_3_dilations_0 = const()[name = string("current_value_3_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 current_value_3_groups_0 = const()[name = string("current_value_3_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_1_self_attn_v_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(21001664))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(22050304))))[name = string("layers_1_self_attn_v_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> current_value_3_cast_fp16 = conv(bias = layers_0_self_attn_v_proj_bias_to_fp16, dilations = current_value_3_dilations_0, groups = current_value_3_groups_0, pad = current_value_3_pad_0, pad_type = current_value_3_pad_type_0, strides = current_value_3_strides_0, weight = layers_1_self_attn_v_proj_weight_to_fp16_palettized, x = obj_13_cast_fp16)[name = string("current_value_3_cast_fp16")];
+            tensor<int32, [4]> var_567 = const()[name = string("op_567"), val = tensor<int32, [4]>([16, 128, 1, 1])];
+            tensor<fp16, [16, 128, 1, 1]> inputs_11_cast_fp16 = reshape(shape = var_567, x = query_7_cast_fp16)[name = string("inputs_11_cast_fp16")];
+            tensor<fp16, [16, 128, 1, 1]> inputs_sq_11_cast_fp16 = mul(x = inputs_11_cast_fp16, y = inputs_11_cast_fp16)[name = string("inputs_sq_11_cast_fp16")];
+            tensor<int32, [1]> variance_11_axes_0 = const()[name = string("variance_11_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_11_keep_dims_0 = const()[name = string("variance_11_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [16, 1, 1, 1]> variance_11_cast_fp16 = reduce_mean(axes = variance_11_axes_0, keep_dims = variance_11_keep_dims_0, x = inputs_sq_11_cast_fp16)[name = string("variance_11_cast_fp16")];
+            fp16 var_573_to_fp16 = const()[name = string("op_573_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [16, 1, 1, 1]> var_574_cast_fp16 = add(x = variance_11_cast_fp16, y = var_573_to_fp16)[name = string("op_574_cast_fp16")];
+            fp32 var_575_epsilon_0 = const()[name = string("op_575_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [16, 1, 1, 1]> var_575_cast_fp16 = rsqrt(epsilon = var_575_epsilon_0, x = var_574_cast_fp16)[name = string("op_575_cast_fp16")];
+            tensor<fp16, [16, 128, 1, 1]> hidden_states_13_cast_fp16 = mul(x = inputs_11_cast_fp16, y = var_575_cast_fp16)[name = string("hidden_states_13_cast_fp16")];
+            tensor<fp16, [1, 128, 1, 1]> w_11_to_fp16 = const()[name = string("w_11_to_fp16"), val = tensor<fp16, [1, 128, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(22050880)))];
+            tensor<fp16, [16, 128, 1, 1]> query_normed_3_cast_fp16 = mul(x = w_11_to_fp16, y = hidden_states_13_cast_fp16)[name = string("query_normed_3_cast_fp16")];
+            tensor<int32, [4]> var_583 = const()[name = string("op_583"), val = tensor<int32, [4]>([8, 128, 1, 1])];
+            tensor<fp16, [8, 128, 1, 1]> inputs_13_cast_fp16 = reshape(shape = var_583, x = current_key_5_cast_fp16)[name = string("inputs_13_cast_fp16")];
+            tensor<fp16, [8, 128, 1, 1]> inputs_sq_13_cast_fp16 = mul(x = inputs_13_cast_fp16, y = inputs_13_cast_fp16)[name = string("inputs_sq_13_cast_fp16")];
+            tensor<int32, [1]> variance_13_axes_0 = const()[name = string("variance_13_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_13_keep_dims_0 = const()[name = string("variance_13_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [8, 1, 1, 1]> variance_13_cast_fp16 = reduce_mean(axes = variance_13_axes_0, keep_dims = variance_13_keep_dims_0, x = inputs_sq_13_cast_fp16)[name = string("variance_13_cast_fp16")];
+            fp16 var_589_to_fp16 = const()[name = string("op_589_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [8, 1, 1, 1]> var_590_cast_fp16 = add(x = variance_13_cast_fp16, y = var_589_to_fp16)[name = string("op_590_cast_fp16")];
+            fp32 var_591_epsilon_0 = const()[name = string("op_591_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [8, 1, 1, 1]> var_591_cast_fp16 = rsqrt(epsilon = var_591_epsilon_0, x = var_590_cast_fp16)[name = string("op_591_cast_fp16")];
+            tensor<fp16, [8, 128, 1, 1]> hidden_states_15_cast_fp16 = mul(x = inputs_13_cast_fp16, y = var_591_cast_fp16)[name = string("hidden_states_15_cast_fp16")];
+            tensor<fp16, [1, 128, 1, 1]> w_13_to_fp16 = const()[name = string("w_13_to_fp16"), val = tensor<fp16, [1, 128, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(22051200)))];
+            tensor<fp16, [8, 128, 1, 1]> current_key_normed_3_cast_fp16 = mul(x = w_13_to_fp16, y = hidden_states_15_cast_fp16)[name = string("current_key_normed_3_cast_fp16")];
+            tensor<int32, [4]> var_609 = const()[name = string("op_609"), val = tensor<int32, [4]>([1, 16, 128, -1])];
+            tensor<fp16, [1, 16, 128, 1]> mh_q_7_cast_fp16 = reshape(shape = var_609, x = query_normed_3_cast_fp16)[name = string("mh_q_7_cast_fp16")];
+            tensor<int32, [4]> var_611 = const()[name = string("op_611"), val = tensor<int32, [4]>([1, 8, 128, -1])];
+            tensor<fp16, [1, 8, 128, 1]> mh_k_5_cast_fp16 = reshape(shape = var_611, x = current_key_normed_3_cast_fp16)[name = string("mh_k_5_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> var_615_cast_fp16 = mul(x = mh_q_7_cast_fp16, y = cos_1_cast_fp16)[name = string("op_615_cast_fp16")];
+            tensor<int32, [4]> var_620_begin_0 = const()[name = string("op_620_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_620_end_0 = const()[name = string("op_620_end_0"), val = tensor<int32, [4]>([1, 16, 64, 1])];
+            tensor<bool, [4]> var_620_end_mask_0 = const()[name = string("op_620_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 16, 64, 1]> var_620_cast_fp16 = slice_by_index(begin = var_620_begin_0, end = var_620_end_0, end_mask = var_620_end_mask_0, x = mh_q_7_cast_fp16)[name = string("op_620_cast_fp16")];
+            tensor<int32, [4]> var_626_begin_0 = const()[name = string("op_626_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_626_end_0 = const()[name = string("op_626_end_0"), val = tensor<int32, [4]>([1, 16, 128, 1])];
+            tensor<bool, [4]> var_626_end_mask_0 = const()[name = string("op_626_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 16, 64, 1]> var_626_cast_fp16 = slice_by_index(begin = var_626_begin_0, end = var_626_end_0, end_mask = var_626_end_mask_0, x = mh_q_7_cast_fp16)[name = string("op_626_cast_fp16")];
+            fp16 const_40_promoted_to_fp16 = const()[name = string("const_40_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 16, 64, 1]> var_628_cast_fp16 = mul(x = var_626_cast_fp16, y = const_40_promoted_to_fp16)[name = string("op_628_cast_fp16")];
+            bool var_630_interleave_0 = const()[name = string("op_630_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 1]> var_630_cast_fp16 = concat(axis = var_508, interleave = var_630_interleave_0, values = (var_628_cast_fp16, var_620_cast_fp16))[name = string("op_630_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> var_631_cast_fp16 = mul(x = var_630_cast_fp16, y = sin_1_cast_fp16)[name = string("op_631_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> mh_q_9_cast_fp16 = add(x = var_615_cast_fp16, y = var_631_cast_fp16)[name = string("mh_q_9_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> var_633_cast_fp16 = mul(x = mh_k_5_cast_fp16, y = cos_1_cast_fp16)[name = string("op_633_cast_fp16")];
+            tensor<int32, [4]> var_638_begin_0 = const()[name = string("op_638_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_638_end_0 = const()[name = string("op_638_end_0"), val = tensor<int32, [4]>([1, 8, 64, 1])];
+            tensor<bool, [4]> var_638_end_mask_0 = const()[name = string("op_638_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 8, 64, 1]> var_638_cast_fp16 = slice_by_index(begin = var_638_begin_0, end = var_638_end_0, end_mask = var_638_end_mask_0, x = mh_k_5_cast_fp16)[name = string("op_638_cast_fp16")];
+            tensor<int32, [4]> var_644_begin_0 = const()[name = string("op_644_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_644_end_0 = const()[name = string("op_644_end_0"), val = tensor<int32, [4]>([1, 8, 128, 1])];
+            tensor<bool, [4]> var_644_end_mask_0 = const()[name = string("op_644_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 8, 64, 1]> var_644_cast_fp16 = slice_by_index(begin = var_644_begin_0, end = var_644_end_0, end_mask = var_644_end_mask_0, x = mh_k_5_cast_fp16)[name = string("op_644_cast_fp16")];
+            fp16 const_43_promoted_to_fp16 = const()[name = string("const_43_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 64, 1]> var_646_cast_fp16 = mul(x = var_644_cast_fp16, y = const_43_promoted_to_fp16)[name = string("op_646_cast_fp16")];
+            bool var_648_interleave_0 = const()[name = string("op_648_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 128, 1]> var_648_cast_fp16 = concat(axis = var_508, interleave = var_648_interleave_0, values = (var_646_cast_fp16, var_638_cast_fp16))[name = string("op_648_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> var_649_cast_fp16 = mul(x = var_648_cast_fp16, y = sin_1_cast_fp16)[name = string("op_649_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> mh_k_7_cast_fp16 = add(x = var_633_cast_fp16, y = var_649_cast_fp16)[name = string("mh_k_7_cast_fp16")];
+            tensor<int32, [4]> var_653 = const()[name = string("op_653"), val = tensor<int32, [4]>([1, 1024, 1, 1])];
+            tensor<fp16, [1, 1024, 1, 1]> current_key_7_cast_fp16 = reshape(shape = var_653, x = mh_k_7_cast_fp16)[name = string("current_key_7_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 16]> var_660_cast_fp16 = mul(x = var_96_cast_fp16_1, y = var_272_cast_fp16)[name = string("op_660_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 16]> var_661_cast_fp16 = mul(x = current_key_7_cast_fp16, y = var_270_cast_fp16)[name = string("op_661_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 16]> key_9_cast_fp16 = add(x = var_660_cast_fp16, y = var_661_cast_fp16)[name = string("key_9_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 16]> var_664_cast_fp16 = mul(x = var_104_cast_fp16_1, y = var_272_cast_fp16)[name = string("op_664_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 16]> var_665_cast_fp16 = mul(x = current_value_3_cast_fp16, y = var_270_cast_fp16)[name = string("op_665_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 16]> value_5_cast_fp16 = add(x = var_664_cast_fp16, y = var_665_cast_fp16)[name = string("value_5_cast_fp16")];
+            tensor<int32, [4]> var_669 = const()[name = string("op_669"), val = tensor<int32, [4]>([1, 8, 128, 16])];
+            tensor<fp16, [1, 8, 128, 16]> key_heads_5_cast_fp16 = reshape(shape = var_669, x = key_9_cast_fp16)[name = string("key_heads_5_cast_fp16")];
+            tensor<int32, [4]> var_671 = const()[name = string("op_671"), val = tensor<int32, [4]>([1, 8, 128, 16])];
+            tensor<fp16, [1, 8, 128, 16]> value_heads_5_cast_fp16 = reshape(shape = var_671, x = value_5_cast_fp16)[name = string("value_heads_5_cast_fp16")];
+            tensor<int32, [4]> var_674_begin_0 = const()[name = string("op_674_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_674_end_0 = const()[name = string("op_674_end_0"), val = tensor<int32, [4]>([1, 1, 128, 16])];
+            tensor<bool, [4]> var_674_end_mask_0 = const()[name = string("op_674_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_674_cast_fp16 = slice_by_index(begin = var_674_begin_0, end = var_674_end_0, end_mask = var_674_end_mask_0, x = key_heads_5_cast_fp16)[name = string("op_674_cast_fp16")];
+            tensor<int32, [4]> var_678_begin_0 = const()[name = string("op_678_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_678_end_0 = const()[name = string("op_678_end_0"), val = tensor<int32, [4]>([1, 1, 128, 16])];
+            tensor<bool, [4]> var_678_end_mask_0 = const()[name = string("op_678_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_678_cast_fp16 = slice_by_index(begin = var_678_begin_0, end = var_678_end_0, end_mask = var_678_end_mask_0, x = value_heads_5_cast_fp16)[name = string("op_678_cast_fp16")];
+            tensor<int32, [4]> var_690_begin_0 = const()[name = string("op_690_begin_0"), val = tensor<int32, [4]>([0, 1, 0, 0])];
+            tensor<int32, [4]> var_690_end_0 = const()[name = string("op_690_end_0"), val = tensor<int32, [4]>([1, 2, 128, 16])];
+            tensor<bool, [4]> var_690_end_mask_0 = const()[name = string("op_690_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_690_cast_fp16 = slice_by_index(begin = var_690_begin_0, end = var_690_end_0, end_mask = var_690_end_mask_0, x = key_heads_5_cast_fp16)[name = string("op_690_cast_fp16")];
+            tensor<int32, [4]> var_694_begin_0 = const()[name = string("op_694_begin_0"), val = tensor<int32, [4]>([0, 1, 0, 0])];
+            tensor<int32, [4]> var_694_end_0 = const()[name = string("op_694_end_0"), val = tensor<int32, [4]>([1, 2, 128, 16])];
+            tensor<bool, [4]> var_694_end_mask_0 = const()[name = string("op_694_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_694_cast_fp16 = slice_by_index(begin = var_694_begin_0, end = var_694_end_0, end_mask = var_694_end_mask_0, x = value_heads_5_cast_fp16)[name = string("op_694_cast_fp16")];
+            tensor<int32, [4]> var_706_begin_0 = const()[name = string("op_706_begin_0"), val = tensor<int32, [4]>([0, 2, 0, 0])];
+            tensor<int32, [4]> var_706_end_0 = const()[name = string("op_706_end_0"), val = tensor<int32, [4]>([1, 3, 128, 16])];
+            tensor<bool, [4]> var_706_end_mask_0 = const()[name = string("op_706_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_706_cast_fp16 = slice_by_index(begin = var_706_begin_0, end = var_706_end_0, end_mask = var_706_end_mask_0, x = key_heads_5_cast_fp16)[name = string("op_706_cast_fp16")];
+            tensor<int32, [4]> var_710_begin_0 = const()[name = string("op_710_begin_0"), val = tensor<int32, [4]>([0, 2, 0, 0])];
+            tensor<int32, [4]> var_710_end_0 = const()[name = string("op_710_end_0"), val = tensor<int32, [4]>([1, 3, 128, 16])];
+            tensor<bool, [4]> var_710_end_mask_0 = const()[name = string("op_710_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_710_cast_fp16 = slice_by_index(begin = var_710_begin_0, end = var_710_end_0, end_mask = var_710_end_mask_0, x = value_heads_5_cast_fp16)[name = string("op_710_cast_fp16")];
+            tensor<int32, [4]> var_722_begin_0 = const()[name = string("op_722_begin_0"), val = tensor<int32, [4]>([0, 3, 0, 0])];
+            tensor<int32, [4]> var_722_end_0 = const()[name = string("op_722_end_0"), val = tensor<int32, [4]>([1, 4, 128, 16])];
+            tensor<bool, [4]> var_722_end_mask_0 = const()[name = string("op_722_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_722_cast_fp16 = slice_by_index(begin = var_722_begin_0, end = var_722_end_0, end_mask = var_722_end_mask_0, x = key_heads_5_cast_fp16)[name = string("op_722_cast_fp16")];
+            tensor<int32, [4]> var_726_begin_0 = const()[name = string("op_726_begin_0"), val = tensor<int32, [4]>([0, 3, 0, 0])];
+            tensor<int32, [4]> var_726_end_0 = const()[name = string("op_726_end_0"), val = tensor<int32, [4]>([1, 4, 128, 16])];
+            tensor<bool, [4]> var_726_end_mask_0 = const()[name = string("op_726_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_726_cast_fp16 = slice_by_index(begin = var_726_begin_0, end = var_726_end_0, end_mask = var_726_end_mask_0, x = value_heads_5_cast_fp16)[name = string("op_726_cast_fp16")];
+            tensor<int32, [4]> var_738_begin_0 = const()[name = string("op_738_begin_0"), val = tensor<int32, [4]>([0, 4, 0, 0])];
+            tensor<int32, [4]> var_738_end_0 = const()[name = string("op_738_end_0"), val = tensor<int32, [4]>([1, 5, 128, 16])];
+            tensor<bool, [4]> var_738_end_mask_0 = const()[name = string("op_738_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_738_cast_fp16 = slice_by_index(begin = var_738_begin_0, end = var_738_end_0, end_mask = var_738_end_mask_0, x = key_heads_5_cast_fp16)[name = string("op_738_cast_fp16")];
+            tensor<int32, [4]> var_742_begin_0 = const()[name = string("op_742_begin_0"), val = tensor<int32, [4]>([0, 4, 0, 0])];
+            tensor<int32, [4]> var_742_end_0 = const()[name = string("op_742_end_0"), val = tensor<int32, [4]>([1, 5, 128, 16])];
+            tensor<bool, [4]> var_742_end_mask_0 = const()[name = string("op_742_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_742_cast_fp16 = slice_by_index(begin = var_742_begin_0, end = var_742_end_0, end_mask = var_742_end_mask_0, x = value_heads_5_cast_fp16)[name = string("op_742_cast_fp16")];
+            tensor<int32, [4]> var_754_begin_0 = const()[name = string("op_754_begin_0"), val = tensor<int32, [4]>([0, 5, 0, 0])];
+            tensor<int32, [4]> var_754_end_0 = const()[name = string("op_754_end_0"), val = tensor<int32, [4]>([1, 6, 128, 16])];
+            tensor<bool, [4]> var_754_end_mask_0 = const()[name = string("op_754_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_754_cast_fp16 = slice_by_index(begin = var_754_begin_0, end = var_754_end_0, end_mask = var_754_end_mask_0, x = key_heads_5_cast_fp16)[name = string("op_754_cast_fp16")];
+            tensor<int32, [4]> var_758_begin_0 = const()[name = string("op_758_begin_0"), val = tensor<int32, [4]>([0, 5, 0, 0])];
+            tensor<int32, [4]> var_758_end_0 = const()[name = string("op_758_end_0"), val = tensor<int32, [4]>([1, 6, 128, 16])];
+            tensor<bool, [4]> var_758_end_mask_0 = const()[name = string("op_758_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_758_cast_fp16 = slice_by_index(begin = var_758_begin_0, end = var_758_end_0, end_mask = var_758_end_mask_0, x = value_heads_5_cast_fp16)[name = string("op_758_cast_fp16")];
+            tensor<int32, [4]> var_770_begin_0 = const()[name = string("op_770_begin_0"), val = tensor<int32, [4]>([0, 6, 0, 0])];
+            tensor<int32, [4]> var_770_end_0 = const()[name = string("op_770_end_0"), val = tensor<int32, [4]>([1, 7, 128, 16])];
+            tensor<bool, [4]> var_770_end_mask_0 = const()[name = string("op_770_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_770_cast_fp16 = slice_by_index(begin = var_770_begin_0, end = var_770_end_0, end_mask = var_770_end_mask_0, x = key_heads_5_cast_fp16)[name = string("op_770_cast_fp16")];
+            tensor<int32, [4]> var_774_begin_0 = const()[name = string("op_774_begin_0"), val = tensor<int32, [4]>([0, 6, 0, 0])];
+            tensor<int32, [4]> var_774_end_0 = const()[name = string("op_774_end_0"), val = tensor<int32, [4]>([1, 7, 128, 16])];
+            tensor<bool, [4]> var_774_end_mask_0 = const()[name = string("op_774_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_774_cast_fp16 = slice_by_index(begin = var_774_begin_0, end = var_774_end_0, end_mask = var_774_end_mask_0, x = value_heads_5_cast_fp16)[name = string("op_774_cast_fp16")];
+            tensor<int32, [4]> var_786_begin_0 = const()[name = string("op_786_begin_0"), val = tensor<int32, [4]>([0, 7, 0, 0])];
+            tensor<int32, [4]> var_786_end_0 = const()[name = string("op_786_end_0"), val = tensor<int32, [4]>([1, 1, 128, 16])];
+            tensor<bool, [4]> var_786_end_mask_0 = const()[name = string("op_786_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_786_cast_fp16 = slice_by_index(begin = var_786_begin_0, end = var_786_end_0, end_mask = var_786_end_mask_0, x = key_heads_5_cast_fp16)[name = string("op_786_cast_fp16")];
+            tensor<int32, [4]> var_790_begin_0 = const()[name = string("op_790_begin_0"), val = tensor<int32, [4]>([0, 7, 0, 0])];
+            tensor<int32, [4]> var_790_end_0 = const()[name = string("op_790_end_0"), val = tensor<int32, [4]>([1, 1, 128, 16])];
+            tensor<bool, [4]> var_790_end_mask_0 = const()[name = string("op_790_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_790_cast_fp16 = slice_by_index(begin = var_790_begin_0, end = var_790_end_0, end_mask = var_790_end_mask_0, x = value_heads_5_cast_fp16)[name = string("op_790_cast_fp16")];
+            bool key_heads_7_interleave_0 = const()[name = string("key_heads_7_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 16]> key_heads_7_cast_fp16 = concat(axis = var_516, interleave = key_heads_7_interleave_0, values = (var_674_cast_fp16, var_674_cast_fp16, var_690_cast_fp16, var_690_cast_fp16, var_706_cast_fp16, var_706_cast_fp16, var_722_cast_fp16, var_722_cast_fp16, var_738_cast_fp16, var_738_cast_fp16, var_754_cast_fp16, var_754_cast_fp16, var_770_cast_fp16, var_770_cast_fp16, var_786_cast_fp16, var_786_cast_fp16))[name = string("key_heads_7_cast_fp16")];
+            bool value_heads_7_interleave_0 = const()[name = string("value_heads_7_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 16]> value_heads_7_cast_fp16 = concat(axis = var_516, interleave = value_heads_7_interleave_0, values = (var_678_cast_fp16, var_678_cast_fp16, var_694_cast_fp16, var_694_cast_fp16, var_710_cast_fp16, var_710_cast_fp16, var_726_cast_fp16, var_726_cast_fp16, var_742_cast_fp16, var_742_cast_fp16, var_758_cast_fp16, var_758_cast_fp16, var_774_cast_fp16, var_774_cast_fp16, var_790_cast_fp16, var_790_cast_fp16))[name = string("value_heads_7_cast_fp16")];
+            fp16 var_813_to_fp16 = const()[name = string("op_813_to_fp16"), val = fp16(0x1.6ap-4)];
+            tensor<fp16, [1, 16, 128, 1]> var_814_cast_fp16 = mul(x = mh_q_9_cast_fp16, y = var_813_to_fp16)[name = string("op_814_cast_fp16")];
+            bool mh_w_5_transpose_x_0 = const()[name = string("mh_w_5_transpose_x_0"), val = bool(true)];
+            bool mh_w_5_transpose_y_0 = const()[name = string("mh_w_5_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 1, 16]> mh_w_5_cast_fp16 = matmul(transpose_x = mh_w_5_transpose_x_0, transpose_y = mh_w_5_transpose_y_0, x = var_814_cast_fp16, y = key_heads_7_cast_fp16)[name = string("mh_w_5_cast_fp16")];
+            tensor<fp16, [1, 16, 1, 16]> mh_w_7_cast_fp16 = add(x = mh_w_5_cast_fp16, y = var_436_cast_fp16)[name = string("mh_w_7_cast_fp16")];
+            tensor<fp16, [1, 16, 1, 16]> var_826_cast_fp16 = softmax(axis = var_498, x = mh_w_7_cast_fp16)[name = string("op_826_cast_fp16")];
+            bool attn_3_transpose_x_0 = const()[name = string("attn_3_transpose_x_0"), val = bool(false)];
+            bool attn_3_transpose_y_0 = const()[name = string("attn_3_transpose_y_0"), val = bool(true)];
+            tensor<fp16, [1, 16, 128, 1]> attn_3_cast_fp16 = matmul(transpose_x = attn_3_transpose_x_0, transpose_y = attn_3_transpose_y_0, x = value_heads_7_cast_fp16, y = var_826_cast_fp16)[name = string("attn_3_cast_fp16")];
+            tensor<int32, [4]> var_831 = const()[name = string("op_831"), val = tensor<int32, [4]>([1, -1, 1, 1])];
+            tensor<fp16, [1, 2048, 1, 1]> input_9_cast_fp16 = reshape(shape = var_831, x = attn_3_cast_fp16)[name = string("input_9_cast_fp16")];
+            string obj_19_pad_type_0 = const()[name = string("obj_19_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> obj_19_strides_0 = const()[name = string("obj_19_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> obj_19_pad_0 = const()[name = string("obj_19_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> obj_19_dilations_0 = const()[name = string("obj_19_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 obj_19_groups_0 = const()[name = string("obj_19_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 2048, 1, 1]> layers_1_self_attn_o_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(22051520))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(24148736))))[name = string("layers_1_self_attn_o_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> obj_19_cast_fp16 = conv(bias = layers_0_self_attn_v_proj_bias_to_fp16, dilations = obj_19_dilations_0, groups = obj_19_groups_0, pad = obj_19_pad_0, pad_type = obj_19_pad_type_0, strides = obj_19_strides_0, weight = layers_1_self_attn_o_proj_weight_to_fp16_palettized, x = input_9_cast_fp16)[name = string("obj_19_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> inputs_15_cast_fp16 = add(x = inputs_9_cast_fp16, y = obj_19_cast_fp16)[name = string("inputs_15_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> inputs_sq_15_cast_fp16 = mul(x = inputs_15_cast_fp16, y = inputs_15_cast_fp16)[name = string("inputs_sq_15_cast_fp16")];
+            tensor<int32, [1]> variance_15_axes_0 = const()[name = string("variance_15_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_15_keep_dims_0 = const()[name = string("variance_15_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 1, 1, 1]> variance_15_cast_fp16 = reduce_mean(axes = variance_15_axes_0, keep_dims = variance_15_keep_dims_0, x = inputs_sq_15_cast_fp16)[name = string("variance_15_cast_fp16")];
+            fp16 var_849_to_fp16 = const()[name = string("op_849_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 1, 1]> var_850_cast_fp16 = add(x = variance_15_cast_fp16, y = var_849_to_fp16)[name = string("op_850_cast_fp16")];
+            fp32 var_851_epsilon_0 = const()[name = string("op_851_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 1, 1, 1]> var_851_cast_fp16 = rsqrt(epsilon = var_851_epsilon_0, x = var_850_cast_fp16)[name = string("op_851_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> hidden_states_17_cast_fp16 = mul(x = inputs_15_cast_fp16, y = var_851_cast_fp16)[name = string("hidden_states_17_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> w_15_to_fp16 = const()[name = string("w_15_to_fp16"), val = tensor<fp16, [1, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(24149312)))];
+            tensor<fp16, [1, 1024, 1, 1]> input_11_cast_fp16 = mul(x = w_15_to_fp16, y = hidden_states_17_cast_fp16)[name = string("input_11_cast_fp16")];
+            string input_13_pad_type_0 = const()[name = string("input_13_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> input_13_strides_0 = const()[name = string("input_13_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> input_13_pad_0 = const()[name = string("input_13_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> input_13_dilations_0 = const()[name = string("input_13_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 input_13_groups_0 = const()[name = string("input_13_groups_0"), val = int32(1)];
+            tensor<fp16, [3072, 1024, 1, 1]> layers_1_mlp_gate_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [3072, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(24151424))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(27297216))))[name = string("layers_1_mlp_gate_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 3072, 1, 1]> input_13_cast_fp16 = conv(dilations = input_13_dilations_0, groups = input_13_groups_0, pad = input_13_pad_0, pad_type = input_13_pad_type_0, strides = input_13_strides_0, weight = layers_1_mlp_gate_proj_weight_to_fp16_palettized, x = input_11_cast_fp16)[name = string("input_13_cast_fp16")];
+            tensor<fp16, [1, 3072, 1, 1]> var_865_cast_fp16 = silu(x = input_13_cast_fp16)[name = string("op_865_cast_fp16")];
+            string var_871_pad_type_0 = const()[name = string("op_871_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_871_strides_0 = const()[name = string("op_871_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_871_pad_0 = const()[name = string("op_871_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_871_dilations_0 = const()[name = string("op_871_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_871_groups_0 = const()[name = string("op_871_groups_0"), val = int32(1)];
+            tensor<fp16, [3072, 1024, 1, 1]> layers_1_mlp_up_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [3072, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(27297792))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(30443584))))[name = string("layers_1_mlp_up_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 3072, 1, 1]> var_871_cast_fp16 = conv(dilations = var_871_dilations_0, groups = var_871_groups_0, pad = var_871_pad_0, pad_type = var_871_pad_type_0, strides = var_871_strides_0, weight = layers_1_mlp_up_proj_weight_to_fp16_palettized, x = input_11_cast_fp16)[name = string("op_871_cast_fp16")];
+            tensor<fp16, [1, 3072, 1, 1]> input_15_cast_fp16 = mul(x = var_865_cast_fp16, y = var_871_cast_fp16)[name = string("input_15_cast_fp16")];
+            string hidden_states_19_pad_type_0 = const()[name = string("hidden_states_19_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> hidden_states_19_strides_0 = const()[name = string("hidden_states_19_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> hidden_states_19_pad_0 = const()[name = string("hidden_states_19_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> hidden_states_19_dilations_0 = const()[name = string("hidden_states_19_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 hidden_states_19_groups_0 = const()[name = string("hidden_states_19_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 3072, 1, 1]> layers_1_mlp_down_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 3072, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(30444160))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(33589952))))[name = string("layers_1_mlp_down_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> hidden_states_19_cast_fp16 = conv(dilations = hidden_states_19_dilations_0, groups = hidden_states_19_groups_0, pad = hidden_states_19_pad_0, pad_type = hidden_states_19_pad_type_0, strides = hidden_states_19_strides_0, weight = layers_1_mlp_down_proj_weight_to_fp16_palettized, x = input_15_cast_fp16)[name = string("hidden_states_19_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> inputs_17_cast_fp16 = add(x = inputs_15_cast_fp16, y = hidden_states_19_cast_fp16)[name = string("inputs_17_cast_fp16")];
+            int32 var_885 = const()[name = string("op_885"), val = int32(3)];
+            int32 var_895 = const()[name = string("op_895"), val = int32(-2)];
+            int32 var_903 = const()[name = string("op_903"), val = int32(1)];
+            tensor<fp16, [1, 1024, 1, 1]> inputs_sq_17_cast_fp16 = mul(x = inputs_17_cast_fp16, y = inputs_17_cast_fp16)[name = string("inputs_sq_17_cast_fp16")];
+            tensor<int32, [1]> variance_17_axes_0 = const()[name = string("variance_17_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_17_keep_dims_0 = const()[name = string("variance_17_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 1, 1, 1]> variance_17_cast_fp16 = reduce_mean(axes = variance_17_axes_0, keep_dims = variance_17_keep_dims_0, x = inputs_sq_17_cast_fp16)[name = string("variance_17_cast_fp16")];
+            fp16 var_915_to_fp16 = const()[name = string("op_915_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 1, 1]> var_916_cast_fp16 = add(x = variance_17_cast_fp16, y = var_915_to_fp16)[name = string("op_916_cast_fp16")];
+            fp32 var_917_epsilon_0 = const()[name = string("op_917_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 1, 1, 1]> var_917_cast_fp16 = rsqrt(epsilon = var_917_epsilon_0, x = var_916_cast_fp16)[name = string("op_917_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> hidden_states_21_cast_fp16 = mul(x = inputs_17_cast_fp16, y = var_917_cast_fp16)[name = string("hidden_states_21_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> w_17_to_fp16 = const()[name = string("w_17_to_fp16"), val = tensor<fp16, [1, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(33590528)))];
+            tensor<fp16, [1, 1024, 1, 1]> obj_21_cast_fp16 = mul(x = w_17_to_fp16, y = hidden_states_21_cast_fp16)[name = string("obj_21_cast_fp16")];
+            string query_13_pad_type_0 = const()[name = string("query_13_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> query_13_strides_0 = const()[name = string("query_13_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> query_13_pad_0 = const()[name = string("query_13_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> query_13_dilations_0 = const()[name = string("query_13_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 query_13_groups_0 = const()[name = string("query_13_groups_0"), val = int32(1)];
+            tensor<fp16, [2048, 1024, 1, 1]> layers_2_self_attn_q_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [2048, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(33592640))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(35689856))))[name = string("layers_2_self_attn_q_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1, 1]> query_13_cast_fp16 = conv(bias = layers_0_self_attn_q_proj_bias_to_fp16, dilations = query_13_dilations_0, groups = query_13_groups_0, pad = query_13_pad_0, pad_type = query_13_pad_type_0, strides = query_13_strides_0, weight = layers_2_self_attn_q_proj_weight_to_fp16_palettized, x = obj_21_cast_fp16)[name = string("query_13_cast_fp16")];
+            string current_key_9_pad_type_0 = const()[name = string("current_key_9_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> current_key_9_strides_0 = const()[name = string("current_key_9_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> current_key_9_pad_0 = const()[name = string("current_key_9_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> current_key_9_dilations_0 = const()[name = string("current_key_9_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 current_key_9_groups_0 = const()[name = string("current_key_9_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_2_self_attn_k_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(35690432))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(36739072))))[name = string("layers_2_self_attn_k_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> current_key_9_cast_fp16 = conv(dilations = current_key_9_dilations_0, groups = current_key_9_groups_0, pad = current_key_9_pad_0, pad_type = current_key_9_pad_type_0, strides = current_key_9_strides_0, weight = layers_2_self_attn_k_proj_weight_to_fp16_palettized, x = obj_21_cast_fp16)[name = string("current_key_9_cast_fp16")];
+            string current_value_5_pad_type_0 = const()[name = string("current_value_5_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> current_value_5_strides_0 = const()[name = string("current_value_5_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> current_value_5_pad_0 = const()[name = string("current_value_5_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> current_value_5_dilations_0 = const()[name = string("current_value_5_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 current_value_5_groups_0 = const()[name = string("current_value_5_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_2_self_attn_v_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(36739648))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(37788288))))[name = string("layers_2_self_attn_v_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> current_value_5_cast_fp16 = conv(bias = layers_0_self_attn_v_proj_bias_to_fp16, dilations = current_value_5_dilations_0, groups = current_value_5_groups_0, pad = current_value_5_pad_0, pad_type = current_value_5_pad_type_0, strides = current_value_5_strides_0, weight = layers_2_self_attn_v_proj_weight_to_fp16_palettized, x = obj_21_cast_fp16)[name = string("current_value_5_cast_fp16")];
+            tensor<int32, [4]> var_954 = const()[name = string("op_954"), val = tensor<int32, [4]>([16, 128, 1, 1])];
+            tensor<fp16, [16, 128, 1, 1]> inputs_19_cast_fp16 = reshape(shape = var_954, x = query_13_cast_fp16)[name = string("inputs_19_cast_fp16")];
+            tensor<fp16, [16, 128, 1, 1]> inputs_sq_19_cast_fp16 = mul(x = inputs_19_cast_fp16, y = inputs_19_cast_fp16)[name = string("inputs_sq_19_cast_fp16")];
+            tensor<int32, [1]> variance_19_axes_0 = const()[name = string("variance_19_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_19_keep_dims_0 = const()[name = string("variance_19_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [16, 1, 1, 1]> variance_19_cast_fp16 = reduce_mean(axes = variance_19_axes_0, keep_dims = variance_19_keep_dims_0, x = inputs_sq_19_cast_fp16)[name = string("variance_19_cast_fp16")];
+            fp16 var_960_to_fp16 = const()[name = string("op_960_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [16, 1, 1, 1]> var_961_cast_fp16 = add(x = variance_19_cast_fp16, y = var_960_to_fp16)[name = string("op_961_cast_fp16")];
+            fp32 var_962_epsilon_0 = const()[name = string("op_962_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [16, 1, 1, 1]> var_962_cast_fp16 = rsqrt(epsilon = var_962_epsilon_0, x = var_961_cast_fp16)[name = string("op_962_cast_fp16")];
+            tensor<fp16, [16, 128, 1, 1]> hidden_states_23_cast_fp16 = mul(x = inputs_19_cast_fp16, y = var_962_cast_fp16)[name = string("hidden_states_23_cast_fp16")];
+            tensor<fp16, [1, 128, 1, 1]> w_19_to_fp16 = const()[name = string("w_19_to_fp16"), val = tensor<fp16, [1, 128, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(37788864)))];
+            tensor<fp16, [16, 128, 1, 1]> query_normed_5_cast_fp16 = mul(x = w_19_to_fp16, y = hidden_states_23_cast_fp16)[name = string("query_normed_5_cast_fp16")];
+            tensor<int32, [4]> var_970 = const()[name = string("op_970"), val = tensor<int32, [4]>([8, 128, 1, 1])];
+            tensor<fp16, [8, 128, 1, 1]> inputs_21_cast_fp16 = reshape(shape = var_970, x = current_key_9_cast_fp16)[name = string("inputs_21_cast_fp16")];
+            tensor<fp16, [8, 128, 1, 1]> inputs_sq_21_cast_fp16 = mul(x = inputs_21_cast_fp16, y = inputs_21_cast_fp16)[name = string("inputs_sq_21_cast_fp16")];
+            tensor<int32, [1]> variance_21_axes_0 = const()[name = string("variance_21_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_21_keep_dims_0 = const()[name = string("variance_21_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [8, 1, 1, 1]> variance_21_cast_fp16 = reduce_mean(axes = variance_21_axes_0, keep_dims = variance_21_keep_dims_0, x = inputs_sq_21_cast_fp16)[name = string("variance_21_cast_fp16")];
+            fp16 var_976_to_fp16 = const()[name = string("op_976_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [8, 1, 1, 1]> var_977_cast_fp16 = add(x = variance_21_cast_fp16, y = var_976_to_fp16)[name = string("op_977_cast_fp16")];
+            fp32 var_978_epsilon_0 = const()[name = string("op_978_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [8, 1, 1, 1]> var_978_cast_fp16 = rsqrt(epsilon = var_978_epsilon_0, x = var_977_cast_fp16)[name = string("op_978_cast_fp16")];
+            tensor<fp16, [8, 128, 1, 1]> hidden_states_25_cast_fp16 = mul(x = inputs_21_cast_fp16, y = var_978_cast_fp16)[name = string("hidden_states_25_cast_fp16")];
+            tensor<fp16, [1, 128, 1, 1]> w_21_to_fp16 = const()[name = string("w_21_to_fp16"), val = tensor<fp16, [1, 128, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(37789184)))];
+            tensor<fp16, [8, 128, 1, 1]> current_key_normed_5_cast_fp16 = mul(x = w_21_to_fp16, y = hidden_states_25_cast_fp16)[name = string("current_key_normed_5_cast_fp16")];
+            tensor<int32, [4]> var_996 = const()[name = string("op_996"), val = tensor<int32, [4]>([1, 16, 128, -1])];
+            tensor<fp16, [1, 16, 128, 1]> mh_q_13_cast_fp16 = reshape(shape = var_996, x = query_normed_5_cast_fp16)[name = string("mh_q_13_cast_fp16")];
+            tensor<int32, [4]> var_998 = const()[name = string("op_998"), val = tensor<int32, [4]>([1, 8, 128, -1])];
+            tensor<fp16, [1, 8, 128, 1]> mh_k_9_cast_fp16 = reshape(shape = var_998, x = current_key_normed_5_cast_fp16)[name = string("mh_k_9_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> var_1002_cast_fp16 = mul(x = mh_q_13_cast_fp16, y = cos_1_cast_fp16)[name = string("op_1002_cast_fp16")];
+            tensor<int32, [4]> var_1007_begin_0 = const()[name = string("op_1007_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_1007_end_0 = const()[name = string("op_1007_end_0"), val = tensor<int32, [4]>([1, 16, 64, 1])];
+            tensor<bool, [4]> var_1007_end_mask_0 = const()[name = string("op_1007_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 16, 64, 1]> var_1007_cast_fp16 = slice_by_index(begin = var_1007_begin_0, end = var_1007_end_0, end_mask = var_1007_end_mask_0, x = mh_q_13_cast_fp16)[name = string("op_1007_cast_fp16")];
+            tensor<int32, [4]> var_1013_begin_0 = const()[name = string("op_1013_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_1013_end_0 = const()[name = string("op_1013_end_0"), val = tensor<int32, [4]>([1, 16, 128, 1])];
+            tensor<bool, [4]> var_1013_end_mask_0 = const()[name = string("op_1013_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 16, 64, 1]> var_1013_cast_fp16 = slice_by_index(begin = var_1013_begin_0, end = var_1013_end_0, end_mask = var_1013_end_mask_0, x = mh_q_13_cast_fp16)[name = string("op_1013_cast_fp16")];
+            fp16 const_63_promoted_to_fp16 = const()[name = string("const_63_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 16, 64, 1]> var_1015_cast_fp16 = mul(x = var_1013_cast_fp16, y = const_63_promoted_to_fp16)[name = string("op_1015_cast_fp16")];
+            bool var_1017_interleave_0 = const()[name = string("op_1017_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 1]> var_1017_cast_fp16 = concat(axis = var_895, interleave = var_1017_interleave_0, values = (var_1015_cast_fp16, var_1007_cast_fp16))[name = string("op_1017_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> var_1018_cast_fp16 = mul(x = var_1017_cast_fp16, y = sin_1_cast_fp16)[name = string("op_1018_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> mh_q_15_cast_fp16 = add(x = var_1002_cast_fp16, y = var_1018_cast_fp16)[name = string("mh_q_15_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> var_1020_cast_fp16 = mul(x = mh_k_9_cast_fp16, y = cos_1_cast_fp16)[name = string("op_1020_cast_fp16")];
+            tensor<int32, [4]> var_1025_begin_0 = const()[name = string("op_1025_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_1025_end_0 = const()[name = string("op_1025_end_0"), val = tensor<int32, [4]>([1, 8, 64, 1])];
+            tensor<bool, [4]> var_1025_end_mask_0 = const()[name = string("op_1025_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 8, 64, 1]> var_1025_cast_fp16 = slice_by_index(begin = var_1025_begin_0, end = var_1025_end_0, end_mask = var_1025_end_mask_0, x = mh_k_9_cast_fp16)[name = string("op_1025_cast_fp16")];
+            tensor<int32, [4]> var_1031_begin_0 = const()[name = string("op_1031_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_1031_end_0 = const()[name = string("op_1031_end_0"), val = tensor<int32, [4]>([1, 8, 128, 1])];
+            tensor<bool, [4]> var_1031_end_mask_0 = const()[name = string("op_1031_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 8, 64, 1]> var_1031_cast_fp16 = slice_by_index(begin = var_1031_begin_0, end = var_1031_end_0, end_mask = var_1031_end_mask_0, x = mh_k_9_cast_fp16)[name = string("op_1031_cast_fp16")];
+            fp16 const_66_promoted_to_fp16 = const()[name = string("const_66_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 64, 1]> var_1033_cast_fp16 = mul(x = var_1031_cast_fp16, y = const_66_promoted_to_fp16)[name = string("op_1033_cast_fp16")];
+            bool var_1035_interleave_0 = const()[name = string("op_1035_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 128, 1]> var_1035_cast_fp16 = concat(axis = var_895, interleave = var_1035_interleave_0, values = (var_1033_cast_fp16, var_1025_cast_fp16))[name = string("op_1035_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> var_1036_cast_fp16 = mul(x = var_1035_cast_fp16, y = sin_1_cast_fp16)[name = string("op_1036_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> mh_k_11_cast_fp16 = add(x = var_1020_cast_fp16, y = var_1036_cast_fp16)[name = string("mh_k_11_cast_fp16")];
+            tensor<int32, [4]> var_1040 = const()[name = string("op_1040"), val = tensor<int32, [4]>([1, 1024, 1, 1])];
+            tensor<fp16, [1, 1024, 1, 1]> current_key_11_cast_fp16 = reshape(shape = var_1040, x = mh_k_11_cast_fp16)[name = string("current_key_11_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 16]> var_1047_cast_fp16 = mul(x = var_96_cast_fp16_2, y = var_272_cast_fp16)[name = string("op_1047_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 16]> var_1048_cast_fp16 = mul(x = current_key_11_cast_fp16, y = var_270_cast_fp16)[name = string("op_1048_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 16]> key_15_cast_fp16 = add(x = var_1047_cast_fp16, y = var_1048_cast_fp16)[name = string("key_15_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 16]> var_1051_cast_fp16 = mul(x = var_104_cast_fp16_2, y = var_272_cast_fp16)[name = string("op_1051_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 16]> var_1052_cast_fp16 = mul(x = current_value_5_cast_fp16, y = var_270_cast_fp16)[name = string("op_1052_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 16]> value_9_cast_fp16 = add(x = var_1051_cast_fp16, y = var_1052_cast_fp16)[name = string("value_9_cast_fp16")];
+            tensor<int32, [4]> var_1056 = const()[name = string("op_1056"), val = tensor<int32, [4]>([1, 8, 128, 16])];
+            tensor<fp16, [1, 8, 128, 16]> key_heads_9_cast_fp16 = reshape(shape = var_1056, x = key_15_cast_fp16)[name = string("key_heads_9_cast_fp16")];
+            tensor<int32, [4]> var_1058 = const()[name = string("op_1058"), val = tensor<int32, [4]>([1, 8, 128, 16])];
+            tensor<fp16, [1, 8, 128, 16]> value_heads_9_cast_fp16 = reshape(shape = var_1058, x = value_9_cast_fp16)[name = string("value_heads_9_cast_fp16")];
+            tensor<int32, [4]> var_1061_begin_0 = const()[name = string("op_1061_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_1061_end_0 = const()[name = string("op_1061_end_0"), val = tensor<int32, [4]>([1, 1, 128, 16])];
+            tensor<bool, [4]> var_1061_end_mask_0 = const()[name = string("op_1061_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_1061_cast_fp16 = slice_by_index(begin = var_1061_begin_0, end = var_1061_end_0, end_mask = var_1061_end_mask_0, x = key_heads_9_cast_fp16)[name = string("op_1061_cast_fp16")];
+            tensor<int32, [4]> var_1065_begin_0 = const()[name = string("op_1065_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_1065_end_0 = const()[name = string("op_1065_end_0"), val = tensor<int32, [4]>([1, 1, 128, 16])];
+            tensor<bool, [4]> var_1065_end_mask_0 = const()[name = string("op_1065_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_1065_cast_fp16 = slice_by_index(begin = var_1065_begin_0, end = var_1065_end_0, end_mask = var_1065_end_mask_0, x = value_heads_9_cast_fp16)[name = string("op_1065_cast_fp16")];
+            tensor<int32, [4]> var_1077_begin_0 = const()[name = string("op_1077_begin_0"), val = tensor<int32, [4]>([0, 1, 0, 0])];
+            tensor<int32, [4]> var_1077_end_0 = const()[name = string("op_1077_end_0"), val = tensor<int32, [4]>([1, 2, 128, 16])];
+            tensor<bool, [4]> var_1077_end_mask_0 = const()[name = string("op_1077_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_1077_cast_fp16 = slice_by_index(begin = var_1077_begin_0, end = var_1077_end_0, end_mask = var_1077_end_mask_0, x = key_heads_9_cast_fp16)[name = string("op_1077_cast_fp16")];
+            tensor<int32, [4]> var_1081_begin_0 = const()[name = string("op_1081_begin_0"), val = tensor<int32, [4]>([0, 1, 0, 0])];
+            tensor<int32, [4]> var_1081_end_0 = const()[name = string("op_1081_end_0"), val = tensor<int32, [4]>([1, 2, 128, 16])];
+            tensor<bool, [4]> var_1081_end_mask_0 = const()[name = string("op_1081_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_1081_cast_fp16 = slice_by_index(begin = var_1081_begin_0, end = var_1081_end_0, end_mask = var_1081_end_mask_0, x = value_heads_9_cast_fp16)[name = string("op_1081_cast_fp16")];
+            tensor<int32, [4]> var_1093_begin_0 = const()[name = string("op_1093_begin_0"), val = tensor<int32, [4]>([0, 2, 0, 0])];
+            tensor<int32, [4]> var_1093_end_0 = const()[name = string("op_1093_end_0"), val = tensor<int32, [4]>([1, 3, 128, 16])];
+            tensor<bool, [4]> var_1093_end_mask_0 = const()[name = string("op_1093_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_1093_cast_fp16 = slice_by_index(begin = var_1093_begin_0, end = var_1093_end_0, end_mask = var_1093_end_mask_0, x = key_heads_9_cast_fp16)[name = string("op_1093_cast_fp16")];
+            tensor<int32, [4]> var_1097_begin_0 = const()[name = string("op_1097_begin_0"), val = tensor<int32, [4]>([0, 2, 0, 0])];
+            tensor<int32, [4]> var_1097_end_0 = const()[name = string("op_1097_end_0"), val = tensor<int32, [4]>([1, 3, 128, 16])];
+            tensor<bool, [4]> var_1097_end_mask_0 = const()[name = string("op_1097_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_1097_cast_fp16 = slice_by_index(begin = var_1097_begin_0, end = var_1097_end_0, end_mask = var_1097_end_mask_0, x = value_heads_9_cast_fp16)[name = string("op_1097_cast_fp16")];
+            tensor<int32, [4]> var_1109_begin_0 = const()[name = string("op_1109_begin_0"), val = tensor<int32, [4]>([0, 3, 0, 0])];
+            tensor<int32, [4]> var_1109_end_0 = const()[name = string("op_1109_end_0"), val = tensor<int32, [4]>([1, 4, 128, 16])];
+            tensor<bool, [4]> var_1109_end_mask_0 = const()[name = string("op_1109_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_1109_cast_fp16 = slice_by_index(begin = var_1109_begin_0, end = var_1109_end_0, end_mask = var_1109_end_mask_0, x = key_heads_9_cast_fp16)[name = string("op_1109_cast_fp16")];
+            tensor<int32, [4]> var_1113_begin_0 = const()[name = string("op_1113_begin_0"), val = tensor<int32, [4]>([0, 3, 0, 0])];
+            tensor<int32, [4]> var_1113_end_0 = const()[name = string("op_1113_end_0"), val = tensor<int32, [4]>([1, 4, 128, 16])];
+            tensor<bool, [4]> var_1113_end_mask_0 = const()[name = string("op_1113_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_1113_cast_fp16 = slice_by_index(begin = var_1113_begin_0, end = var_1113_end_0, end_mask = var_1113_end_mask_0, x = value_heads_9_cast_fp16)[name = string("op_1113_cast_fp16")];
+            tensor<int32, [4]> var_1125_begin_0 = const()[name = string("op_1125_begin_0"), val = tensor<int32, [4]>([0, 4, 0, 0])];
+            tensor<int32, [4]> var_1125_end_0 = const()[name = string("op_1125_end_0"), val = tensor<int32, [4]>([1, 5, 128, 16])];
+            tensor<bool, [4]> var_1125_end_mask_0 = const()[name = string("op_1125_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_1125_cast_fp16 = slice_by_index(begin = var_1125_begin_0, end = var_1125_end_0, end_mask = var_1125_end_mask_0, x = key_heads_9_cast_fp16)[name = string("op_1125_cast_fp16")];
+            tensor<int32, [4]> var_1129_begin_0 = const()[name = string("op_1129_begin_0"), val = tensor<int32, [4]>([0, 4, 0, 0])];
+            tensor<int32, [4]> var_1129_end_0 = const()[name = string("op_1129_end_0"), val = tensor<int32, [4]>([1, 5, 128, 16])];
+            tensor<bool, [4]> var_1129_end_mask_0 = const()[name = string("op_1129_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_1129_cast_fp16 = slice_by_index(begin = var_1129_begin_0, end = var_1129_end_0, end_mask = var_1129_end_mask_0, x = value_heads_9_cast_fp16)[name = string("op_1129_cast_fp16")];
+            tensor<int32, [4]> var_1141_begin_0 = const()[name = string("op_1141_begin_0"), val = tensor<int32, [4]>([0, 5, 0, 0])];
+            tensor<int32, [4]> var_1141_end_0 = const()[name = string("op_1141_end_0"), val = tensor<int32, [4]>([1, 6, 128, 16])];
+            tensor<bool, [4]> var_1141_end_mask_0 = const()[name = string("op_1141_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_1141_cast_fp16 = slice_by_index(begin = var_1141_begin_0, end = var_1141_end_0, end_mask = var_1141_end_mask_0, x = key_heads_9_cast_fp16)[name = string("op_1141_cast_fp16")];
+            tensor<int32, [4]> var_1145_begin_0 = const()[name = string("op_1145_begin_0"), val = tensor<int32, [4]>([0, 5, 0, 0])];
+            tensor<int32, [4]> var_1145_end_0 = const()[name = string("op_1145_end_0"), val = tensor<int32, [4]>([1, 6, 128, 16])];
+            tensor<bool, [4]> var_1145_end_mask_0 = const()[name = string("op_1145_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_1145_cast_fp16 = slice_by_index(begin = var_1145_begin_0, end = var_1145_end_0, end_mask = var_1145_end_mask_0, x = value_heads_9_cast_fp16)[name = string("op_1145_cast_fp16")];
+            tensor<int32, [4]> var_1157_begin_0 = const()[name = string("op_1157_begin_0"), val = tensor<int32, [4]>([0, 6, 0, 0])];
+            tensor<int32, [4]> var_1157_end_0 = const()[name = string("op_1157_end_0"), val = tensor<int32, [4]>([1, 7, 128, 16])];
+            tensor<bool, [4]> var_1157_end_mask_0 = const()[name = string("op_1157_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_1157_cast_fp16 = slice_by_index(begin = var_1157_begin_0, end = var_1157_end_0, end_mask = var_1157_end_mask_0, x = key_heads_9_cast_fp16)[name = string("op_1157_cast_fp16")];
+            tensor<int32, [4]> var_1161_begin_0 = const()[name = string("op_1161_begin_0"), val = tensor<int32, [4]>([0, 6, 0, 0])];
+            tensor<int32, [4]> var_1161_end_0 = const()[name = string("op_1161_end_0"), val = tensor<int32, [4]>([1, 7, 128, 16])];
+            tensor<bool, [4]> var_1161_end_mask_0 = const()[name = string("op_1161_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_1161_cast_fp16 = slice_by_index(begin = var_1161_begin_0, end = var_1161_end_0, end_mask = var_1161_end_mask_0, x = value_heads_9_cast_fp16)[name = string("op_1161_cast_fp16")];
+            tensor<int32, [4]> var_1173_begin_0 = const()[name = string("op_1173_begin_0"), val = tensor<int32, [4]>([0, 7, 0, 0])];
+            tensor<int32, [4]> var_1173_end_0 = const()[name = string("op_1173_end_0"), val = tensor<int32, [4]>([1, 1, 128, 16])];
+            tensor<bool, [4]> var_1173_end_mask_0 = const()[name = string("op_1173_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_1173_cast_fp16 = slice_by_index(begin = var_1173_begin_0, end = var_1173_end_0, end_mask = var_1173_end_mask_0, x = key_heads_9_cast_fp16)[name = string("op_1173_cast_fp16")];
+            tensor<int32, [4]> var_1177_begin_0 = const()[name = string("op_1177_begin_0"), val = tensor<int32, [4]>([0, 7, 0, 0])];
+            tensor<int32, [4]> var_1177_end_0 = const()[name = string("op_1177_end_0"), val = tensor<int32, [4]>([1, 1, 128, 16])];
+            tensor<bool, [4]> var_1177_end_mask_0 = const()[name = string("op_1177_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_1177_cast_fp16 = slice_by_index(begin = var_1177_begin_0, end = var_1177_end_0, end_mask = var_1177_end_mask_0, x = value_heads_9_cast_fp16)[name = string("op_1177_cast_fp16")];
+            bool key_heads_11_interleave_0 = const()[name = string("key_heads_11_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 16]> key_heads_11_cast_fp16 = concat(axis = var_903, interleave = key_heads_11_interleave_0, values = (var_1061_cast_fp16, var_1061_cast_fp16, var_1077_cast_fp16, var_1077_cast_fp16, var_1093_cast_fp16, var_1093_cast_fp16, var_1109_cast_fp16, var_1109_cast_fp16, var_1125_cast_fp16, var_1125_cast_fp16, var_1141_cast_fp16, var_1141_cast_fp16, var_1157_cast_fp16, var_1157_cast_fp16, var_1173_cast_fp16, var_1173_cast_fp16))[name = string("key_heads_11_cast_fp16")];
+            bool value_heads_11_interleave_0 = const()[name = string("value_heads_11_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 16]> value_heads_11_cast_fp16 = concat(axis = var_903, interleave = value_heads_11_interleave_0, values = (var_1065_cast_fp16, var_1065_cast_fp16, var_1081_cast_fp16, var_1081_cast_fp16, var_1097_cast_fp16, var_1097_cast_fp16, var_1113_cast_fp16, var_1113_cast_fp16, var_1129_cast_fp16, var_1129_cast_fp16, var_1145_cast_fp16, var_1145_cast_fp16, var_1161_cast_fp16, var_1161_cast_fp16, var_1177_cast_fp16, var_1177_cast_fp16))[name = string("value_heads_11_cast_fp16")];
+            fp16 var_1200_to_fp16 = const()[name = string("op_1200_to_fp16"), val = fp16(0x1.6ap-4)];
+            tensor<fp16, [1, 16, 128, 1]> var_1201_cast_fp16 = mul(x = mh_q_15_cast_fp16, y = var_1200_to_fp16)[name = string("op_1201_cast_fp16")];
+            bool mh_w_9_transpose_x_0 = const()[name = string("mh_w_9_transpose_x_0"), val = bool(true)];
+            bool mh_w_9_transpose_y_0 = const()[name = string("mh_w_9_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 1, 16]> mh_w_9_cast_fp16 = matmul(transpose_x = mh_w_9_transpose_x_0, transpose_y = mh_w_9_transpose_y_0, x = var_1201_cast_fp16, y = key_heads_11_cast_fp16)[name = string("mh_w_9_cast_fp16")];
+            tensor<fp16, [1, 16, 1, 16]> mh_w_11_cast_fp16 = add(x = mh_w_9_cast_fp16, y = var_436_cast_fp16)[name = string("mh_w_11_cast_fp16")];
+            tensor<fp16, [1, 16, 1, 16]> var_1213_cast_fp16 = softmax(axis = var_885, x = mh_w_11_cast_fp16)[name = string("op_1213_cast_fp16")];
+            bool attn_5_transpose_x_0 = const()[name = string("attn_5_transpose_x_0"), val = bool(false)];
+            bool attn_5_transpose_y_0 = const()[name = string("attn_5_transpose_y_0"), val = bool(true)];
+            tensor<fp16, [1, 16, 128, 1]> attn_5_cast_fp16 = matmul(transpose_x = attn_5_transpose_x_0, transpose_y = attn_5_transpose_y_0, x = value_heads_11_cast_fp16, y = var_1213_cast_fp16)[name = string("attn_5_cast_fp16")];
+            tensor<int32, [4]> var_1218 = const()[name = string("op_1218"), val = tensor<int32, [4]>([1, -1, 1, 1])];
+            tensor<fp16, [1, 2048, 1, 1]> input_17_cast_fp16 = reshape(shape = var_1218, x = attn_5_cast_fp16)[name = string("input_17_cast_fp16")];
+            string obj_27_pad_type_0 = const()[name = string("obj_27_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> obj_27_strides_0 = const()[name = string("obj_27_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> obj_27_pad_0 = const()[name = string("obj_27_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> obj_27_dilations_0 = const()[name = string("obj_27_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 obj_27_groups_0 = const()[name = string("obj_27_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 2048, 1, 1]> layers_2_self_attn_o_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(37789504))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(39886720))))[name = string("layers_2_self_attn_o_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> obj_27_cast_fp16 = conv(bias = layers_0_self_attn_v_proj_bias_to_fp16, dilations = obj_27_dilations_0, groups = obj_27_groups_0, pad = obj_27_pad_0, pad_type = obj_27_pad_type_0, strides = obj_27_strides_0, weight = layers_2_self_attn_o_proj_weight_to_fp16_palettized, x = input_17_cast_fp16)[name = string("obj_27_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> inputs_23_cast_fp16 = add(x = inputs_17_cast_fp16, y = obj_27_cast_fp16)[name = string("inputs_23_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> inputs_sq_23_cast_fp16 = mul(x = inputs_23_cast_fp16, y = inputs_23_cast_fp16)[name = string("inputs_sq_23_cast_fp16")];
+            tensor<int32, [1]> variance_23_axes_0 = const()[name = string("variance_23_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_23_keep_dims_0 = const()[name = string("variance_23_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 1, 1, 1]> variance_23_cast_fp16 = reduce_mean(axes = variance_23_axes_0, keep_dims = variance_23_keep_dims_0, x = inputs_sq_23_cast_fp16)[name = string("variance_23_cast_fp16")];
+            fp16 var_1236_to_fp16 = const()[name = string("op_1236_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 1, 1]> var_1237_cast_fp16 = add(x = variance_23_cast_fp16, y = var_1236_to_fp16)[name = string("op_1237_cast_fp16")];
+            fp32 var_1238_epsilon_0 = const()[name = string("op_1238_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 1, 1, 1]> var_1238_cast_fp16 = rsqrt(epsilon = var_1238_epsilon_0, x = var_1237_cast_fp16)[name = string("op_1238_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> hidden_states_27_cast_fp16 = mul(x = inputs_23_cast_fp16, y = var_1238_cast_fp16)[name = string("hidden_states_27_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> w_23_to_fp16 = const()[name = string("w_23_to_fp16"), val = tensor<fp16, [1, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(39887296)))];
+            tensor<fp16, [1, 1024, 1, 1]> input_19_cast_fp16 = mul(x = w_23_to_fp16, y = hidden_states_27_cast_fp16)[name = string("input_19_cast_fp16")];
+            string input_21_pad_type_0 = const()[name = string("input_21_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> input_21_strides_0 = const()[name = string("input_21_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> input_21_pad_0 = const()[name = string("input_21_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> input_21_dilations_0 = const()[name = string("input_21_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 input_21_groups_0 = const()[name = string("input_21_groups_0"), val = int32(1)];
+            tensor<fp16, [3072, 1024, 1, 1]> layers_2_mlp_gate_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [3072, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(39889408))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(43035200))))[name = string("layers_2_mlp_gate_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 3072, 1, 1]> input_21_cast_fp16 = conv(dilations = input_21_dilations_0, groups = input_21_groups_0, pad = input_21_pad_0, pad_type = input_21_pad_type_0, strides = input_21_strides_0, weight = layers_2_mlp_gate_proj_weight_to_fp16_palettized, x = input_19_cast_fp16)[name = string("input_21_cast_fp16")];
+            tensor<fp16, [1, 3072, 1, 1]> var_1252_cast_fp16 = silu(x = input_21_cast_fp16)[name = string("op_1252_cast_fp16")];
+            string var_1258_pad_type_0 = const()[name = string("op_1258_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_1258_strides_0 = const()[name = string("op_1258_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_1258_pad_0 = const()[name = string("op_1258_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_1258_dilations_0 = const()[name = string("op_1258_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_1258_groups_0 = const()[name = string("op_1258_groups_0"), val = int32(1)];
+            tensor<fp16, [3072, 1024, 1, 1]> layers_2_mlp_up_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [3072, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(43035776))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(46181568))))[name = string("layers_2_mlp_up_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 3072, 1, 1]> var_1258_cast_fp16 = conv(dilations = var_1258_dilations_0, groups = var_1258_groups_0, pad = var_1258_pad_0, pad_type = var_1258_pad_type_0, strides = var_1258_strides_0, weight = layers_2_mlp_up_proj_weight_to_fp16_palettized, x = input_19_cast_fp16)[name = string("op_1258_cast_fp16")];
+            tensor<fp16, [1, 3072, 1, 1]> input_23_cast_fp16 = mul(x = var_1252_cast_fp16, y = var_1258_cast_fp16)[name = string("input_23_cast_fp16")];
+            string hidden_states_29_pad_type_0 = const()[name = string("hidden_states_29_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> hidden_states_29_strides_0 = const()[name = string("hidden_states_29_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> hidden_states_29_pad_0 = const()[name = string("hidden_states_29_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> hidden_states_29_dilations_0 = const()[name = string("hidden_states_29_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 hidden_states_29_groups_0 = const()[name = string("hidden_states_29_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 3072, 1, 1]> layers_2_mlp_down_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 3072, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(46182144))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(49327936))))[name = string("layers_2_mlp_down_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> hidden_states_29_cast_fp16 = conv(dilations = hidden_states_29_dilations_0, groups = hidden_states_29_groups_0, pad = hidden_states_29_pad_0, pad_type = hidden_states_29_pad_type_0, strides = hidden_states_29_strides_0, weight = layers_2_mlp_down_proj_weight_to_fp16_palettized, x = input_23_cast_fp16)[name = string("hidden_states_29_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> inputs_25_cast_fp16 = add(x = inputs_23_cast_fp16, y = hidden_states_29_cast_fp16)[name = string("inputs_25_cast_fp16")];
+            int32 var_1272 = const()[name = string("op_1272"), val = int32(3)];
+            int32 var_1282 = const()[name = string("op_1282"), val = int32(-2)];
+            int32 var_1290 = const()[name = string("op_1290"), val = int32(1)];
+            tensor<fp16, [1, 1024, 1, 1]> inputs_sq_25_cast_fp16 = mul(x = inputs_25_cast_fp16, y = inputs_25_cast_fp16)[name = string("inputs_sq_25_cast_fp16")];
+            tensor<int32, [1]> variance_25_axes_0 = const()[name = string("variance_25_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_25_keep_dims_0 = const()[name = string("variance_25_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 1, 1, 1]> variance_25_cast_fp16 = reduce_mean(axes = variance_25_axes_0, keep_dims = variance_25_keep_dims_0, x = inputs_sq_25_cast_fp16)[name = string("variance_25_cast_fp16")];
+            fp16 var_1302_to_fp16 = const()[name = string("op_1302_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 1, 1]> var_1303_cast_fp16 = add(x = variance_25_cast_fp16, y = var_1302_to_fp16)[name = string("op_1303_cast_fp16")];
+            fp32 var_1304_epsilon_0 = const()[name = string("op_1304_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 1, 1, 1]> var_1304_cast_fp16 = rsqrt(epsilon = var_1304_epsilon_0, x = var_1303_cast_fp16)[name = string("op_1304_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> hidden_states_31_cast_fp16 = mul(x = inputs_25_cast_fp16, y = var_1304_cast_fp16)[name = string("hidden_states_31_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> w_25_to_fp16 = const()[name = string("w_25_to_fp16"), val = tensor<fp16, [1, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(49328512)))];
+            tensor<fp16, [1, 1024, 1, 1]> obj_29_cast_fp16 = mul(x = w_25_to_fp16, y = hidden_states_31_cast_fp16)[name = string("obj_29_cast_fp16")];
+            string query_19_pad_type_0 = const()[name = string("query_19_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> query_19_strides_0 = const()[name = string("query_19_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> query_19_pad_0 = const()[name = string("query_19_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> query_19_dilations_0 = const()[name = string("query_19_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 query_19_groups_0 = const()[name = string("query_19_groups_0"), val = int32(1)];
+            tensor<fp16, [2048, 1024, 1, 1]> layers_3_self_attn_q_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [2048, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(49330624))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(51427840))))[name = string("layers_3_self_attn_q_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1, 1]> query_19_cast_fp16 = conv(bias = layers_0_self_attn_q_proj_bias_to_fp16, dilations = query_19_dilations_0, groups = query_19_groups_0, pad = query_19_pad_0, pad_type = query_19_pad_type_0, strides = query_19_strides_0, weight = layers_3_self_attn_q_proj_weight_to_fp16_palettized, x = obj_29_cast_fp16)[name = string("query_19_cast_fp16")];
+            string current_key_13_pad_type_0 = const()[name = string("current_key_13_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> current_key_13_strides_0 = const()[name = string("current_key_13_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> current_key_13_pad_0 = const()[name = string("current_key_13_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> current_key_13_dilations_0 = const()[name = string("current_key_13_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 current_key_13_groups_0 = const()[name = string("current_key_13_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_3_self_attn_k_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(51428416))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(52477056))))[name = string("layers_3_self_attn_k_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> current_key_13_cast_fp16 = conv(dilations = current_key_13_dilations_0, groups = current_key_13_groups_0, pad = current_key_13_pad_0, pad_type = current_key_13_pad_type_0, strides = current_key_13_strides_0, weight = layers_3_self_attn_k_proj_weight_to_fp16_palettized, x = obj_29_cast_fp16)[name = string("current_key_13_cast_fp16")];
+            string current_value_7_pad_type_0 = const()[name = string("current_value_7_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> current_value_7_strides_0 = const()[name = string("current_value_7_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> current_value_7_pad_0 = const()[name = string("current_value_7_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> current_value_7_dilations_0 = const()[name = string("current_value_7_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 current_value_7_groups_0 = const()[name = string("current_value_7_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_3_self_attn_v_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(52477632))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(53526272))))[name = string("layers_3_self_attn_v_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> current_value_7_cast_fp16 = conv(bias = layers_0_self_attn_v_proj_bias_to_fp16, dilations = current_value_7_dilations_0, groups = current_value_7_groups_0, pad = current_value_7_pad_0, pad_type = current_value_7_pad_type_0, strides = current_value_7_strides_0, weight = layers_3_self_attn_v_proj_weight_to_fp16_palettized, x = obj_29_cast_fp16)[name = string("current_value_7_cast_fp16")];
+            tensor<int32, [4]> var_1341 = const()[name = string("op_1341"), val = tensor<int32, [4]>([16, 128, 1, 1])];
+            tensor<fp16, [16, 128, 1, 1]> inputs_27_cast_fp16 = reshape(shape = var_1341, x = query_19_cast_fp16)[name = string("inputs_27_cast_fp16")];
+            tensor<fp16, [16, 128, 1, 1]> inputs_sq_27_cast_fp16 = mul(x = inputs_27_cast_fp16, y = inputs_27_cast_fp16)[name = string("inputs_sq_27_cast_fp16")];
+            tensor<int32, [1]> variance_27_axes_0 = const()[name = string("variance_27_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_27_keep_dims_0 = const()[name = string("variance_27_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [16, 1, 1, 1]> variance_27_cast_fp16 = reduce_mean(axes = variance_27_axes_0, keep_dims = variance_27_keep_dims_0, x = inputs_sq_27_cast_fp16)[name = string("variance_27_cast_fp16")];
+            fp16 var_1347_to_fp16 = const()[name = string("op_1347_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [16, 1, 1, 1]> var_1348_cast_fp16 = add(x = variance_27_cast_fp16, y = var_1347_to_fp16)[name = string("op_1348_cast_fp16")];
+            fp32 var_1349_epsilon_0 = const()[name = string("op_1349_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [16, 1, 1, 1]> var_1349_cast_fp16 = rsqrt(epsilon = var_1349_epsilon_0, x = var_1348_cast_fp16)[name = string("op_1349_cast_fp16")];
+            tensor<fp16, [16, 128, 1, 1]> hidden_states_33_cast_fp16 = mul(x = inputs_27_cast_fp16, y = var_1349_cast_fp16)[name = string("hidden_states_33_cast_fp16")];
+            tensor<fp16, [1, 128, 1, 1]> w_27_to_fp16 = const()[name = string("w_27_to_fp16"), val = tensor<fp16, [1, 128, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(53526848)))];
+            tensor<fp16, [16, 128, 1, 1]> query_normed_7_cast_fp16 = mul(x = w_27_to_fp16, y = hidden_states_33_cast_fp16)[name = string("query_normed_7_cast_fp16")];
+            tensor<int32, [4]> var_1357 = const()[name = string("op_1357"), val = tensor<int32, [4]>([8, 128, 1, 1])];
+            tensor<fp16, [8, 128, 1, 1]> inputs_29_cast_fp16 = reshape(shape = var_1357, x = current_key_13_cast_fp16)[name = string("inputs_29_cast_fp16")];
+            tensor<fp16, [8, 128, 1, 1]> inputs_sq_29_cast_fp16 = mul(x = inputs_29_cast_fp16, y = inputs_29_cast_fp16)[name = string("inputs_sq_29_cast_fp16")];
+            tensor<int32, [1]> variance_29_axes_0 = const()[name = string("variance_29_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_29_keep_dims_0 = const()[name = string("variance_29_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [8, 1, 1, 1]> variance_29_cast_fp16 = reduce_mean(axes = variance_29_axes_0, keep_dims = variance_29_keep_dims_0, x = inputs_sq_29_cast_fp16)[name = string("variance_29_cast_fp16")];
+            fp16 var_1363_to_fp16 = const()[name = string("op_1363_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [8, 1, 1, 1]> var_1364_cast_fp16 = add(x = variance_29_cast_fp16, y = var_1363_to_fp16)[name = string("op_1364_cast_fp16")];
+            fp32 var_1365_epsilon_0 = const()[name = string("op_1365_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [8, 1, 1, 1]> var_1365_cast_fp16 = rsqrt(epsilon = var_1365_epsilon_0, x = var_1364_cast_fp16)[name = string("op_1365_cast_fp16")];
+            tensor<fp16, [8, 128, 1, 1]> hidden_states_35_cast_fp16 = mul(x = inputs_29_cast_fp16, y = var_1365_cast_fp16)[name = string("hidden_states_35_cast_fp16")];
+            tensor<fp16, [1, 128, 1, 1]> w_29_to_fp16 = const()[name = string("w_29_to_fp16"), val = tensor<fp16, [1, 128, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(53527168)))];
+            tensor<fp16, [8, 128, 1, 1]> current_key_normed_7_cast_fp16 = mul(x = w_29_to_fp16, y = hidden_states_35_cast_fp16)[name = string("current_key_normed_7_cast_fp16")];
+            tensor<int32, [4]> var_1383 = const()[name = string("op_1383"), val = tensor<int32, [4]>([1, 16, 128, -1])];
+            tensor<fp16, [1, 16, 128, 1]> mh_q_19_cast_fp16 = reshape(shape = var_1383, x = query_normed_7_cast_fp16)[name = string("mh_q_19_cast_fp16")];
+            tensor<int32, [4]> var_1385 = const()[name = string("op_1385"), val = tensor<int32, [4]>([1, 8, 128, -1])];
+            tensor<fp16, [1, 8, 128, 1]> mh_k_13_cast_fp16 = reshape(shape = var_1385, x = current_key_normed_7_cast_fp16)[name = string("mh_k_13_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> var_1389_cast_fp16 = mul(x = mh_q_19_cast_fp16, y = cos_1_cast_fp16)[name = string("op_1389_cast_fp16")];
+            tensor<int32, [4]> var_1394_begin_0 = const()[name = string("op_1394_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_1394_end_0 = const()[name = string("op_1394_end_0"), val = tensor<int32, [4]>([1, 16, 64, 1])];
+            tensor<bool, [4]> var_1394_end_mask_0 = const()[name = string("op_1394_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 16, 64, 1]> var_1394_cast_fp16 = slice_by_index(begin = var_1394_begin_0, end = var_1394_end_0, end_mask = var_1394_end_mask_0, x = mh_q_19_cast_fp16)[name = string("op_1394_cast_fp16")];
+            tensor<int32, [4]> var_1400_begin_0 = const()[name = string("op_1400_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_1400_end_0 = const()[name = string("op_1400_end_0"), val = tensor<int32, [4]>([1, 16, 128, 1])];
+            tensor<bool, [4]> var_1400_end_mask_0 = const()[name = string("op_1400_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 16, 64, 1]> var_1400_cast_fp16 = slice_by_index(begin = var_1400_begin_0, end = var_1400_end_0, end_mask = var_1400_end_mask_0, x = mh_q_19_cast_fp16)[name = string("op_1400_cast_fp16")];
+            fp16 const_86_promoted_to_fp16 = const()[name = string("const_86_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 16, 64, 1]> var_1402_cast_fp16 = mul(x = var_1400_cast_fp16, y = const_86_promoted_to_fp16)[name = string("op_1402_cast_fp16")];
+            bool var_1404_interleave_0 = const()[name = string("op_1404_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 1]> var_1404_cast_fp16 = concat(axis = var_1282, interleave = var_1404_interleave_0, values = (var_1402_cast_fp16, var_1394_cast_fp16))[name = string("op_1404_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> var_1405_cast_fp16 = mul(x = var_1404_cast_fp16, y = sin_1_cast_fp16)[name = string("op_1405_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> mh_q_21_cast_fp16 = add(x = var_1389_cast_fp16, y = var_1405_cast_fp16)[name = string("mh_q_21_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> var_1407_cast_fp16 = mul(x = mh_k_13_cast_fp16, y = cos_1_cast_fp16)[name = string("op_1407_cast_fp16")];
+            tensor<int32, [4]> var_1412_begin_0 = const()[name = string("op_1412_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_1412_end_0 = const()[name = string("op_1412_end_0"), val = tensor<int32, [4]>([1, 8, 64, 1])];
+            tensor<bool, [4]> var_1412_end_mask_0 = const()[name = string("op_1412_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 8, 64, 1]> var_1412_cast_fp16 = slice_by_index(begin = var_1412_begin_0, end = var_1412_end_0, end_mask = var_1412_end_mask_0, x = mh_k_13_cast_fp16)[name = string("op_1412_cast_fp16")];
+            tensor<int32, [4]> var_1418_begin_0 = const()[name = string("op_1418_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_1418_end_0 = const()[name = string("op_1418_end_0"), val = tensor<int32, [4]>([1, 8, 128, 1])];
+            tensor<bool, [4]> var_1418_end_mask_0 = const()[name = string("op_1418_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 8, 64, 1]> var_1418_cast_fp16 = slice_by_index(begin = var_1418_begin_0, end = var_1418_end_0, end_mask = var_1418_end_mask_0, x = mh_k_13_cast_fp16)[name = string("op_1418_cast_fp16")];
+            fp16 const_89_promoted_to_fp16 = const()[name = string("const_89_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 64, 1]> var_1420_cast_fp16 = mul(x = var_1418_cast_fp16, y = const_89_promoted_to_fp16)[name = string("op_1420_cast_fp16")];
+            bool var_1422_interleave_0 = const()[name = string("op_1422_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 128, 1]> var_1422_cast_fp16 = concat(axis = var_1282, interleave = var_1422_interleave_0, values = (var_1420_cast_fp16, var_1412_cast_fp16))[name = string("op_1422_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> var_1423_cast_fp16 = mul(x = var_1422_cast_fp16, y = sin_1_cast_fp16)[name = string("op_1423_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> mh_k_15_cast_fp16 = add(x = var_1407_cast_fp16, y = var_1423_cast_fp16)[name = string("mh_k_15_cast_fp16")];
+            tensor<int32, [4]> var_1427 = const()[name = string("op_1427"), val = tensor<int32, [4]>([1, 1024, 1, 1])];
+            tensor<fp16, [1, 1024, 1, 1]> current_key_15_cast_fp16 = reshape(shape = var_1427, x = mh_k_15_cast_fp16)[name = string("current_key_15_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 16]> var_1434_cast_fp16 = mul(x = var_96_cast_fp16_3, y = var_272_cast_fp16)[name = string("op_1434_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 16]> var_1435_cast_fp16 = mul(x = current_key_15_cast_fp16, y = var_270_cast_fp16)[name = string("op_1435_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 16]> key_21_cast_fp16 = add(x = var_1434_cast_fp16, y = var_1435_cast_fp16)[name = string("key_21_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 16]> var_1438_cast_fp16 = mul(x = var_104_cast_fp16_3, y = var_272_cast_fp16)[name = string("op_1438_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 16]> var_1439_cast_fp16 = mul(x = current_value_7_cast_fp16, y = var_270_cast_fp16)[name = string("op_1439_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 16]> value_13_cast_fp16 = add(x = var_1438_cast_fp16, y = var_1439_cast_fp16)[name = string("value_13_cast_fp16")];
+            tensor<int32, [4]> var_1443 = const()[name = string("op_1443"), val = tensor<int32, [4]>([1, 8, 128, 16])];
+            tensor<fp16, [1, 8, 128, 16]> key_heads_13_cast_fp16 = reshape(shape = var_1443, x = key_21_cast_fp16)[name = string("key_heads_13_cast_fp16")];
+            tensor<int32, [4]> var_1445 = const()[name = string("op_1445"), val = tensor<int32, [4]>([1, 8, 128, 16])];
+            tensor<fp16, [1, 8, 128, 16]> value_heads_13_cast_fp16 = reshape(shape = var_1445, x = value_13_cast_fp16)[name = string("value_heads_13_cast_fp16")];
+            tensor<int32, [4]> var_1448_begin_0 = const()[name = string("op_1448_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_1448_end_0 = const()[name = string("op_1448_end_0"), val = tensor<int32, [4]>([1, 1, 128, 16])];
+            tensor<bool, [4]> var_1448_end_mask_0 = const()[name = string("op_1448_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_1448_cast_fp16 = slice_by_index(begin = var_1448_begin_0, end = var_1448_end_0, end_mask = var_1448_end_mask_0, x = key_heads_13_cast_fp16)[name = string("op_1448_cast_fp16")];
+            tensor<int32, [4]> var_1452_begin_0 = const()[name = string("op_1452_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_1452_end_0 = const()[name = string("op_1452_end_0"), val = tensor<int32, [4]>([1, 1, 128, 16])];
+            tensor<bool, [4]> var_1452_end_mask_0 = const()[name = string("op_1452_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_1452_cast_fp16 = slice_by_index(begin = var_1452_begin_0, end = var_1452_end_0, end_mask = var_1452_end_mask_0, x = value_heads_13_cast_fp16)[name = string("op_1452_cast_fp16")];
+            tensor<int32, [4]> var_1464_begin_0 = const()[name = string("op_1464_begin_0"), val = tensor<int32, [4]>([0, 1, 0, 0])];
+            tensor<int32, [4]> var_1464_end_0 = const()[name = string("op_1464_end_0"), val = tensor<int32, [4]>([1, 2, 128, 16])];
+            tensor<bool, [4]> var_1464_end_mask_0 = const()[name = string("op_1464_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_1464_cast_fp16 = slice_by_index(begin = var_1464_begin_0, end = var_1464_end_0, end_mask = var_1464_end_mask_0, x = key_heads_13_cast_fp16)[name = string("op_1464_cast_fp16")];
+            tensor<int32, [4]> var_1468_begin_0 = const()[name = string("op_1468_begin_0"), val = tensor<int32, [4]>([0, 1, 0, 0])];
+            tensor<int32, [4]> var_1468_end_0 = const()[name = string("op_1468_end_0"), val = tensor<int32, [4]>([1, 2, 128, 16])];
+            tensor<bool, [4]> var_1468_end_mask_0 = const()[name = string("op_1468_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_1468_cast_fp16 = slice_by_index(begin = var_1468_begin_0, end = var_1468_end_0, end_mask = var_1468_end_mask_0, x = value_heads_13_cast_fp16)[name = string("op_1468_cast_fp16")];
+            tensor<int32, [4]> var_1480_begin_0 = const()[name = string("op_1480_begin_0"), val = tensor<int32, [4]>([0, 2, 0, 0])];
+            tensor<int32, [4]> var_1480_end_0 = const()[name = string("op_1480_end_0"), val = tensor<int32, [4]>([1, 3, 128, 16])];
+            tensor<bool, [4]> var_1480_end_mask_0 = const()[name = string("op_1480_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_1480_cast_fp16 = slice_by_index(begin = var_1480_begin_0, end = var_1480_end_0, end_mask = var_1480_end_mask_0, x = key_heads_13_cast_fp16)[name = string("op_1480_cast_fp16")];
+            tensor<int32, [4]> var_1484_begin_0 = const()[name = string("op_1484_begin_0"), val = tensor<int32, [4]>([0, 2, 0, 0])];
+            tensor<int32, [4]> var_1484_end_0 = const()[name = string("op_1484_end_0"), val = tensor<int32, [4]>([1, 3, 128, 16])];
+            tensor<bool, [4]> var_1484_end_mask_0 = const()[name = string("op_1484_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_1484_cast_fp16 = slice_by_index(begin = var_1484_begin_0, end = var_1484_end_0, end_mask = var_1484_end_mask_0, x = value_heads_13_cast_fp16)[name = string("op_1484_cast_fp16")];
+            tensor<int32, [4]> var_1496_begin_0 = const()[name = string("op_1496_begin_0"), val = tensor<int32, [4]>([0, 3, 0, 0])];
+            tensor<int32, [4]> var_1496_end_0 = const()[name = string("op_1496_end_0"), val = tensor<int32, [4]>([1, 4, 128, 16])];
+            tensor<bool, [4]> var_1496_end_mask_0 = const()[name = string("op_1496_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_1496_cast_fp16 = slice_by_index(begin = var_1496_begin_0, end = var_1496_end_0, end_mask = var_1496_end_mask_0, x = key_heads_13_cast_fp16)[name = string("op_1496_cast_fp16")];
+            tensor<int32, [4]> var_1500_begin_0 = const()[name = string("op_1500_begin_0"), val = tensor<int32, [4]>([0, 3, 0, 0])];
+            tensor<int32, [4]> var_1500_end_0 = const()[name = string("op_1500_end_0"), val = tensor<int32, [4]>([1, 4, 128, 16])];
+            tensor<bool, [4]> var_1500_end_mask_0 = const()[name = string("op_1500_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_1500_cast_fp16 = slice_by_index(begin = var_1500_begin_0, end = var_1500_end_0, end_mask = var_1500_end_mask_0, x = value_heads_13_cast_fp16)[name = string("op_1500_cast_fp16")];
+            tensor<int32, [4]> var_1512_begin_0 = const()[name = string("op_1512_begin_0"), val = tensor<int32, [4]>([0, 4, 0, 0])];
+            tensor<int32, [4]> var_1512_end_0 = const()[name = string("op_1512_end_0"), val = tensor<int32, [4]>([1, 5, 128, 16])];
+            tensor<bool, [4]> var_1512_end_mask_0 = const()[name = string("op_1512_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_1512_cast_fp16 = slice_by_index(begin = var_1512_begin_0, end = var_1512_end_0, end_mask = var_1512_end_mask_0, x = key_heads_13_cast_fp16)[name = string("op_1512_cast_fp16")];
+            tensor<int32, [4]> var_1516_begin_0 = const()[name = string("op_1516_begin_0"), val = tensor<int32, [4]>([0, 4, 0, 0])];
+            tensor<int32, [4]> var_1516_end_0 = const()[name = string("op_1516_end_0"), val = tensor<int32, [4]>([1, 5, 128, 16])];
+            tensor<bool, [4]> var_1516_end_mask_0 = const()[name = string("op_1516_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_1516_cast_fp16 = slice_by_index(begin = var_1516_begin_0, end = var_1516_end_0, end_mask = var_1516_end_mask_0, x = value_heads_13_cast_fp16)[name = string("op_1516_cast_fp16")];
+            tensor<int32, [4]> var_1528_begin_0 = const()[name = string("op_1528_begin_0"), val = tensor<int32, [4]>([0, 5, 0, 0])];
+            tensor<int32, [4]> var_1528_end_0 = const()[name = string("op_1528_end_0"), val = tensor<int32, [4]>([1, 6, 128, 16])];
+            tensor<bool, [4]> var_1528_end_mask_0 = const()[name = string("op_1528_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_1528_cast_fp16 = slice_by_index(begin = var_1528_begin_0, end = var_1528_end_0, end_mask = var_1528_end_mask_0, x = key_heads_13_cast_fp16)[name = string("op_1528_cast_fp16")];
+            tensor<int32, [4]> var_1532_begin_0 = const()[name = string("op_1532_begin_0"), val = tensor<int32, [4]>([0, 5, 0, 0])];
+            tensor<int32, [4]> var_1532_end_0 = const()[name = string("op_1532_end_0"), val = tensor<int32, [4]>([1, 6, 128, 16])];
+            tensor<bool, [4]> var_1532_end_mask_0 = const()[name = string("op_1532_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_1532_cast_fp16 = slice_by_index(begin = var_1532_begin_0, end = var_1532_end_0, end_mask = var_1532_end_mask_0, x = value_heads_13_cast_fp16)[name = string("op_1532_cast_fp16")];
+            tensor<int32, [4]> var_1544_begin_0 = const()[name = string("op_1544_begin_0"), val = tensor<int32, [4]>([0, 6, 0, 0])];
+            tensor<int32, [4]> var_1544_end_0 = const()[name = string("op_1544_end_0"), val = tensor<int32, [4]>([1, 7, 128, 16])];
+            tensor<bool, [4]> var_1544_end_mask_0 = const()[name = string("op_1544_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_1544_cast_fp16 = slice_by_index(begin = var_1544_begin_0, end = var_1544_end_0, end_mask = var_1544_end_mask_0, x = key_heads_13_cast_fp16)[name = string("op_1544_cast_fp16")];
+            tensor<int32, [4]> var_1548_begin_0 = const()[name = string("op_1548_begin_0"), val = tensor<int32, [4]>([0, 6, 0, 0])];
+            tensor<int32, [4]> var_1548_end_0 = const()[name = string("op_1548_end_0"), val = tensor<int32, [4]>([1, 7, 128, 16])];
+            tensor<bool, [4]> var_1548_end_mask_0 = const()[name = string("op_1548_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_1548_cast_fp16 = slice_by_index(begin = var_1548_begin_0, end = var_1548_end_0, end_mask = var_1548_end_mask_0, x = value_heads_13_cast_fp16)[name = string("op_1548_cast_fp16")];
+            tensor<int32, [4]> var_1560_begin_0 = const()[name = string("op_1560_begin_0"), val = tensor<int32, [4]>([0, 7, 0, 0])];
+            tensor<int32, [4]> var_1560_end_0 = const()[name = string("op_1560_end_0"), val = tensor<int32, [4]>([1, 1, 128, 16])];
+            tensor<bool, [4]> var_1560_end_mask_0 = const()[name = string("op_1560_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_1560_cast_fp16 = slice_by_index(begin = var_1560_begin_0, end = var_1560_end_0, end_mask = var_1560_end_mask_0, x = key_heads_13_cast_fp16)[name = string("op_1560_cast_fp16")];
+            tensor<int32, [4]> var_1564_begin_0 = const()[name = string("op_1564_begin_0"), val = tensor<int32, [4]>([0, 7, 0, 0])];
+            tensor<int32, [4]> var_1564_end_0 = const()[name = string("op_1564_end_0"), val = tensor<int32, [4]>([1, 1, 128, 16])];
+            tensor<bool, [4]> var_1564_end_mask_0 = const()[name = string("op_1564_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_1564_cast_fp16 = slice_by_index(begin = var_1564_begin_0, end = var_1564_end_0, end_mask = var_1564_end_mask_0, x = value_heads_13_cast_fp16)[name = string("op_1564_cast_fp16")];
+            bool key_heads_15_interleave_0 = const()[name = string("key_heads_15_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 16]> key_heads_15_cast_fp16 = concat(axis = var_1290, interleave = key_heads_15_interleave_0, values = (var_1448_cast_fp16, var_1448_cast_fp16, var_1464_cast_fp16, var_1464_cast_fp16, var_1480_cast_fp16, var_1480_cast_fp16, var_1496_cast_fp16, var_1496_cast_fp16, var_1512_cast_fp16, var_1512_cast_fp16, var_1528_cast_fp16, var_1528_cast_fp16, var_1544_cast_fp16, var_1544_cast_fp16, var_1560_cast_fp16, var_1560_cast_fp16))[name = string("key_heads_15_cast_fp16")];
+            bool value_heads_15_interleave_0 = const()[name = string("value_heads_15_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 16]> value_heads_15_cast_fp16 = concat(axis = var_1290, interleave = value_heads_15_interleave_0, values = (var_1452_cast_fp16, var_1452_cast_fp16, var_1468_cast_fp16, var_1468_cast_fp16, var_1484_cast_fp16, var_1484_cast_fp16, var_1500_cast_fp16, var_1500_cast_fp16, var_1516_cast_fp16, var_1516_cast_fp16, var_1532_cast_fp16, var_1532_cast_fp16, var_1548_cast_fp16, var_1548_cast_fp16, var_1564_cast_fp16, var_1564_cast_fp16))[name = string("value_heads_15_cast_fp16")];
+            fp16 var_1587_to_fp16 = const()[name = string("op_1587_to_fp16"), val = fp16(0x1.6ap-4)];
+            tensor<fp16, [1, 16, 128, 1]> var_1588_cast_fp16 = mul(x = mh_q_21_cast_fp16, y = var_1587_to_fp16)[name = string("op_1588_cast_fp16")];
+            bool mh_w_13_transpose_x_0 = const()[name = string("mh_w_13_transpose_x_0"), val = bool(true)];
+            bool mh_w_13_transpose_y_0 = const()[name = string("mh_w_13_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 1, 16]> mh_w_13_cast_fp16 = matmul(transpose_x = mh_w_13_transpose_x_0, transpose_y = mh_w_13_transpose_y_0, x = var_1588_cast_fp16, y = key_heads_15_cast_fp16)[name = string("mh_w_13_cast_fp16")];
+            tensor<fp16, [1, 16, 1, 16]> mh_w_15_cast_fp16 = add(x = mh_w_13_cast_fp16, y = var_436_cast_fp16)[name = string("mh_w_15_cast_fp16")];
+            tensor<fp16, [1, 16, 1, 16]> var_1600_cast_fp16 = softmax(axis = var_1272, x = mh_w_15_cast_fp16)[name = string("op_1600_cast_fp16")];
+            bool attn_7_transpose_x_0 = const()[name = string("attn_7_transpose_x_0"), val = bool(false)];
+            bool attn_7_transpose_y_0 = const()[name = string("attn_7_transpose_y_0"), val = bool(true)];
+            tensor<fp16, [1, 16, 128, 1]> attn_7_cast_fp16 = matmul(transpose_x = attn_7_transpose_x_0, transpose_y = attn_7_transpose_y_0, x = value_heads_15_cast_fp16, y = var_1600_cast_fp16)[name = string("attn_7_cast_fp16")];
+            tensor<int32, [4]> var_1605 = const()[name = string("op_1605"), val = tensor<int32, [4]>([1, -1, 1, 1])];
+            tensor<fp16, [1, 2048, 1, 1]> input_25_cast_fp16 = reshape(shape = var_1605, x = attn_7_cast_fp16)[name = string("input_25_cast_fp16")];
+            string obj_35_pad_type_0 = const()[name = string("obj_35_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> obj_35_strides_0 = const()[name = string("obj_35_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> obj_35_pad_0 = const()[name = string("obj_35_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> obj_35_dilations_0 = const()[name = string("obj_35_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 obj_35_groups_0 = const()[name = string("obj_35_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 2048, 1, 1]> layers_3_self_attn_o_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(53527488))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(55624704))))[name = string("layers_3_self_attn_o_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> obj_35_cast_fp16 = conv(bias = layers_0_self_attn_v_proj_bias_to_fp16, dilations = obj_35_dilations_0, groups = obj_35_groups_0, pad = obj_35_pad_0, pad_type = obj_35_pad_type_0, strides = obj_35_strides_0, weight = layers_3_self_attn_o_proj_weight_to_fp16_palettized, x = input_25_cast_fp16)[name = string("obj_35_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> inputs_31_cast_fp16 = add(x = inputs_25_cast_fp16, y = obj_35_cast_fp16)[name = string("inputs_31_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> inputs_sq_31_cast_fp16 = mul(x = inputs_31_cast_fp16, y = inputs_31_cast_fp16)[name = string("inputs_sq_31_cast_fp16")];
+            tensor<int32, [1]> variance_31_axes_0 = const()[name = string("variance_31_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_31_keep_dims_0 = const()[name = string("variance_31_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 1, 1, 1]> variance_31_cast_fp16 = reduce_mean(axes = variance_31_axes_0, keep_dims = variance_31_keep_dims_0, x = inputs_sq_31_cast_fp16)[name = string("variance_31_cast_fp16")];
+            fp16 var_1623_to_fp16 = const()[name = string("op_1623_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 1, 1]> var_1624_cast_fp16 = add(x = variance_31_cast_fp16, y = var_1623_to_fp16)[name = string("op_1624_cast_fp16")];
+            fp32 var_1625_epsilon_0 = const()[name = string("op_1625_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 1, 1, 1]> var_1625_cast_fp16 = rsqrt(epsilon = var_1625_epsilon_0, x = var_1624_cast_fp16)[name = string("op_1625_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> hidden_states_37_cast_fp16 = mul(x = inputs_31_cast_fp16, y = var_1625_cast_fp16)[name = string("hidden_states_37_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> w_31_to_fp16 = const()[name = string("w_31_to_fp16"), val = tensor<fp16, [1, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(55625280)))];
+            tensor<fp16, [1, 1024, 1, 1]> input_27_cast_fp16 = mul(x = w_31_to_fp16, y = hidden_states_37_cast_fp16)[name = string("input_27_cast_fp16")];
+            string input_29_pad_type_0 = const()[name = string("input_29_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> input_29_strides_0 = const()[name = string("input_29_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> input_29_pad_0 = const()[name = string("input_29_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> input_29_dilations_0 = const()[name = string("input_29_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 input_29_groups_0 = const()[name = string("input_29_groups_0"), val = int32(1)];
+            tensor<fp16, [3072, 1024, 1, 1]> layers_3_mlp_gate_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [3072, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(55627392))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(58773184))))[name = string("layers_3_mlp_gate_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 3072, 1, 1]> input_29_cast_fp16 = conv(dilations = input_29_dilations_0, groups = input_29_groups_0, pad = input_29_pad_0, pad_type = input_29_pad_type_0, strides = input_29_strides_0, weight = layers_3_mlp_gate_proj_weight_to_fp16_palettized, x = input_27_cast_fp16)[name = string("input_29_cast_fp16")];
+            tensor<fp16, [1, 3072, 1, 1]> var_1639_cast_fp16 = silu(x = input_29_cast_fp16)[name = string("op_1639_cast_fp16")];
+            string var_1645_pad_type_0 = const()[name = string("op_1645_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_1645_strides_0 = const()[name = string("op_1645_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_1645_pad_0 = const()[name = string("op_1645_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_1645_dilations_0 = const()[name = string("op_1645_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_1645_groups_0 = const()[name = string("op_1645_groups_0"), val = int32(1)];
+            tensor<fp16, [3072, 1024, 1, 1]> layers_3_mlp_up_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [3072, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(58773760))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(61919552))))[name = string("layers_3_mlp_up_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 3072, 1, 1]> var_1645_cast_fp16 = conv(dilations = var_1645_dilations_0, groups = var_1645_groups_0, pad = var_1645_pad_0, pad_type = var_1645_pad_type_0, strides = var_1645_strides_0, weight = layers_3_mlp_up_proj_weight_to_fp16_palettized, x = input_27_cast_fp16)[name = string("op_1645_cast_fp16")];
+            tensor<fp16, [1, 3072, 1, 1]> input_31_cast_fp16 = mul(x = var_1639_cast_fp16, y = var_1645_cast_fp16)[name = string("input_31_cast_fp16")];
+            string hidden_states_39_pad_type_0 = const()[name = string("hidden_states_39_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> hidden_states_39_strides_0 = const()[name = string("hidden_states_39_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> hidden_states_39_pad_0 = const()[name = string("hidden_states_39_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> hidden_states_39_dilations_0 = const()[name = string("hidden_states_39_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 hidden_states_39_groups_0 = const()[name = string("hidden_states_39_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 3072, 1, 1]> layers_3_mlp_down_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 3072, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(61920128))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(65065920))))[name = string("layers_3_mlp_down_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> hidden_states_39_cast_fp16 = conv(dilations = hidden_states_39_dilations_0, groups = hidden_states_39_groups_0, pad = hidden_states_39_pad_0, pad_type = hidden_states_39_pad_type_0, strides = hidden_states_39_strides_0, weight = layers_3_mlp_down_proj_weight_to_fp16_palettized, x = input_31_cast_fp16)[name = string("hidden_states_39_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> inputs_33_cast_fp16 = add(x = inputs_31_cast_fp16, y = hidden_states_39_cast_fp16)[name = string("inputs_33_cast_fp16")];
+            int32 var_1659 = const()[name = string("op_1659"), val = int32(3)];
+            int32 var_1669 = const()[name = string("op_1669"), val = int32(-2)];
+            int32 var_1677 = const()[name = string("op_1677"), val = int32(1)];
+            tensor<fp16, [1, 1024, 1, 1]> inputs_sq_33_cast_fp16 = mul(x = inputs_33_cast_fp16, y = inputs_33_cast_fp16)[name = string("inputs_sq_33_cast_fp16")];
+            tensor<int32, [1]> variance_33_axes_0 = const()[name = string("variance_33_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_33_keep_dims_0 = const()[name = string("variance_33_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 1, 1, 1]> variance_33_cast_fp16 = reduce_mean(axes = variance_33_axes_0, keep_dims = variance_33_keep_dims_0, x = inputs_sq_33_cast_fp16)[name = string("variance_33_cast_fp16")];
+            fp16 var_1689_to_fp16 = const()[name = string("op_1689_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 1, 1]> var_1690_cast_fp16 = add(x = variance_33_cast_fp16, y = var_1689_to_fp16)[name = string("op_1690_cast_fp16")];
+            fp32 var_1691_epsilon_0 = const()[name = string("op_1691_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 1, 1, 1]> var_1691_cast_fp16 = rsqrt(epsilon = var_1691_epsilon_0, x = var_1690_cast_fp16)[name = string("op_1691_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> hidden_states_41_cast_fp16 = mul(x = inputs_33_cast_fp16, y = var_1691_cast_fp16)[name = string("hidden_states_41_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> w_33_to_fp16 = const()[name = string("w_33_to_fp16"), val = tensor<fp16, [1, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(65066496)))];
+            tensor<fp16, [1, 1024, 1, 1]> obj_37_cast_fp16 = mul(x = w_33_to_fp16, y = hidden_states_41_cast_fp16)[name = string("obj_37_cast_fp16")];
+            string query_25_pad_type_0 = const()[name = string("query_25_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> query_25_strides_0 = const()[name = string("query_25_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> query_25_pad_0 = const()[name = string("query_25_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> query_25_dilations_0 = const()[name = string("query_25_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 query_25_groups_0 = const()[name = string("query_25_groups_0"), val = int32(1)];
+            tensor<fp16, [2048, 1024, 1, 1]> layers_4_self_attn_q_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [2048, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(65068608))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(67165824))))[name = string("layers_4_self_attn_q_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1, 1]> query_25_cast_fp16 = conv(bias = layers_0_self_attn_q_proj_bias_to_fp16, dilations = query_25_dilations_0, groups = query_25_groups_0, pad = query_25_pad_0, pad_type = query_25_pad_type_0, strides = query_25_strides_0, weight = layers_4_self_attn_q_proj_weight_to_fp16_palettized, x = obj_37_cast_fp16)[name = string("query_25_cast_fp16")];
+            string current_key_17_pad_type_0 = const()[name = string("current_key_17_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> current_key_17_strides_0 = const()[name = string("current_key_17_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> current_key_17_pad_0 = const()[name = string("current_key_17_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> current_key_17_dilations_0 = const()[name = string("current_key_17_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 current_key_17_groups_0 = const()[name = string("current_key_17_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_4_self_attn_k_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(67166400))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(68215040))))[name = string("layers_4_self_attn_k_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> current_key_17_cast_fp16 = conv(dilations = current_key_17_dilations_0, groups = current_key_17_groups_0, pad = current_key_17_pad_0, pad_type = current_key_17_pad_type_0, strides = current_key_17_strides_0, weight = layers_4_self_attn_k_proj_weight_to_fp16_palettized, x = obj_37_cast_fp16)[name = string("current_key_17_cast_fp16")];
+            string current_value_pad_type_0 = const()[name = string("current_value_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> current_value_strides_0 = const()[name = string("current_value_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> current_value_pad_0 = const()[name = string("current_value_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> current_value_dilations_0 = const()[name = string("current_value_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 current_value_groups_0 = const()[name = string("current_value_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_4_self_attn_v_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(68215616))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(69264256))))[name = string("layers_4_self_attn_v_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> current_value_cast_fp16 = conv(bias = layers_0_self_attn_v_proj_bias_to_fp16, dilations = current_value_dilations_0, groups = current_value_groups_0, pad = current_value_pad_0, pad_type = current_value_pad_type_0, strides = current_value_strides_0, weight = layers_4_self_attn_v_proj_weight_to_fp16_palettized, x = obj_37_cast_fp16)[name = string("current_value_cast_fp16")];
+            tensor<int32, [4]> var_1728 = const()[name = string("op_1728"), val = tensor<int32, [4]>([16, 128, 1, 1])];
+            tensor<fp16, [16, 128, 1, 1]> inputs_35_cast_fp16 = reshape(shape = var_1728, x = query_25_cast_fp16)[name = string("inputs_35_cast_fp16")];
+            tensor<fp16, [16, 128, 1, 1]> inputs_sq_35_cast_fp16 = mul(x = inputs_35_cast_fp16, y = inputs_35_cast_fp16)[name = string("inputs_sq_35_cast_fp16")];
+            tensor<int32, [1]> variance_35_axes_0 = const()[name = string("variance_35_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_35_keep_dims_0 = const()[name = string("variance_35_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [16, 1, 1, 1]> variance_35_cast_fp16 = reduce_mean(axes = variance_35_axes_0, keep_dims = variance_35_keep_dims_0, x = inputs_sq_35_cast_fp16)[name = string("variance_35_cast_fp16")];
+            fp16 var_1734_to_fp16 = const()[name = string("op_1734_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [16, 1, 1, 1]> var_1735_cast_fp16 = add(x = variance_35_cast_fp16, y = var_1734_to_fp16)[name = string("op_1735_cast_fp16")];
+            fp32 var_1736_epsilon_0 = const()[name = string("op_1736_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [16, 1, 1, 1]> var_1736_cast_fp16 = rsqrt(epsilon = var_1736_epsilon_0, x = var_1735_cast_fp16)[name = string("op_1736_cast_fp16")];
+            tensor<fp16, [16, 128, 1, 1]> hidden_states_43_cast_fp16 = mul(x = inputs_35_cast_fp16, y = var_1736_cast_fp16)[name = string("hidden_states_43_cast_fp16")];
+            tensor<fp16, [1, 128, 1, 1]> w_35_to_fp16 = const()[name = string("w_35_to_fp16"), val = tensor<fp16, [1, 128, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(69264832)))];
+            tensor<fp16, [16, 128, 1, 1]> query_normed_cast_fp16 = mul(x = w_35_to_fp16, y = hidden_states_43_cast_fp16)[name = string("query_normed_cast_fp16")];
+            tensor<int32, [4]> var_1744 = const()[name = string("op_1744"), val = tensor<int32, [4]>([8, 128, 1, 1])];
+            tensor<fp16, [8, 128, 1, 1]> inputs_37_cast_fp16 = reshape(shape = var_1744, x = current_key_17_cast_fp16)[name = string("inputs_37_cast_fp16")];
+            tensor<fp16, [8, 128, 1, 1]> inputs_sq_37_cast_fp16 = mul(x = inputs_37_cast_fp16, y = inputs_37_cast_fp16)[name = string("inputs_sq_37_cast_fp16")];
+            tensor<int32, [1]> variance_37_axes_0 = const()[name = string("variance_37_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_37_keep_dims_0 = const()[name = string("variance_37_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [8, 1, 1, 1]> variance_37_cast_fp16 = reduce_mean(axes = variance_37_axes_0, keep_dims = variance_37_keep_dims_0, x = inputs_sq_37_cast_fp16)[name = string("variance_37_cast_fp16")];
+            fp16 var_1750_to_fp16 = const()[name = string("op_1750_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [8, 1, 1, 1]> var_1751_cast_fp16 = add(x = variance_37_cast_fp16, y = var_1750_to_fp16)[name = string("op_1751_cast_fp16")];
+            fp32 var_1752_epsilon_0 = const()[name = string("op_1752_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [8, 1, 1, 1]> var_1752_cast_fp16 = rsqrt(epsilon = var_1752_epsilon_0, x = var_1751_cast_fp16)[name = string("op_1752_cast_fp16")];
+            tensor<fp16, [8, 128, 1, 1]> hidden_states_45_cast_fp16 = mul(x = inputs_37_cast_fp16, y = var_1752_cast_fp16)[name = string("hidden_states_45_cast_fp16")];
+            tensor<fp16, [1, 128, 1, 1]> w_37_to_fp16 = const()[name = string("w_37_to_fp16"), val = tensor<fp16, [1, 128, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(69265152)))];
+            tensor<fp16, [8, 128, 1, 1]> current_key_normed_cast_fp16 = mul(x = w_37_to_fp16, y = hidden_states_45_cast_fp16)[name = string("current_key_normed_cast_fp16")];
+            tensor<int32, [4]> var_1770 = const()[name = string("op_1770"), val = tensor<int32, [4]>([1, 16, 128, -1])];
+            tensor<fp16, [1, 16, 128, 1]> mh_q_25_cast_fp16 = reshape(shape = var_1770, x = query_normed_cast_fp16)[name = string("mh_q_25_cast_fp16")];
+            tensor<int32, [4]> var_1772 = const()[name = string("op_1772"), val = tensor<int32, [4]>([1, 8, 128, -1])];
+            tensor<fp16, [1, 8, 128, 1]> mh_k_17_cast_fp16 = reshape(shape = var_1772, x = current_key_normed_cast_fp16)[name = string("mh_k_17_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> var_1776_cast_fp16 = mul(x = mh_q_25_cast_fp16, y = cos_1_cast_fp16)[name = string("op_1776_cast_fp16")];
+            tensor<int32, [4]> var_1781_begin_0 = const()[name = string("op_1781_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_1781_end_0 = const()[name = string("op_1781_end_0"), val = tensor<int32, [4]>([1, 16, 64, 1])];
+            tensor<bool, [4]> var_1781_end_mask_0 = const()[name = string("op_1781_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 16, 64, 1]> var_1781_cast_fp16 = slice_by_index(begin = var_1781_begin_0, end = var_1781_end_0, end_mask = var_1781_end_mask_0, x = mh_q_25_cast_fp16)[name = string("op_1781_cast_fp16")];
+            tensor<int32, [4]> var_1787_begin_0 = const()[name = string("op_1787_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_1787_end_0 = const()[name = string("op_1787_end_0"), val = tensor<int32, [4]>([1, 16, 128, 1])];
+            tensor<bool, [4]> var_1787_end_mask_0 = const()[name = string("op_1787_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 16, 64, 1]> var_1787_cast_fp16 = slice_by_index(begin = var_1787_begin_0, end = var_1787_end_0, end_mask = var_1787_end_mask_0, x = mh_q_25_cast_fp16)[name = string("op_1787_cast_fp16")];
+            fp16 const_109_promoted_to_fp16 = const()[name = string("const_109_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 16, 64, 1]> var_1789_cast_fp16 = mul(x = var_1787_cast_fp16, y = const_109_promoted_to_fp16)[name = string("op_1789_cast_fp16")];
+            bool var_1791_interleave_0 = const()[name = string("op_1791_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 1]> var_1791_cast_fp16 = concat(axis = var_1669, interleave = var_1791_interleave_0, values = (var_1789_cast_fp16, var_1781_cast_fp16))[name = string("op_1791_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> var_1792_cast_fp16 = mul(x = var_1791_cast_fp16, y = sin_1_cast_fp16)[name = string("op_1792_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> mh_q_27_cast_fp16 = add(x = var_1776_cast_fp16, y = var_1792_cast_fp16)[name = string("mh_q_27_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> var_1794_cast_fp16 = mul(x = mh_k_17_cast_fp16, y = cos_1_cast_fp16)[name = string("op_1794_cast_fp16")];
+            tensor<int32, [4]> var_1799_begin_0 = const()[name = string("op_1799_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_1799_end_0 = const()[name = string("op_1799_end_0"), val = tensor<int32, [4]>([1, 8, 64, 1])];
+            tensor<bool, [4]> var_1799_end_mask_0 = const()[name = string("op_1799_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 8, 64, 1]> var_1799_cast_fp16 = slice_by_index(begin = var_1799_begin_0, end = var_1799_end_0, end_mask = var_1799_end_mask_0, x = mh_k_17_cast_fp16)[name = string("op_1799_cast_fp16")];
+            tensor<int32, [4]> var_1805_begin_0 = const()[name = string("op_1805_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_1805_end_0 = const()[name = string("op_1805_end_0"), val = tensor<int32, [4]>([1, 8, 128, 1])];
+            tensor<bool, [4]> var_1805_end_mask_0 = const()[name = string("op_1805_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 8, 64, 1]> var_1805_cast_fp16 = slice_by_index(begin = var_1805_begin_0, end = var_1805_end_0, end_mask = var_1805_end_mask_0, x = mh_k_17_cast_fp16)[name = string("op_1805_cast_fp16")];
+            fp16 const_112_promoted_to_fp16 = const()[name = string("const_112_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 64, 1]> var_1807_cast_fp16 = mul(x = var_1805_cast_fp16, y = const_112_promoted_to_fp16)[name = string("op_1807_cast_fp16")];
+            bool var_1809_interleave_0 = const()[name = string("op_1809_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 128, 1]> var_1809_cast_fp16 = concat(axis = var_1669, interleave = var_1809_interleave_0, values = (var_1807_cast_fp16, var_1799_cast_fp16))[name = string("op_1809_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> var_1810_cast_fp16 = mul(x = var_1809_cast_fp16, y = sin_1_cast_fp16)[name = string("op_1810_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> mh_k_cast_fp16 = add(x = var_1794_cast_fp16, y = var_1810_cast_fp16)[name = string("mh_k_cast_fp16")];
+            tensor<int32, [4]> var_1814 = const()[name = string("op_1814"), val = tensor<int32, [4]>([1, 1024, 1, 1])];
+            tensor<fp16, [1, 1024, 1, 1]> current_key_cast_fp16 = reshape(shape = var_1814, x = mh_k_cast_fp16)[name = string("current_key_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 16]> var_1821_cast_fp16 = mul(x = var_96_cast_fp16_4, y = var_272_cast_fp16)[name = string("op_1821_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 16]> var_1822_cast_fp16 = mul(x = current_key_cast_fp16, y = var_270_cast_fp16)[name = string("op_1822_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 16]> key_27_cast_fp16 = add(x = var_1821_cast_fp16, y = var_1822_cast_fp16)[name = string("key_27_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 16]> var_1825_cast_fp16 = mul(x = var_104_cast_fp16_4, y = var_272_cast_fp16)[name = string("op_1825_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 16]> var_1826_cast_fp16 = mul(x = current_value_cast_fp16, y = var_270_cast_fp16)[name = string("op_1826_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 16]> value_17_cast_fp16 = add(x = var_1825_cast_fp16, y = var_1826_cast_fp16)[name = string("value_17_cast_fp16")];
+            tensor<int32, [4]> var_1830 = const()[name = string("op_1830"), val = tensor<int32, [4]>([1, 8, 128, 16])];
+            tensor<fp16, [1, 8, 128, 16]> key_heads_17_cast_fp16 = reshape(shape = var_1830, x = key_27_cast_fp16)[name = string("key_heads_17_cast_fp16")];
+            tensor<int32, [4]> var_1832 = const()[name = string("op_1832"), val = tensor<int32, [4]>([1, 8, 128, 16])];
+            tensor<fp16, [1, 8, 128, 16]> value_heads_17_cast_fp16 = reshape(shape = var_1832, x = value_17_cast_fp16)[name = string("value_heads_17_cast_fp16")];
+            tensor<int32, [4]> var_1835_begin_0 = const()[name = string("op_1835_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_1835_end_0 = const()[name = string("op_1835_end_0"), val = tensor<int32, [4]>([1, 1, 128, 16])];
+            tensor<bool, [4]> var_1835_end_mask_0 = const()[name = string("op_1835_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_1835_cast_fp16 = slice_by_index(begin = var_1835_begin_0, end = var_1835_end_0, end_mask = var_1835_end_mask_0, x = key_heads_17_cast_fp16)[name = string("op_1835_cast_fp16")];
+            tensor<int32, [4]> var_1839_begin_0 = const()[name = string("op_1839_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_1839_end_0 = const()[name = string("op_1839_end_0"), val = tensor<int32, [4]>([1, 1, 128, 16])];
+            tensor<bool, [4]> var_1839_end_mask_0 = const()[name = string("op_1839_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_1839_cast_fp16 = slice_by_index(begin = var_1839_begin_0, end = var_1839_end_0, end_mask = var_1839_end_mask_0, x = value_heads_17_cast_fp16)[name = string("op_1839_cast_fp16")];
+            tensor<int32, [4]> var_1851_begin_0 = const()[name = string("op_1851_begin_0"), val = tensor<int32, [4]>([0, 1, 0, 0])];
+            tensor<int32, [4]> var_1851_end_0 = const()[name = string("op_1851_end_0"), val = tensor<int32, [4]>([1, 2, 128, 16])];
+            tensor<bool, [4]> var_1851_end_mask_0 = const()[name = string("op_1851_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_1851_cast_fp16 = slice_by_index(begin = var_1851_begin_0, end = var_1851_end_0, end_mask = var_1851_end_mask_0, x = key_heads_17_cast_fp16)[name = string("op_1851_cast_fp16")];
+            tensor<int32, [4]> var_1855_begin_0 = const()[name = string("op_1855_begin_0"), val = tensor<int32, [4]>([0, 1, 0, 0])];
+            tensor<int32, [4]> var_1855_end_0 = const()[name = string("op_1855_end_0"), val = tensor<int32, [4]>([1, 2, 128, 16])];
+            tensor<bool, [4]> var_1855_end_mask_0 = const()[name = string("op_1855_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_1855_cast_fp16 = slice_by_index(begin = var_1855_begin_0, end = var_1855_end_0, end_mask = var_1855_end_mask_0, x = value_heads_17_cast_fp16)[name = string("op_1855_cast_fp16")];
+            tensor<int32, [4]> var_1867_begin_0 = const()[name = string("op_1867_begin_0"), val = tensor<int32, [4]>([0, 2, 0, 0])];
+            tensor<int32, [4]> var_1867_end_0 = const()[name = string("op_1867_end_0"), val = tensor<int32, [4]>([1, 3, 128, 16])];
+            tensor<bool, [4]> var_1867_end_mask_0 = const()[name = string("op_1867_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_1867_cast_fp16 = slice_by_index(begin = var_1867_begin_0, end = var_1867_end_0, end_mask = var_1867_end_mask_0, x = key_heads_17_cast_fp16)[name = string("op_1867_cast_fp16")];
+            tensor<int32, [4]> var_1871_begin_0 = const()[name = string("op_1871_begin_0"), val = tensor<int32, [4]>([0, 2, 0, 0])];
+            tensor<int32, [4]> var_1871_end_0 = const()[name = string("op_1871_end_0"), val = tensor<int32, [4]>([1, 3, 128, 16])];
+            tensor<bool, [4]> var_1871_end_mask_0 = const()[name = string("op_1871_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_1871_cast_fp16 = slice_by_index(begin = var_1871_begin_0, end = var_1871_end_0, end_mask = var_1871_end_mask_0, x = value_heads_17_cast_fp16)[name = string("op_1871_cast_fp16")];
+            tensor<int32, [4]> var_1883_begin_0 = const()[name = string("op_1883_begin_0"), val = tensor<int32, [4]>([0, 3, 0, 0])];
+            tensor<int32, [4]> var_1883_end_0 = const()[name = string("op_1883_end_0"), val = tensor<int32, [4]>([1, 4, 128, 16])];
+            tensor<bool, [4]> var_1883_end_mask_0 = const()[name = string("op_1883_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_1883_cast_fp16 = slice_by_index(begin = var_1883_begin_0, end = var_1883_end_0, end_mask = var_1883_end_mask_0, x = key_heads_17_cast_fp16)[name = string("op_1883_cast_fp16")];
+            tensor<int32, [4]> var_1887_begin_0 = const()[name = string("op_1887_begin_0"), val = tensor<int32, [4]>([0, 3, 0, 0])];
+            tensor<int32, [4]> var_1887_end_0 = const()[name = string("op_1887_end_0"), val = tensor<int32, [4]>([1, 4, 128, 16])];
+            tensor<bool, [4]> var_1887_end_mask_0 = const()[name = string("op_1887_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_1887_cast_fp16 = slice_by_index(begin = var_1887_begin_0, end = var_1887_end_0, end_mask = var_1887_end_mask_0, x = value_heads_17_cast_fp16)[name = string("op_1887_cast_fp16")];
+            tensor<int32, [4]> var_1899_begin_0 = const()[name = string("op_1899_begin_0"), val = tensor<int32, [4]>([0, 4, 0, 0])];
+            tensor<int32, [4]> var_1899_end_0 = const()[name = string("op_1899_end_0"), val = tensor<int32, [4]>([1, 5, 128, 16])];
+            tensor<bool, [4]> var_1899_end_mask_0 = const()[name = string("op_1899_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_1899_cast_fp16 = slice_by_index(begin = var_1899_begin_0, end = var_1899_end_0, end_mask = var_1899_end_mask_0, x = key_heads_17_cast_fp16)[name = string("op_1899_cast_fp16")];
+            tensor<int32, [4]> var_1903_begin_0 = const()[name = string("op_1903_begin_0"), val = tensor<int32, [4]>([0, 4, 0, 0])];
+            tensor<int32, [4]> var_1903_end_0 = const()[name = string("op_1903_end_0"), val = tensor<int32, [4]>([1, 5, 128, 16])];
+            tensor<bool, [4]> var_1903_end_mask_0 = const()[name = string("op_1903_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_1903_cast_fp16 = slice_by_index(begin = var_1903_begin_0, end = var_1903_end_0, end_mask = var_1903_end_mask_0, x = value_heads_17_cast_fp16)[name = string("op_1903_cast_fp16")];
+            tensor<int32, [4]> var_1915_begin_0 = const()[name = string("op_1915_begin_0"), val = tensor<int32, [4]>([0, 5, 0, 0])];
+            tensor<int32, [4]> var_1915_end_0 = const()[name = string("op_1915_end_0"), val = tensor<int32, [4]>([1, 6, 128, 16])];
+            tensor<bool, [4]> var_1915_end_mask_0 = const()[name = string("op_1915_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_1915_cast_fp16 = slice_by_index(begin = var_1915_begin_0, end = var_1915_end_0, end_mask = var_1915_end_mask_0, x = key_heads_17_cast_fp16)[name = string("op_1915_cast_fp16")];
+            tensor<int32, [4]> var_1919_begin_0 = const()[name = string("op_1919_begin_0"), val = tensor<int32, [4]>([0, 5, 0, 0])];
+            tensor<int32, [4]> var_1919_end_0 = const()[name = string("op_1919_end_0"), val = tensor<int32, [4]>([1, 6, 128, 16])];
+            tensor<bool, [4]> var_1919_end_mask_0 = const()[name = string("op_1919_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_1919_cast_fp16 = slice_by_index(begin = var_1919_begin_0, end = var_1919_end_0, end_mask = var_1919_end_mask_0, x = value_heads_17_cast_fp16)[name = string("op_1919_cast_fp16")];
+            tensor<int32, [4]> var_1931_begin_0 = const()[name = string("op_1931_begin_0"), val = tensor<int32, [4]>([0, 6, 0, 0])];
+            tensor<int32, [4]> var_1931_end_0 = const()[name = string("op_1931_end_0"), val = tensor<int32, [4]>([1, 7, 128, 16])];
+            tensor<bool, [4]> var_1931_end_mask_0 = const()[name = string("op_1931_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_1931_cast_fp16 = slice_by_index(begin = var_1931_begin_0, end = var_1931_end_0, end_mask = var_1931_end_mask_0, x = key_heads_17_cast_fp16)[name = string("op_1931_cast_fp16")];
+            tensor<int32, [4]> var_1935_begin_0 = const()[name = string("op_1935_begin_0"), val = tensor<int32, [4]>([0, 6, 0, 0])];
+            tensor<int32, [4]> var_1935_end_0 = const()[name = string("op_1935_end_0"), val = tensor<int32, [4]>([1, 7, 128, 16])];
+            tensor<bool, [4]> var_1935_end_mask_0 = const()[name = string("op_1935_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_1935_cast_fp16 = slice_by_index(begin = var_1935_begin_0, end = var_1935_end_0, end_mask = var_1935_end_mask_0, x = value_heads_17_cast_fp16)[name = string("op_1935_cast_fp16")];
+            tensor<int32, [4]> var_1947_begin_0 = const()[name = string("op_1947_begin_0"), val = tensor<int32, [4]>([0, 7, 0, 0])];
+            tensor<int32, [4]> var_1947_end_0 = const()[name = string("op_1947_end_0"), val = tensor<int32, [4]>([1, 1, 128, 16])];
+            tensor<bool, [4]> var_1947_end_mask_0 = const()[name = string("op_1947_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_1947_cast_fp16 = slice_by_index(begin = var_1947_begin_0, end = var_1947_end_0, end_mask = var_1947_end_mask_0, x = key_heads_17_cast_fp16)[name = string("op_1947_cast_fp16")];
+            tensor<int32, [4]> var_1951_begin_0 = const()[name = string("op_1951_begin_0"), val = tensor<int32, [4]>([0, 7, 0, 0])];
+            tensor<int32, [4]> var_1951_end_0 = const()[name = string("op_1951_end_0"), val = tensor<int32, [4]>([1, 1, 128, 16])];
+            tensor<bool, [4]> var_1951_end_mask_0 = const()[name = string("op_1951_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_1951_cast_fp16 = slice_by_index(begin = var_1951_begin_0, end = var_1951_end_0, end_mask = var_1951_end_mask_0, x = value_heads_17_cast_fp16)[name = string("op_1951_cast_fp16")];
+            bool key_heads_interleave_0 = const()[name = string("key_heads_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 16]> key_heads_cast_fp16 = concat(axis = var_1677, interleave = key_heads_interleave_0, values = (var_1835_cast_fp16, var_1835_cast_fp16, var_1851_cast_fp16, var_1851_cast_fp16, var_1867_cast_fp16, var_1867_cast_fp16, var_1883_cast_fp16, var_1883_cast_fp16, var_1899_cast_fp16, var_1899_cast_fp16, var_1915_cast_fp16, var_1915_cast_fp16, var_1931_cast_fp16, var_1931_cast_fp16, var_1947_cast_fp16, var_1947_cast_fp16))[name = string("key_heads_cast_fp16")];
+            bool value_heads_interleave_0 = const()[name = string("value_heads_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 16]> value_heads_cast_fp16 = concat(axis = var_1677, interleave = value_heads_interleave_0, values = (var_1839_cast_fp16, var_1839_cast_fp16, var_1855_cast_fp16, var_1855_cast_fp16, var_1871_cast_fp16, var_1871_cast_fp16, var_1887_cast_fp16, var_1887_cast_fp16, var_1903_cast_fp16, var_1903_cast_fp16, var_1919_cast_fp16, var_1919_cast_fp16, var_1935_cast_fp16, var_1935_cast_fp16, var_1951_cast_fp16, var_1951_cast_fp16))[name = string("value_heads_cast_fp16")];
+            fp16 var_1974_to_fp16 = const()[name = string("op_1974_to_fp16"), val = fp16(0x1.6ap-4)];
+            tensor<fp16, [1, 16, 128, 1]> var_1975_cast_fp16 = mul(x = mh_q_27_cast_fp16, y = var_1974_to_fp16)[name = string("op_1975_cast_fp16")];
+            bool mh_w_17_transpose_x_0 = const()[name = string("mh_w_17_transpose_x_0"), val = bool(true)];
+            bool mh_w_17_transpose_y_0 = const()[name = string("mh_w_17_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 1, 16]> mh_w_17_cast_fp16 = matmul(transpose_x = mh_w_17_transpose_x_0, transpose_y = mh_w_17_transpose_y_0, x = var_1975_cast_fp16, y = key_heads_cast_fp16)[name = string("mh_w_17_cast_fp16")];
+            tensor<fp16, [1, 16, 1, 16]> mh_w_cast_fp16 = add(x = mh_w_17_cast_fp16, y = var_436_cast_fp16)[name = string("mh_w_cast_fp16")];
+            tensor<fp16, [1, 16, 1, 16]> var_1987_cast_fp16 = softmax(axis = var_1659, x = mh_w_cast_fp16)[name = string("op_1987_cast_fp16")];
+            bool attn_transpose_x_0 = const()[name = string("attn_transpose_x_0"), val = bool(false)];
+            bool attn_transpose_y_0 = const()[name = string("attn_transpose_y_0"), val = bool(true)];
+            tensor<fp16, [1, 16, 128, 1]> attn_cast_fp16 = matmul(transpose_x = attn_transpose_x_0, transpose_y = attn_transpose_y_0, x = value_heads_cast_fp16, y = var_1987_cast_fp16)[name = string("attn_cast_fp16")];
+            tensor<int32, [4]> var_1992 = const()[name = string("op_1992"), val = tensor<int32, [4]>([1, -1, 1, 1])];
+            tensor<fp16, [1, 2048, 1, 1]> input_33_cast_fp16 = reshape(shape = var_1992, x = attn_cast_fp16)[name = string("input_33_cast_fp16")];
+            string obj_pad_type_0 = const()[name = string("obj_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> obj_strides_0 = const()[name = string("obj_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> obj_pad_0 = const()[name = string("obj_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> obj_dilations_0 = const()[name = string("obj_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 obj_groups_0 = const()[name = string("obj_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 2048, 1, 1]> layers_4_self_attn_o_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(69265472))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(71362688))))[name = string("layers_4_self_attn_o_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> obj_cast_fp16 = conv(bias = layers_0_self_attn_v_proj_bias_to_fp16, dilations = obj_dilations_0, groups = obj_groups_0, pad = obj_pad_0, pad_type = obj_pad_type_0, strides = obj_strides_0, weight = layers_4_self_attn_o_proj_weight_to_fp16_palettized, x = input_33_cast_fp16)[name = string("obj_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> inputs_39_cast_fp16 = add(x = inputs_33_cast_fp16, y = obj_cast_fp16)[name = string("inputs_39_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> inputs_sq_39_cast_fp16 = mul(x = inputs_39_cast_fp16, y = inputs_39_cast_fp16)[name = string("inputs_sq_39_cast_fp16")];
+            tensor<int32, [1]> variance_39_axes_0 = const()[name = string("variance_39_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_39_keep_dims_0 = const()[name = string("variance_39_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 1, 1, 1]> variance_39_cast_fp16 = reduce_mean(axes = variance_39_axes_0, keep_dims = variance_39_keep_dims_0, x = inputs_sq_39_cast_fp16)[name = string("variance_39_cast_fp16")];
+            fp16 var_2010_to_fp16 = const()[name = string("op_2010_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 1, 1]> var_2011_cast_fp16 = add(x = variance_39_cast_fp16, y = var_2010_to_fp16)[name = string("op_2011_cast_fp16")];
+            fp32 var_2012_epsilon_0 = const()[name = string("op_2012_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 1, 1, 1]> var_2012_cast_fp16 = rsqrt(epsilon = var_2012_epsilon_0, x = var_2011_cast_fp16)[name = string("op_2012_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> hidden_states_47_cast_fp16 = mul(x = inputs_39_cast_fp16, y = var_2012_cast_fp16)[name = string("hidden_states_47_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> w_39_to_fp16 = const()[name = string("w_39_to_fp16"), val = tensor<fp16, [1, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(71363264)))];
+            tensor<fp16, [1, 1024, 1, 1]> input_35_cast_fp16 = mul(x = w_39_to_fp16, y = hidden_states_47_cast_fp16)[name = string("input_35_cast_fp16")];
+            string input_37_pad_type_0 = const()[name = string("input_37_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> input_37_strides_0 = const()[name = string("input_37_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> input_37_pad_0 = const()[name = string("input_37_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> input_37_dilations_0 = const()[name = string("input_37_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 input_37_groups_0 = const()[name = string("input_37_groups_0"), val = int32(1)];
+            tensor<fp16, [3072, 1024, 1, 1]> layers_4_mlp_gate_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [3072, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(71365376))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(74511168))))[name = string("layers_4_mlp_gate_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 3072, 1, 1]> input_37_cast_fp16 = conv(dilations = input_37_dilations_0, groups = input_37_groups_0, pad = input_37_pad_0, pad_type = input_37_pad_type_0, strides = input_37_strides_0, weight = layers_4_mlp_gate_proj_weight_to_fp16_palettized, x = input_35_cast_fp16)[name = string("input_37_cast_fp16")];
+            tensor<fp16, [1, 3072, 1, 1]> var_2026_cast_fp16 = silu(x = input_37_cast_fp16)[name = string("op_2026_cast_fp16")];
+            string var_2032_pad_type_0 = const()[name = string("op_2032_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_2032_strides_0 = const()[name = string("op_2032_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_2032_pad_0 = const()[name = string("op_2032_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_2032_dilations_0 = const()[name = string("op_2032_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_2032_groups_0 = const()[name = string("op_2032_groups_0"), val = int32(1)];
+            tensor<fp16, [3072, 1024, 1, 1]> layers_4_mlp_up_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [3072, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(74511744))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(77657536))))[name = string("layers_4_mlp_up_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 3072, 1, 1]> var_2032_cast_fp16 = conv(dilations = var_2032_dilations_0, groups = var_2032_groups_0, pad = var_2032_pad_0, pad_type = var_2032_pad_type_0, strides = var_2032_strides_0, weight = layers_4_mlp_up_proj_weight_to_fp16_palettized, x = input_35_cast_fp16)[name = string("op_2032_cast_fp16")];
+            tensor<fp16, [1, 3072, 1, 1]> input_39_cast_fp16 = mul(x = var_2026_cast_fp16, y = var_2032_cast_fp16)[name = string("input_39_cast_fp16")];
+            string hidden_states_49_pad_type_0 = const()[name = string("hidden_states_49_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> hidden_states_49_strides_0 = const()[name = string("hidden_states_49_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> hidden_states_49_pad_0 = const()[name = string("hidden_states_49_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> hidden_states_49_dilations_0 = const()[name = string("hidden_states_49_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 hidden_states_49_groups_0 = const()[name = string("hidden_states_49_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 3072, 1, 1]> layers_4_mlp_down_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 3072, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(77658112))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(80803904))))[name = string("layers_4_mlp_down_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> hidden_states_49_cast_fp16 = conv(dilations = hidden_states_49_dilations_0, groups = hidden_states_49_groups_0, pad = hidden_states_49_pad_0, pad_type = hidden_states_49_pad_type_0, strides = hidden_states_49_strides_0, weight = layers_4_mlp_down_proj_weight_to_fp16_palettized, x = input_39_cast_fp16)[name = string("hidden_states_49_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> inputs_cast_fp16 = add(x = inputs_39_cast_fp16, y = hidden_states_49_cast_fp16)[name = string("inputs_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> inputs_sq_cast_fp16 = mul(x = inputs_cast_fp16, y = inputs_cast_fp16)[name = string("inputs_sq_cast_fp16")];
+            tensor<int32, [1]> variance_axes_0 = const()[name = string("variance_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_keep_dims_0 = const()[name = string("variance_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 1, 1, 1]> variance_cast_fp16 = reduce_mean(axes = variance_axes_0, keep_dims = variance_keep_dims_0, x = inputs_sq_cast_fp16)[name = string("variance_cast_fp16")];
+            fp16 var_2053_to_fp16 = const()[name = string("op_2053_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 1, 1]> var_2054_cast_fp16 = add(x = variance_cast_fp16, y = var_2053_to_fp16)[name = string("op_2054_cast_fp16")];
+            fp32 var_2055_epsilon_0 = const()[name = string("op_2055_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 1, 1, 1]> var_2055_cast_fp16 = rsqrt(epsilon = var_2055_epsilon_0, x = var_2054_cast_fp16)[name = string("op_2055_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> hidden_states_cast_fp16 = mul(x = inputs_cast_fp16, y = var_2055_cast_fp16)[name = string("hidden_states_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> w_to_fp16 = const()[name = string("w_to_fp16"), val = tensor<fp16, [1, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(80804480)))];
+            tensor<fp16, [1, 1024, 1, 1]> input_cast_fp16 = mul(x = w_to_fp16, y = hidden_states_cast_fp16)[name = string("input_cast_fp16")];
+            string logits_1_pad_type_0 = const()[name = string("logits_1_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> logits_1_strides_0 = const()[name = string("logits_1_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> logits_1_pad_0 = const()[name = string("logits_1_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> logits_1_dilations_0 = const()[name = string("logits_1_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 logits_1_groups_0 = const()[name = string("logits_1_groups_0"), val = int32(1)];
+            tensor<fp16, [2048, 1024, 1, 1]> lm_heads_0_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [2048, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(80806592))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(82903808))))[name = string("lm_heads_0_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1, 1]> logits_1_cast_fp16 = conv(dilations = logits_1_dilations_0, groups = logits_1_groups_0, pad = logits_1_pad_0, pad_type = logits_1_pad_type_0, strides = logits_1_strides_0, weight = lm_heads_0_weight_to_fp16_palettized, x = input_cast_fp16)[name = string("logits_1_cast_fp16")];
+            tensor<int32, [1]> var_2072_axes_0 = const()[name = string("op_2072_axes_0"), val = tensor<int32, [1]>([3])];
+            tensor<fp16, [1, 2048, 1]> var_2072_cast_fp16 = squeeze(axes = var_2072_axes_0, x = logits_1_cast_fp16)[name = string("op_2072_cast_fp16")];
+            string logits_3_pad_type_0 = const()[name = string("logits_3_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> logits_3_strides_0 = const()[name = string("logits_3_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> logits_3_pad_0 = const()[name = string("logits_3_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> logits_3_dilations_0 = const()[name = string("logits_3_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 logits_3_groups_0 = const()[name = string("logits_3_groups_0"), val = int32(1)];
+            tensor<fp16, [2048, 1024, 1, 1]> lm_heads_1_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [2048, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(82904384))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(85001600))))[name = string("lm_heads_1_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1, 1]> logits_3_cast_fp16 = conv(dilations = logits_3_dilations_0, groups = logits_3_groups_0, pad = logits_3_pad_0, pad_type = logits_3_pad_type_0, strides = logits_3_strides_0, weight = lm_heads_1_weight_to_fp16_palettized, x = input_cast_fp16)[name = string("logits_3_cast_fp16")];
+            tensor<int32, [1]> var_2088_axes_0 = const()[name = string("op_2088_axes_0"), val = tensor<int32, [1]>([3])];
+            tensor<fp16, [1, 2048, 1]> var_2088_cast_fp16 = squeeze(axes = var_2088_axes_0, x = logits_3_cast_fp16)[name = string("op_2088_cast_fp16")];
+            string logits_5_pad_type_0 = const()[name = string("logits_5_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> logits_5_strides_0 = const()[name = string("logits_5_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> logits_5_pad_0 = const()[name = string("logits_5_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> logits_5_dilations_0 = const()[name = string("logits_5_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 logits_5_groups_0 = const()[name = string("logits_5_groups_0"), val = int32(1)];
+            tensor<fp16, [2048, 1024, 1, 1]> lm_heads_2_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [2048, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(85002176))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(87099392))))[name = string("lm_heads_2_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1, 1]> logits_5_cast_fp16 = conv(dilations = logits_5_dilations_0, groups = logits_5_groups_0, pad = logits_5_pad_0, pad_type = logits_5_pad_type_0, strides = logits_5_strides_0, weight = lm_heads_2_weight_to_fp16_palettized, x = input_cast_fp16)[name = string("logits_5_cast_fp16")];
+            tensor<int32, [1]> var_2104_axes_0 = const()[name = string("op_2104_axes_0"), val = tensor<int32, [1]>([3])];
+            tensor<fp16, [1, 2048, 1]> var_2104_cast_fp16 = squeeze(axes = var_2104_axes_0, x = logits_5_cast_fp16)[name = string("op_2104_cast_fp16")];
+            string logits_7_pad_type_0 = const()[name = string("logits_7_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> logits_7_strides_0 = const()[name = string("logits_7_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> logits_7_pad_0 = const()[name = string("logits_7_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> logits_7_dilations_0 = const()[name = string("logits_7_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 logits_7_groups_0 = const()[name = string("logits_7_groups_0"), val = int32(1)];
+            tensor<fp16, [2048, 1024, 1, 1]> lm_heads_3_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [2048, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(87099968))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(89197184))))[name = string("lm_heads_3_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1, 1]> logits_7_cast_fp16 = conv(dilations = logits_7_dilations_0, groups = logits_7_groups_0, pad = logits_7_pad_0, pad_type = logits_7_pad_type_0, strides = logits_7_strides_0, weight = lm_heads_3_weight_to_fp16_palettized, x = input_cast_fp16)[name = string("logits_7_cast_fp16")];
+            tensor<int32, [1]> var_2120_axes_0 = const()[name = string("op_2120_axes_0"), val = tensor<int32, [1]>([3])];
+            tensor<fp16, [1, 2048, 1]> var_2120_cast_fp16 = squeeze(axes = var_2120_axes_0, x = logits_7_cast_fp16)[name = string("op_2120_cast_fp16")];
+            string logits_9_pad_type_0 = const()[name = string("logits_9_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> logits_9_strides_0 = const()[name = string("logits_9_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> logits_9_pad_0 = const()[name = string("logits_9_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> logits_9_dilations_0 = const()[name = string("logits_9_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 logits_9_groups_0 = const()[name = string("logits_9_groups_0"), val = int32(1)];
+            tensor<fp16, [2048, 1024, 1, 1]> lm_heads_4_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [2048, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(89197760))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(91294976))))[name = string("lm_heads_4_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1, 1]> logits_9_cast_fp16 = conv(dilations = logits_9_dilations_0, groups = logits_9_groups_0, pad = logits_9_pad_0, pad_type = logits_9_pad_type_0, strides = logits_9_strides_0, weight = lm_heads_4_weight_to_fp16_palettized, x = input_cast_fp16)[name = string("logits_9_cast_fp16")];
+            tensor<int32, [1]> var_2136_axes_0 = const()[name = string("op_2136_axes_0"), val = tensor<int32, [1]>([3])];
+            tensor<fp16, [1, 2048, 1]> var_2136_cast_fp16 = squeeze(axes = var_2136_axes_0, x = logits_9_cast_fp16)[name = string("op_2136_cast_fp16")];
+            string logits_11_pad_type_0 = const()[name = string("logits_11_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> logits_11_strides_0 = const()[name = string("logits_11_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> logits_11_pad_0 = const()[name = string("logits_11_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> logits_11_dilations_0 = const()[name = string("logits_11_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 logits_11_groups_0 = const()[name = string("logits_11_groups_0"), val = int32(1)];
+            tensor<fp16, [2048, 1024, 1, 1]> lm_heads_5_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [2048, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(91295552))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(93392768))))[name = string("lm_heads_5_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1, 1]> logits_11_cast_fp16 = conv(dilations = logits_11_dilations_0, groups = logits_11_groups_0, pad = logits_11_pad_0, pad_type = logits_11_pad_type_0, strides = logits_11_strides_0, weight = lm_heads_5_weight_to_fp16_palettized, x = input_cast_fp16)[name = string("logits_11_cast_fp16")];
+            tensor<int32, [1]> var_2152_axes_0 = const()[name = string("op_2152_axes_0"), val = tensor<int32, [1]>([3])];
+            tensor<fp16, [1, 2048, 1]> var_2152_cast_fp16 = squeeze(axes = var_2152_axes_0, x = logits_11_cast_fp16)[name = string("op_2152_cast_fp16")];
+            string logits_13_pad_type_0 = const()[name = string("logits_13_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> logits_13_strides_0 = const()[name = string("logits_13_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> logits_13_pad_0 = const()[name = string("logits_13_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> logits_13_dilations_0 = const()[name = string("logits_13_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 logits_13_groups_0 = const()[name = string("logits_13_groups_0"), val = int32(1)];
+            tensor<fp16, [2048, 1024, 1, 1]> lm_heads_6_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [2048, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(93393344))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(95490560))))[name = string("lm_heads_6_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1, 1]> logits_13_cast_fp16 = conv(dilations = logits_13_dilations_0, groups = logits_13_groups_0, pad = logits_13_pad_0, pad_type = logits_13_pad_type_0, strides = logits_13_strides_0, weight = lm_heads_6_weight_to_fp16_palettized, x = input_cast_fp16)[name = string("logits_13_cast_fp16")];
+            tensor<int32, [1]> var_2168_axes_0 = const()[name = string("op_2168_axes_0"), val = tensor<int32, [1]>([3])];
+            tensor<fp16, [1, 2048, 1]> var_2168_cast_fp16 = squeeze(axes = var_2168_axes_0, x = logits_13_cast_fp16)[name = string("op_2168_cast_fp16")];
+            string logits_15_pad_type_0 = const()[name = string("logits_15_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> logits_15_strides_0 = const()[name = string("logits_15_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> logits_15_pad_0 = const()[name = string("logits_15_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> logits_15_dilations_0 = const()[name = string("logits_15_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 logits_15_groups_0 = const()[name = string("logits_15_groups_0"), val = int32(1)];
+            tensor<fp16, [2048, 1024, 1, 1]> lm_heads_7_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [2048, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(95491136))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(97588352))))[name = string("lm_heads_7_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1, 1]> logits_15_cast_fp16 = conv(dilations = logits_15_dilations_0, groups = logits_15_groups_0, pad = logits_15_pad_0, pad_type = logits_15_pad_type_0, strides = logits_15_strides_0, weight = lm_heads_7_weight_to_fp16_palettized, x = input_cast_fp16)[name = string("logits_15_cast_fp16")];
+            tensor<int32, [1]> var_2184_axes_0 = const()[name = string("op_2184_axes_0"), val = tensor<int32, [1]>([3])];
+            tensor<fp16, [1, 2048, 1]> var_2184_cast_fp16 = squeeze(axes = var_2184_axes_0, x = logits_15_cast_fp16)[name = string("op_2184_cast_fp16")];
+            string logits_17_pad_type_0 = const()[name = string("logits_17_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> logits_17_strides_0 = const()[name = string("logits_17_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> logits_17_pad_0 = const()[name = string("logits_17_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> logits_17_dilations_0 = const()[name = string("logits_17_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 logits_17_groups_0 = const()[name = string("logits_17_groups_0"), val = int32(1)];
+            tensor<fp16, [2048, 1024, 1, 1]> lm_heads_8_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [2048, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(97588928))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(99686144))))[name = string("lm_heads_8_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1, 1]> logits_17_cast_fp16 = conv(dilations = logits_17_dilations_0, groups = logits_17_groups_0, pad = logits_17_pad_0, pad_type = logits_17_pad_type_0, strides = logits_17_strides_0, weight = lm_heads_8_weight_to_fp16_palettized, x = input_cast_fp16)[name = string("logits_17_cast_fp16")];
+            tensor<int32, [1]> var_2200_axes_0 = const()[name = string("op_2200_axes_0"), val = tensor<int32, [1]>([3])];
+            tensor<fp16, [1, 2048, 1]> var_2200_cast_fp16 = squeeze(axes = var_2200_axes_0, x = logits_17_cast_fp16)[name = string("op_2200_cast_fp16")];
+            string logits_19_pad_type_0 = const()[name = string("logits_19_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> logits_19_strides_0 = const()[name = string("logits_19_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> logits_19_pad_0 = const()[name = string("logits_19_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> logits_19_dilations_0 = const()[name = string("logits_19_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 logits_19_groups_0 = const()[name = string("logits_19_groups_0"), val = int32(1)];
+            tensor<fp16, [2048, 1024, 1, 1]> lm_heads_9_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [2048, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(99686720))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(101783936))))[name = string("lm_heads_9_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1, 1]> logits_19_cast_fp16 = conv(dilations = logits_19_dilations_0, groups = logits_19_groups_0, pad = logits_19_pad_0, pad_type = logits_19_pad_type_0, strides = logits_19_strides_0, weight = lm_heads_9_weight_to_fp16_palettized, x = input_cast_fp16)[name = string("logits_19_cast_fp16")];
+            tensor<int32, [1]> var_2216_axes_0 = const()[name = string("op_2216_axes_0"), val = tensor<int32, [1]>([3])];
+            tensor<fp16, [1, 2048, 1]> var_2216_cast_fp16 = squeeze(axes = var_2216_axes_0, x = logits_19_cast_fp16)[name = string("op_2216_cast_fp16")];
+            string logits_21_pad_type_0 = const()[name = string("logits_21_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> logits_21_strides_0 = const()[name = string("logits_21_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> logits_21_pad_0 = const()[name = string("logits_21_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> logits_21_dilations_0 = const()[name = string("logits_21_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 logits_21_groups_0 = const()[name = string("logits_21_groups_0"), val = int32(1)];
+            tensor<fp16, [2048, 1024, 1, 1]> lm_heads_10_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [2048, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(101784512))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(103881728))))[name = string("lm_heads_10_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1, 1]> logits_21_cast_fp16 = conv(dilations = logits_21_dilations_0, groups = logits_21_groups_0, pad = logits_21_pad_0, pad_type = logits_21_pad_type_0, strides = logits_21_strides_0, weight = lm_heads_10_weight_to_fp16_palettized, x = input_cast_fp16)[name = string("logits_21_cast_fp16")];
+            tensor<int32, [1]> var_2232_axes_0 = const()[name = string("op_2232_axes_0"), val = tensor<int32, [1]>([3])];
+            tensor<fp16, [1, 2048, 1]> var_2232_cast_fp16 = squeeze(axes = var_2232_axes_0, x = logits_21_cast_fp16)[name = string("op_2232_cast_fp16")];
+            string logits_23_pad_type_0 = const()[name = string("logits_23_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> logits_23_strides_0 = const()[name = string("logits_23_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> logits_23_pad_0 = const()[name = string("logits_23_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> logits_23_dilations_0 = const()[name = string("logits_23_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 logits_23_groups_0 = const()[name = string("logits_23_groups_0"), val = int32(1)];
+            tensor<fp16, [2048, 1024, 1, 1]> lm_heads_11_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [2048, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(103882304))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(105979520))))[name = string("lm_heads_11_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1, 1]> logits_23_cast_fp16 = conv(dilations = logits_23_dilations_0, groups = logits_23_groups_0, pad = logits_23_pad_0, pad_type = logits_23_pad_type_0, strides = logits_23_strides_0, weight = lm_heads_11_weight_to_fp16_palettized, x = input_cast_fp16)[name = string("logits_23_cast_fp16")];
+            tensor<int32, [1]> var_2248_axes_0 = const()[name = string("op_2248_axes_0"), val = tensor<int32, [1]>([3])];
+            tensor<fp16, [1, 2048, 1]> var_2248_cast_fp16 = squeeze(axes = var_2248_axes_0, x = logits_23_cast_fp16)[name = string("op_2248_cast_fp16")];
+            string logits_25_pad_type_0 = const()[name = string("logits_25_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> logits_25_strides_0 = const()[name = string("logits_25_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> logits_25_pad_0 = const()[name = string("logits_25_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> logits_25_dilations_0 = const()[name = string("logits_25_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 logits_25_groups_0 = const()[name = string("logits_25_groups_0"), val = int32(1)];
+            tensor<fp16, [2048, 1024, 1, 1]> lm_heads_12_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [2048, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(105980096))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(108077312))))[name = string("lm_heads_12_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1, 1]> logits_25_cast_fp16 = conv(dilations = logits_25_dilations_0, groups = logits_25_groups_0, pad = logits_25_pad_0, pad_type = logits_25_pad_type_0, strides = logits_25_strides_0, weight = lm_heads_12_weight_to_fp16_palettized, x = input_cast_fp16)[name = string("logits_25_cast_fp16")];
+            tensor<int32, [1]> var_2264_axes_0 = const()[name = string("op_2264_axes_0"), val = tensor<int32, [1]>([3])];
+            tensor<fp16, [1, 2048, 1]> var_2264_cast_fp16 = squeeze(axes = var_2264_axes_0, x = logits_25_cast_fp16)[name = string("op_2264_cast_fp16")];
+            string logits_27_pad_type_0 = const()[name = string("logits_27_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> logits_27_strides_0 = const()[name = string("logits_27_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> logits_27_pad_0 = const()[name = string("logits_27_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> logits_27_dilations_0 = const()[name = string("logits_27_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 logits_27_groups_0 = const()[name = string("logits_27_groups_0"), val = int32(1)];
+            tensor<fp16, [2048, 1024, 1, 1]> lm_heads_13_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [2048, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(108077888))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(110175104))))[name = string("lm_heads_13_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1, 1]> logits_27_cast_fp16 = conv(dilations = logits_27_dilations_0, groups = logits_27_groups_0, pad = logits_27_pad_0, pad_type = logits_27_pad_type_0, strides = logits_27_strides_0, weight = lm_heads_13_weight_to_fp16_palettized, x = input_cast_fp16)[name = string("logits_27_cast_fp16")];
+            tensor<int32, [1]> var_2280_axes_0 = const()[name = string("op_2280_axes_0"), val = tensor<int32, [1]>([3])];
+            tensor<fp16, [1, 2048, 1]> var_2280_cast_fp16 = squeeze(axes = var_2280_axes_0, x = logits_27_cast_fp16)[name = string("op_2280_cast_fp16")];
+            string logits_29_pad_type_0 = const()[name = string("logits_29_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> logits_29_strides_0 = const()[name = string("logits_29_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> logits_29_pad_0 = const()[name = string("logits_29_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> logits_29_dilations_0 = const()[name = string("logits_29_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 logits_29_groups_0 = const()[name = string("logits_29_groups_0"), val = int32(1)];
+            tensor<fp16, [2048, 1024, 1, 1]> lm_heads_14_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [2048, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(110175680))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(112272896))))[name = string("lm_heads_14_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1, 1]> logits_29_cast_fp16 = conv(dilations = logits_29_dilations_0, groups = logits_29_groups_0, pad = logits_29_pad_0, pad_type = logits_29_pad_type_0, strides = logits_29_strides_0, weight = lm_heads_14_weight_to_fp16_palettized, x = input_cast_fp16)[name = string("logits_29_cast_fp16")];
+            tensor<int32, [1]> var_2296_axes_0 = const()[name = string("op_2296_axes_0"), val = tensor<int32, [1]>([3])];
+            tensor<fp16, [1, 2048, 1]> var_2296_cast_fp16 = squeeze(axes = var_2296_axes_0, x = logits_29_cast_fp16)[name = string("op_2296_cast_fp16")];
+            bool var_2302_interleave_0 = const()[name = string("op_2302_interleave_0"), val = bool(false)];
+            int32 const_119 = const()[name = string("const_119"), val = int32(2)];
+            tensor<fp16, [1, 2048, 15]> var_2302_cast_fp16 = concat(axis = const_119, interleave = var_2302_interleave_0, values = (var_2072_cast_fp16, var_2088_cast_fp16, var_2104_cast_fp16, var_2120_cast_fp16, var_2136_cast_fp16, var_2152_cast_fp16, var_2168_cast_fp16, var_2184_cast_fp16, var_2200_cast_fp16, var_2216_cast_fp16, var_2232_cast_fp16, var_2248_cast_fp16, var_2264_cast_fp16, var_2280_cast_fp16, var_2296_cast_fp16))[name = string("op_2302_cast_fp16")];
+            int32 var_2304 = const()[name = string("op_2304"), val = int32(1)];
+            bool var_2305_interleave_0 = const()[name = string("op_2305_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 5120, 1, 1]> key_cache_updates = concat(axis = var_2304, interleave = var_2305_interleave_0, values = (current_key_3_cast_fp16, current_key_7_cast_fp16, current_key_11_cast_fp16, current_key_15_cast_fp16, current_key_cast_fp16))[name = string("op_2305_cast_fp16")];
+            int32 var_2307 = const()[name = string("op_2307"), val = int32(1)];
+            bool var_2308_interleave_0 = const()[name = string("op_2308_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 5120, 1, 1]> value_cache_updates = concat(axis = var_2307, interleave = var_2308_interleave_0, values = (current_value_1_cast_fp16, current_value_3_cast_fp16, current_value_5_cast_fp16, current_value_7_cast_fp16, current_value_cast_fp16))[name = string("op_2308_cast_fp16")];
+            tensor<int32, [3]> transpose_0_perm_0 = const()[name = string("transpose_0_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<fp16, [1, 15, 2048]> all_logits = transpose(perm = transpose_0_perm_0, x = var_2302_cast_fp16)[name = string("transpose_0")];
+        } -> (all_logits, key_cache_updates, value_cache_updates);
+}
\ No newline at end of file