diff --git "a/qwen3_tts/multi_code_decoder/12hz-0.6b-customvoice/W8A16/MultiCodeDecoder.mlmodelc/model.mil" "b/qwen3_tts/multi_code_decoder/12hz-0.6b-customvoice/W8A16/MultiCodeDecoder.mlmodelc/model.mil"
new file mode 100644
--- /dev/null
+++ "b/qwen3_tts/multi_code_decoder/12hz-0.6b-customvoice/W8A16/MultiCodeDecoder.mlmodelc/model.mil"
@@ -0,0 +1,1369 @@
+program(1.3)
+[buildInfo = dict<string, string>({{"coremlc-component-MIL", "3510.2.1"}, {"coremlc-version", "3500.32.1"}})]
+{
+    func main<ios18>(tensor<int32, [1]> cache_length, tensor<fp16, [1, 1024, 1, 1]> input_embeds, tensor<fp16, [1, 5120, 1, 16]> key_cache, tensor<fp16, [1, 16]> key_padding_mask, tensor<fp16, [1, 16]> kv_cache_update_mask, tensor<fp16, [1, 5120, 1, 16]> value_cache) {
+            int32 pos_cos_batch_dims_0 = const()[name = string("pos_cos_batch_dims_0"), val = int32(0)];
+            bool pos_cos_validate_indices_0 = const()[name = string("pos_cos_validate_indices_0"), val = bool(false)];
+            tensor<fp16, [16, 128]> position_embeddings_cos_weight_to_fp16 = const()[name = string("position_embeddings_cos_weight_to_fp16"), val = tensor<fp16, [16, 128]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(64)))];
+            string cache_length_to_int16_dtype_0 = const()[name = string("cache_length_to_int16_dtype_0"), val = string("int16")];
+            string cast_111_dtype_0 = const()[name = string("cast_111_dtype_0"), val = string("int32")];
+            int32 greater_equal_0_y_0 = const()[name = string("greater_equal_0_y_0"), val = int32(0)];
+            tensor<int16, [1]> cache_length_to_int16 = cast(dtype = cache_length_to_int16_dtype_0, x = cache_length)[name = string("cast_5")];
+            tensor<int32, [1]> cast_111 = cast(dtype = cast_111_dtype_0, x = cache_length_to_int16)[name = string("cast_4")];
+            tensor<bool, [1]> greater_equal_0 = greater_equal(x = cast_111, y = greater_equal_0_y_0)[name = string("greater_equal_0")];
+            int32 slice_by_index_0 = const()[name = string("slice_by_index_0"), val = int32(16)];
+            tensor<int32, [1]> add_0 = add(x = cast_111, y = slice_by_index_0)[name = string("add_0")];
+            tensor<int32, [1]> select_0 = select(a = cast_111, b = add_0, cond = greater_equal_0)[name = string("select_0")];
+            string select_0_to_int16_dtype_0 = const()[name = string("select_0_to_int16_dtype_0"), val = string("int16")];
+            string cast_0_dtype_0 = const()[name = string("cast_0_dtype_0"), val = string("int32")];
+            int32 greater_equal_0_y_0_1 = const()[name = string("greater_equal_0_y_0_1"), val = int32(0)];
+            tensor<int16, [1]> select_0_to_int16 = cast(dtype = select_0_to_int16_dtype_0, x = select_0)[name = string("cast_3")];
+            tensor<int32, [1]> cast_0 = cast(dtype = cast_0_dtype_0, x = select_0_to_int16)[name = string("cast_2")];
+            tensor<bool, [1]> greater_equal_0_1 = greater_equal(x = cast_0, y = greater_equal_0_y_0_1)[name = string("greater_equal_0_1")];
+            int32 slice_by_index_0_1 = const()[name = string("slice_by_index_0_1"), val = int32(16)];
+            tensor<int32, [1]> add_0_1 = add(x = cast_0, y = slice_by_index_0_1)[name = string("add_0_1")];
+            tensor<int32, [1]> select_0_1 = select(a = cast_0, b = add_0_1, cond = greater_equal_0_1)[name = string("select_0_1")];
+            int32 pos_cos_cast_fp16_cast_uint16_cast_uint16_axis_0 = const()[name = string("pos_cos_cast_fp16_cast_uint16_cast_uint16_axis_0"), val = int32(0)];
+            tensor<fp16, [1, 128]> pos_cos_cast_fp16_cast_uint16_cast_uint16 = gather(axis = pos_cos_cast_fp16_cast_uint16_cast_uint16_axis_0, batch_dims = pos_cos_batch_dims_0, indices = select_0_1, validate_indices = pos_cos_validate_indices_0, x = position_embeddings_cos_weight_to_fp16)[name = string("pos_cos_cast_fp16_cast_uint16_cast_uint16")];
+            tensor<int32, [1]> obj_7_axes_0 = const()[name = string("obj_7_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 128, 1]> obj_7_cast_fp16 = expand_dims(axes = obj_7_axes_0, x = pos_cos_cast_fp16_cast_uint16_cast_uint16)[name = string("obj_7_cast_fp16")];
+            int32 pos_sin_axis_0 = const()[name = string("pos_sin_axis_0"), val = int32(0)];
+            int32 pos_sin_batch_dims_0 = const()[name = string("pos_sin_batch_dims_0"), val = int32(0)];
+            bool pos_sin_validate_indices_0 = const()[name = string("pos_sin_validate_indices_0"), val = bool(false)];
+            tensor<fp16, [16, 128]> position_embeddings_sin_weight_to_fp16 = const()[name = string("position_embeddings_sin_weight_to_fp16"), val = tensor<fp16, [16, 128]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(4224)))];
+            string cache_length_to_uint16_dtype_0 = const()[name = string("cache_length_to_uint16_dtype_0"), val = string("uint16")];
+            tensor<uint16, [1]> cache_length_to_uint16 = cast(dtype = cache_length_to_uint16_dtype_0, x = cache_length)[name = string("cast_1")];
+            tensor<fp16, [1, 128]> pos_sin_cast_fp16_cast_uint16 = gather(axis = pos_sin_axis_0, batch_dims = pos_sin_batch_dims_0, indices = cache_length_to_uint16, validate_indices = pos_sin_validate_indices_0, x = position_embeddings_sin_weight_to_fp16)[name = string("pos_sin_cast_fp16_cast_uint16")];
+            tensor<int32, [1]> obj_9_axes_0 = const()[name = string("obj_9_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 128, 1]> obj_9_cast_fp16 = expand_dims(axes = obj_9_axes_0, x = pos_sin_cast_fp16_cast_uint16)[name = string("obj_9_cast_fp16")];
+            tensor<int32, [5]> tile_0 = const()[name = string("tile_0"), val = tensor<int32, [5]>([1024, 1024, 1024, 1024, 1024])];
+            int32 var_84_axis_0 = const()[name = string("op_84_axis_0"), val = int32(1)];
+            tensor<fp16, [1, 1024, 1, 16]> var_84_cast_fp16_0, tensor<fp16, [1, 1024, 1, 16]> var_84_cast_fp16_1, tensor<fp16, [1, 1024, 1, 16]> var_84_cast_fp16_2, tensor<fp16, [1, 1024, 1, 16]> var_84_cast_fp16_3, tensor<fp16, [1, 1024, 1, 16]> var_84_cast_fp16_4 = split(axis = var_84_axis_0, split_sizes = tile_0, x = key_cache)[name = string("op_84_cast_fp16")];
+            tensor<int32, [5]> tile_1 = const()[name = string("tile_1"), val = tensor<int32, [5]>([1024, 1024, 1024, 1024, 1024])];
+            int32 var_92_axis_0 = const()[name = string("op_92_axis_0"), val = int32(1)];
+            tensor<fp16, [1, 1024, 1, 16]> var_92_cast_fp16_0, tensor<fp16, [1, 1024, 1, 16]> var_92_cast_fp16_1, tensor<fp16, [1, 1024, 1, 16]> var_92_cast_fp16_2, tensor<fp16, [1, 1024, 1, 16]> var_92_cast_fp16_3, tensor<fp16, [1, 1024, 1, 16]> var_92_cast_fp16_4 = split(axis = var_92_axis_0, split_sizes = tile_1, x = value_cache)[name = string("op_92_cast_fp16")];
+            int32 var_99 = const()[name = string("op_99"), val = int32(3)];
+            int32 var_109 = const()[name = string("op_109"), val = int32(-2)];
+            int32 var_117 = const()[name = string("op_117"), val = int32(1)];
+            tensor<fp16, [1, 1024, 1, 1]> inputs_sq_1_cast_fp16 = mul(x = input_embeds, y = input_embeds)[name = string("inputs_sq_1_cast_fp16")];
+            tensor<int32, [1]> variance_1_axes_0 = const()[name = string("variance_1_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_1_keep_dims_0 = const()[name = string("variance_1_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 1, 1, 1]> variance_1_cast_fp16 = reduce_mean(axes = variance_1_axes_0, keep_dims = variance_1_keep_dims_0, x = inputs_sq_1_cast_fp16)[name = string("variance_1_cast_fp16")];
+            fp16 var_129_to_fp16 = const()[name = string("op_129_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 1, 1]> var_130_cast_fp16 = add(x = variance_1_cast_fp16, y = var_129_to_fp16)[name = string("op_130_cast_fp16")];
+            fp32 var_131_epsilon_0 = const()[name = string("op_131_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 1, 1, 1]> var_131_cast_fp16 = rsqrt(epsilon = var_131_epsilon_0, x = var_130_cast_fp16)[name = string("op_131_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> hidden_states_1_cast_fp16 = mul(x = input_embeds, y = var_131_cast_fp16)[name = string("hidden_states_1_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> w_1_to_fp16 = const()[name = string("w_1_to_fp16"), val = tensor<fp16, [1, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(8384)))];
+            tensor<fp16, [1, 1024, 1, 1]> obj_1_cast_fp16 = mul(x = w_1_to_fp16, y = hidden_states_1_cast_fp16)[name = string("obj_1_cast_fp16")];
+            string query_1_pad_type_0 = const()[name = string("query_1_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> query_1_strides_0 = const()[name = string("query_1_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> query_1_pad_0 = const()[name = string("query_1_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> query_1_dilations_0 = const()[name = string("query_1_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 query_1_groups_0 = const()[name = string("query_1_groups_0"), val = int32(1)];
+            tensor<fp16, [2048, 1024, 1, 1]> layers_0_self_attn_q_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [2048, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(10496))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(2107712))))[name = string("layers_0_self_attn_q_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [2048]> layers_0_self_attn_q_proj_bias_to_fp16 = const()[name = string("layers_0_self_attn_q_proj_bias_to_fp16"), val = tensor<fp16, [2048]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(2108288)))];
+            tensor<fp16, [1, 2048, 1, 1]> query_1_cast_fp16 = conv(bias = layers_0_self_attn_q_proj_bias_to_fp16, dilations = query_1_dilations_0, groups = query_1_groups_0, pad = query_1_pad_0, pad_type = query_1_pad_type_0, strides = query_1_strides_0, weight = layers_0_self_attn_q_proj_weight_to_fp16_palettized, x = obj_1_cast_fp16)[name = string("query_1_cast_fp16")];
+            string current_key_1_pad_type_0 = const()[name = string("current_key_1_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> current_key_1_strides_0 = const()[name = string("current_key_1_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> current_key_1_pad_0 = const()[name = string("current_key_1_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> current_key_1_dilations_0 = const()[name = string("current_key_1_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 current_key_1_groups_0 = const()[name = string("current_key_1_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_0_self_attn_k_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(2112448))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(3161088))))[name = string("layers_0_self_attn_k_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> current_key_1_cast_fp16 = conv(dilations = current_key_1_dilations_0, groups = current_key_1_groups_0, pad = current_key_1_pad_0, pad_type = current_key_1_pad_type_0, strides = current_key_1_strides_0, weight = layers_0_self_attn_k_proj_weight_to_fp16_palettized, x = obj_1_cast_fp16)[name = string("current_key_1_cast_fp16")];
+            string current_value_1_pad_type_0 = const()[name = string("current_value_1_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> current_value_1_strides_0 = const()[name = string("current_value_1_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> current_value_1_pad_0 = const()[name = string("current_value_1_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> current_value_1_dilations_0 = const()[name = string("current_value_1_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 current_value_1_groups_0 = const()[name = string("current_value_1_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_0_self_attn_v_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(3161664))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(4210304))))[name = string("layers_0_self_attn_v_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1024]> layers_0_self_attn_v_proj_bias_to_fp16 = const()[name = string("layers_0_self_attn_v_proj_bias_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(4210880)))];
+            tensor<fp16, [1, 1024, 1, 1]> current_value_1_cast_fp16 = conv(bias = layers_0_self_attn_v_proj_bias_to_fp16, dilations = current_value_1_dilations_0, groups = current_value_1_groups_0, pad = current_value_1_pad_0, pad_type = current_value_1_pad_type_0, strides = current_value_1_strides_0, weight = layers_0_self_attn_v_proj_weight_to_fp16_palettized, x = obj_1_cast_fp16)[name = string("current_value_1_cast_fp16")];
+            tensor<int32, [4]> var_168 = const()[name = string("op_168"), val = tensor<int32, [4]>([16, 128, 1, 1])];
+            tensor<fp16, [16, 128, 1, 1]> inputs_1_cast_fp16 = reshape(shape = var_168, x = query_1_cast_fp16)[name = string("inputs_1_cast_fp16")];
+            tensor<fp16, [16, 128, 1, 1]> inputs_sq_3_cast_fp16 = mul(x = inputs_1_cast_fp16, y = inputs_1_cast_fp16)[name = string("inputs_sq_3_cast_fp16")];
+            tensor<int32, [1]> variance_3_axes_0 = const()[name = string("variance_3_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_3_keep_dims_0 = const()[name = string("variance_3_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [16, 1, 1, 1]> variance_3_cast_fp16 = reduce_mean(axes = variance_3_axes_0, keep_dims = variance_3_keep_dims_0, x = inputs_sq_3_cast_fp16)[name = string("variance_3_cast_fp16")];
+            fp16 var_174_to_fp16 = const()[name = string("op_174_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [16, 1, 1, 1]> var_175_cast_fp16 = add(x = variance_3_cast_fp16, y = var_174_to_fp16)[name = string("op_175_cast_fp16")];
+            fp32 var_176_epsilon_0 = const()[name = string("op_176_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [16, 1, 1, 1]> var_176_cast_fp16 = rsqrt(epsilon = var_176_epsilon_0, x = var_175_cast_fp16)[name = string("op_176_cast_fp16")];
+            tensor<fp16, [16, 128, 1, 1]> hidden_states_3_cast_fp16 = mul(x = inputs_1_cast_fp16, y = var_176_cast_fp16)[name = string("hidden_states_3_cast_fp16")];
+            tensor<fp16, [1, 128, 1, 1]> w_3_to_fp16 = const()[name = string("w_3_to_fp16"), val = tensor<fp16, [1, 128, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(4212992)))];
+            tensor<fp16, [16, 128, 1, 1]> query_normed_1_cast_fp16 = mul(x = w_3_to_fp16, y = hidden_states_3_cast_fp16)[name = string("query_normed_1_cast_fp16")];
+            tensor<int32, [4]> var_184 = const()[name = string("op_184"), val = tensor<int32, [4]>([8, 128, 1, 1])];
+            tensor<fp16, [8, 128, 1, 1]> inputs_3_cast_fp16 = reshape(shape = var_184, x = current_key_1_cast_fp16)[name = string("inputs_3_cast_fp16")];
+            tensor<fp16, [8, 128, 1, 1]> inputs_sq_5_cast_fp16 = mul(x = inputs_3_cast_fp16, y = inputs_3_cast_fp16)[name = string("inputs_sq_5_cast_fp16")];
+            tensor<int32, [1]> variance_5_axes_0 = const()[name = string("variance_5_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_5_keep_dims_0 = const()[name = string("variance_5_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [8, 1, 1, 1]> variance_5_cast_fp16 = reduce_mean(axes = variance_5_axes_0, keep_dims = variance_5_keep_dims_0, x = inputs_sq_5_cast_fp16)[name = string("variance_5_cast_fp16")];
+            fp16 var_190_to_fp16 = const()[name = string("op_190_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [8, 1, 1, 1]> var_191_cast_fp16 = add(x = variance_5_cast_fp16, y = var_190_to_fp16)[name = string("op_191_cast_fp16")];
+            fp32 var_192_epsilon_0 = const()[name = string("op_192_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [8, 1, 1, 1]> var_192_cast_fp16 = rsqrt(epsilon = var_192_epsilon_0, x = var_191_cast_fp16)[name = string("op_192_cast_fp16")];
+            tensor<fp16, [8, 128, 1, 1]> hidden_states_5_cast_fp16 = mul(x = inputs_3_cast_fp16, y = var_192_cast_fp16)[name = string("hidden_states_5_cast_fp16")];
+            tensor<fp16, [1, 128, 1, 1]> w_5_to_fp16 = const()[name = string("w_5_to_fp16"), val = tensor<fp16, [1, 128, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(4213312)))];
+            tensor<fp16, [8, 128, 1, 1]> current_key_normed_1_cast_fp16 = mul(x = w_5_to_fp16, y = hidden_states_5_cast_fp16)[name = string("current_key_normed_1_cast_fp16")];
+            tensor<int32, [4]> var_210 = const()[name = string("op_210"), val = tensor<int32, [4]>([1, 16, 128, -1])];
+            tensor<fp16, [1, 16, 128, 1]> mh_q_1_cast_fp16 = reshape(shape = var_210, x = query_normed_1_cast_fp16)[name = string("mh_q_1_cast_fp16")];
+            tensor<int32, [4]> var_212 = const()[name = string("op_212"), val = tensor<int32, [4]>([1, 8, 128, -1])];
+            tensor<fp16, [1, 8, 128, 1]> mh_k_1_cast_fp16 = reshape(shape = var_212, x = current_key_normed_1_cast_fp16)[name = string("mh_k_1_cast_fp16")];
+            tensor<int32, [1]> cos_1_axes_0 = const()[name = string("cos_1_axes_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [1, 1, 128, 1]> cos_1_cast_fp16 = expand_dims(axes = cos_1_axes_0, x = obj_7_cast_fp16)[name = string("cos_1_cast_fp16")];
+            tensor<int32, [1]> sin_1_axes_0 = const()[name = string("sin_1_axes_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [1, 1, 128, 1]> sin_1_cast_fp16 = expand_dims(axes = sin_1_axes_0, x = obj_9_cast_fp16)[name = string("sin_1_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> var_216_cast_fp16 = mul(x = mh_q_1_cast_fp16, y = cos_1_cast_fp16)[name = string("op_216_cast_fp16")];
+            tensor<int32, [4]> var_221_begin_0 = const()[name = string("op_221_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_221_end_0 = const()[name = string("op_221_end_0"), val = tensor<int32, [4]>([1, 16, 64, 1])];
+            tensor<bool, [4]> var_221_end_mask_0 = const()[name = string("op_221_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 16, 64, 1]> var_221_cast_fp16 = slice_by_index(begin = var_221_begin_0, end = var_221_end_0, end_mask = var_221_end_mask_0, x = mh_q_1_cast_fp16)[name = string("op_221_cast_fp16")];
+            tensor<int32, [4]> var_227_begin_0 = const()[name = string("op_227_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_227_end_0 = const()[name = string("op_227_end_0"), val = tensor<int32, [4]>([1, 16, 128, 1])];
+            tensor<bool, [4]> var_227_end_mask_0 = const()[name = string("op_227_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 16, 64, 1]> var_227_cast_fp16 = slice_by_index(begin = var_227_begin_0, end = var_227_end_0, end_mask = var_227_end_mask_0, x = mh_q_1_cast_fp16)[name = string("op_227_cast_fp16")];
+            fp16 const_17_promoted_to_fp16 = const()[name = string("const_17_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 16, 64, 1]> var_229_cast_fp16 = mul(x = var_227_cast_fp16, y = const_17_promoted_to_fp16)[name = string("op_229_cast_fp16")];
+            bool var_231_interleave_0 = const()[name = string("op_231_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 1]> var_231_cast_fp16 = concat(axis = var_109, interleave = var_231_interleave_0, values = (var_229_cast_fp16, var_221_cast_fp16))[name = string("op_231_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> var_232_cast_fp16 = mul(x = var_231_cast_fp16, y = sin_1_cast_fp16)[name = string("op_232_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> mh_q_3_cast_fp16 = add(x = var_216_cast_fp16, y = var_232_cast_fp16)[name = string("mh_q_3_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> var_234_cast_fp16 = mul(x = mh_k_1_cast_fp16, y = cos_1_cast_fp16)[name = string("op_234_cast_fp16")];
+            tensor<int32, [4]> var_239_begin_0 = const()[name = string("op_239_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_239_end_0 = const()[name = string("op_239_end_0"), val = tensor<int32, [4]>([1, 8, 64, 1])];
+            tensor<bool, [4]> var_239_end_mask_0 = const()[name = string("op_239_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 8, 64, 1]> var_239_cast_fp16 = slice_by_index(begin = var_239_begin_0, end = var_239_end_0, end_mask = var_239_end_mask_0, x = mh_k_1_cast_fp16)[name = string("op_239_cast_fp16")];
+            tensor<int32, [4]> var_245_begin_0 = const()[name = string("op_245_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_245_end_0 = const()[name = string("op_245_end_0"), val = tensor<int32, [4]>([1, 8, 128, 1])];
+            tensor<bool, [4]> var_245_end_mask_0 = const()[name = string("op_245_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 8, 64, 1]> var_245_cast_fp16 = slice_by_index(begin = var_245_begin_0, end = var_245_end_0, end_mask = var_245_end_mask_0, x = mh_k_1_cast_fp16)[name = string("op_245_cast_fp16")];
+            fp16 const_20_promoted_to_fp16 = const()[name = string("const_20_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 64, 1]> var_247_cast_fp16 = mul(x = var_245_cast_fp16, y = const_20_promoted_to_fp16)[name = string("op_247_cast_fp16")];
+            bool var_249_interleave_0 = const()[name = string("op_249_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 128, 1]> var_249_cast_fp16 = concat(axis = var_109, interleave = var_249_interleave_0, values = (var_247_cast_fp16, var_239_cast_fp16))[name = string("op_249_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> var_250_cast_fp16 = mul(x = var_249_cast_fp16, y = sin_1_cast_fp16)[name = string("op_250_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> mh_k_3_cast_fp16 = add(x = var_234_cast_fp16, y = var_250_cast_fp16)[name = string("mh_k_3_cast_fp16")];
+            tensor<int32, [4]> var_254 = const()[name = string("op_254"), val = tensor<int32, [4]>([1, 1024, 1, 1])];
+            tensor<fp16, [1, 1024, 1, 1]> current_key_3_cast_fp16 = reshape(shape = var_254, x = mh_k_3_cast_fp16)[name = string("current_key_3_cast_fp16")];
+            tensor<int32, [1]> var_257_axes_0 = const()[name = string("op_257_axes_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [1, 1, 16]> var_257_cast_fp16 = expand_dims(axes = var_257_axes_0, x = kv_cache_update_mask)[name = string("op_257_cast_fp16")];
+            tensor<int32, [1]> var_258_axes_0 = const()[name = string("op_258_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 1, 1, 16]> var_258_cast_fp16 = expand_dims(axes = var_258_axes_0, x = var_257_cast_fp16)[name = string("op_258_cast_fp16")];
+            fp16 var_110_to_fp16 = const()[name = string("op_110_to_fp16"), val = fp16(0x1p+0)];
+            tensor<fp16, [1, 1, 1, 16]> var_260_cast_fp16 = sub(x = var_110_to_fp16, y = var_258_cast_fp16)[name = string("op_260_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 16]> var_261_cast_fp16 = mul(x = var_84_cast_fp16_0, y = var_260_cast_fp16)[name = string("op_261_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 16]> var_262_cast_fp16 = mul(x = current_key_3_cast_fp16, y = var_258_cast_fp16)[name = string("op_262_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 16]> key_3_cast_fp16 = add(x = var_261_cast_fp16, y = var_262_cast_fp16)[name = string("key_3_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 16]> var_265_cast_fp16 = mul(x = var_92_cast_fp16_0, y = var_260_cast_fp16)[name = string("op_265_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 16]> var_266_cast_fp16 = mul(x = current_value_1_cast_fp16, y = var_258_cast_fp16)[name = string("op_266_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 16]> value_1_cast_fp16 = add(x = var_265_cast_fp16, y = var_266_cast_fp16)[name = string("value_1_cast_fp16")];
+            tensor<int32, [4]> var_270 = const()[name = string("op_270"), val = tensor<int32, [4]>([1, 8, 128, 16])];
+            tensor<fp16, [1, 8, 128, 16]> key_heads_1_cast_fp16 = reshape(shape = var_270, x = key_3_cast_fp16)[name = string("key_heads_1_cast_fp16")];
+            tensor<int32, [4]> var_272 = const()[name = string("op_272"), val = tensor<int32, [4]>([1, 8, 128, 16])];
+            tensor<fp16, [1, 8, 128, 16]> value_heads_1_cast_fp16 = reshape(shape = var_272, x = value_1_cast_fp16)[name = string("value_heads_1_cast_fp16")];
+            tensor<int32, [4]> var_275_begin_0 = const()[name = string("op_275_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_275_end_0 = const()[name = string("op_275_end_0"), val = tensor<int32, [4]>([1, 1, 128, 16])];
+            tensor<bool, [4]> var_275_end_mask_0 = const()[name = string("op_275_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_275_cast_fp16 = slice_by_index(begin = var_275_begin_0, end = var_275_end_0, end_mask = var_275_end_mask_0, x = key_heads_1_cast_fp16)[name = string("op_275_cast_fp16")];
+            tensor<int32, [4]> var_279_begin_0 = const()[name = string("op_279_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_279_end_0 = const()[name = string("op_279_end_0"), val = tensor<int32, [4]>([1, 1, 128, 16])];
+            tensor<bool, [4]> var_279_end_mask_0 = const()[name = string("op_279_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_279_cast_fp16 = slice_by_index(begin = var_279_begin_0, end = var_279_end_0, end_mask = var_279_end_mask_0, x = value_heads_1_cast_fp16)[name = string("op_279_cast_fp16")];
+            tensor<int32, [4]> var_291_begin_0 = const()[name = string("op_291_begin_0"), val = tensor<int32, [4]>([0, 1, 0, 0])];
+            tensor<int32, [4]> var_291_end_0 = const()[name = string("op_291_end_0"), val = tensor<int32, [4]>([1, 2, 128, 16])];
+            tensor<bool, [4]> var_291_end_mask_0 = const()[name = string("op_291_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_291_cast_fp16 = slice_by_index(begin = var_291_begin_0, end = var_291_end_0, end_mask = var_291_end_mask_0, x = key_heads_1_cast_fp16)[name = string("op_291_cast_fp16")];
+            tensor<int32, [4]> var_295_begin_0 = const()[name = string("op_295_begin_0"), val = tensor<int32, [4]>([0, 1, 0, 0])];
+            tensor<int32, [4]> var_295_end_0 = const()[name = string("op_295_end_0"), val = tensor<int32, [4]>([1, 2, 128, 16])];
+            tensor<bool, [4]> var_295_end_mask_0 = const()[name = string("op_295_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_295_cast_fp16 = slice_by_index(begin = var_295_begin_0, end = var_295_end_0, end_mask = var_295_end_mask_0, x = value_heads_1_cast_fp16)[name = string("op_295_cast_fp16")];
+            tensor<int32, [4]> var_307_begin_0 = const()[name = string("op_307_begin_0"), val = tensor<int32, [4]>([0, 2, 0, 0])];
+            tensor<int32, [4]> var_307_end_0 = const()[name = string("op_307_end_0"), val = tensor<int32, [4]>([1, 3, 128, 16])];
+            tensor<bool, [4]> var_307_end_mask_0 = const()[name = string("op_307_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_307_cast_fp16 = slice_by_index(begin = var_307_begin_0, end = var_307_end_0, end_mask = var_307_end_mask_0, x = key_heads_1_cast_fp16)[name = string("op_307_cast_fp16")];
+            tensor<int32, [4]> var_311_begin_0 = const()[name = string("op_311_begin_0"), val = tensor<int32, [4]>([0, 2, 0, 0])];
+            tensor<int32, [4]> var_311_end_0 = const()[name = string("op_311_end_0"), val = tensor<int32, [4]>([1, 3, 128, 16])];
+            tensor<bool, [4]> var_311_end_mask_0 = const()[name = string("op_311_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_311_cast_fp16 = slice_by_index(begin = var_311_begin_0, end = var_311_end_0, end_mask = var_311_end_mask_0, x = value_heads_1_cast_fp16)[name = string("op_311_cast_fp16")];
+            tensor<int32, [4]> var_323_begin_0 = const()[name = string("op_323_begin_0"), val = tensor<int32, [4]>([0, 3, 0, 0])];
+            tensor<int32, [4]> var_323_end_0 = const()[name = string("op_323_end_0"), val = tensor<int32, [4]>([1, 4, 128, 16])];
+            tensor<bool, [4]> var_323_end_mask_0 = const()[name = string("op_323_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_323_cast_fp16 = slice_by_index(begin = var_323_begin_0, end = var_323_end_0, end_mask = var_323_end_mask_0, x = key_heads_1_cast_fp16)[name = string("op_323_cast_fp16")];
+            tensor<int32, [4]> var_327_begin_0 = const()[name = string("op_327_begin_0"), val = tensor<int32, [4]>([0, 3, 0, 0])];
+            tensor<int32, [4]> var_327_end_0 = const()[name = string("op_327_end_0"), val = tensor<int32, [4]>([1, 4, 128, 16])];
+            tensor<bool, [4]> var_327_end_mask_0 = const()[name = string("op_327_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_327_cast_fp16 = slice_by_index(begin = var_327_begin_0, end = var_327_end_0, end_mask = var_327_end_mask_0, x = value_heads_1_cast_fp16)[name = string("op_327_cast_fp16")];
+            tensor<int32, [4]> var_339_begin_0 = const()[name = string("op_339_begin_0"), val = tensor<int32, [4]>([0, 4, 0, 0])];
+            tensor<int32, [4]> var_339_end_0 = const()[name = string("op_339_end_0"), val = tensor<int32, [4]>([1, 5, 128, 16])];
+            tensor<bool, [4]> var_339_end_mask_0 = const()[name = string("op_339_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_339_cast_fp16 = slice_by_index(begin = var_339_begin_0, end = var_339_end_0, end_mask = var_339_end_mask_0, x = key_heads_1_cast_fp16)[name = string("op_339_cast_fp16")];
+            tensor<int32, [4]> var_343_begin_0 = const()[name = string("op_343_begin_0"), val = tensor<int32, [4]>([0, 4, 0, 0])];
+            tensor<int32, [4]> var_343_end_0 = const()[name = string("op_343_end_0"), val = tensor<int32, [4]>([1, 5, 128, 16])];
+            tensor<bool, [4]> var_343_end_mask_0 = const()[name = string("op_343_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_343_cast_fp16 = slice_by_index(begin = var_343_begin_0, end = var_343_end_0, end_mask = var_343_end_mask_0, x = value_heads_1_cast_fp16)[name = string("op_343_cast_fp16")];
+            tensor<int32, [4]> var_355_begin_0 = const()[name = string("op_355_begin_0"), val = tensor<int32, [4]>([0, 5, 0, 0])];
+            tensor<int32, [4]> var_355_end_0 = const()[name = string("op_355_end_0"), val = tensor<int32, [4]>([1, 6, 128, 16])];
+            tensor<bool, [4]> var_355_end_mask_0 = const()[name = string("op_355_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_355_cast_fp16 = slice_by_index(begin = var_355_begin_0, end = var_355_end_0, end_mask = var_355_end_mask_0, x = key_heads_1_cast_fp16)[name = string("op_355_cast_fp16")];
+            tensor<int32, [4]> var_359_begin_0 = const()[name = string("op_359_begin_0"), val = tensor<int32, [4]>([0, 5, 0, 0])];
+            tensor<int32, [4]> var_359_end_0 = const()[name = string("op_359_end_0"), val = tensor<int32, [4]>([1, 6, 128, 16])];
+            tensor<bool, [4]> var_359_end_mask_0 = const()[name = string("op_359_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_359_cast_fp16 = slice_by_index(begin = var_359_begin_0, end = var_359_end_0, end_mask = var_359_end_mask_0, x = value_heads_1_cast_fp16)[name = string("op_359_cast_fp16")];
+            tensor<int32, [4]> var_371_begin_0 = const()[name = string("op_371_begin_0"), val = tensor<int32, [4]>([0, 6, 0, 0])];
+            tensor<int32, [4]> var_371_end_0 = const()[name = string("op_371_end_0"), val = tensor<int32, [4]>([1, 7, 128, 16])];
+            tensor<bool, [4]> var_371_end_mask_0 = const()[name = string("op_371_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_371_cast_fp16 = slice_by_index(begin = var_371_begin_0, end = var_371_end_0, end_mask = var_371_end_mask_0, x = key_heads_1_cast_fp16)[name = string("op_371_cast_fp16")];
+            tensor<int32, [4]> var_375_begin_0 = const()[name = string("op_375_begin_0"), val = tensor<int32, [4]>([0, 6, 0, 0])];
+            tensor<int32, [4]> var_375_end_0 = const()[name = string("op_375_end_0"), val = tensor<int32, [4]>([1, 7, 128, 16])];
+            tensor<bool, [4]> var_375_end_mask_0 = const()[name = string("op_375_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_375_cast_fp16 = slice_by_index(begin = var_375_begin_0, end = var_375_end_0, end_mask = var_375_end_mask_0, x = value_heads_1_cast_fp16)[name = string("op_375_cast_fp16")];
+            tensor<int32, [4]> var_387_begin_0 = const()[name = string("op_387_begin_0"), val = tensor<int32, [4]>([0, 7, 0, 0])];
+            tensor<int32, [4]> var_387_end_0 = const()[name = string("op_387_end_0"), val = tensor<int32, [4]>([1, 1, 128, 16])];
+            tensor<bool, [4]> var_387_end_mask_0 = const()[name = string("op_387_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_387_cast_fp16 = slice_by_index(begin = var_387_begin_0, end = var_387_end_0, end_mask = var_387_end_mask_0, x = key_heads_1_cast_fp16)[name = string("op_387_cast_fp16")];
+            tensor<int32, [4]> var_391_begin_0 = const()[name = string("op_391_begin_0"), val = tensor<int32, [4]>([0, 7, 0, 0])];
+            tensor<int32, [4]> var_391_end_0 = const()[name = string("op_391_end_0"), val = tensor<int32, [4]>([1, 1, 128, 16])];
+            tensor<bool, [4]> var_391_end_mask_0 = const()[name = string("op_391_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_391_cast_fp16 = slice_by_index(begin = var_391_begin_0, end = var_391_end_0, end_mask = var_391_end_mask_0, x = value_heads_1_cast_fp16)[name = string("op_391_cast_fp16")];
+            bool key_heads_3_interleave_0 = const()[name = string("key_heads_3_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 16]> key_heads_3_cast_fp16 = concat(axis = var_117, interleave = key_heads_3_interleave_0, values = (var_275_cast_fp16, var_275_cast_fp16, var_291_cast_fp16, var_291_cast_fp16, var_307_cast_fp16, var_307_cast_fp16, var_323_cast_fp16, var_323_cast_fp16, var_339_cast_fp16, var_339_cast_fp16, var_355_cast_fp16, var_355_cast_fp16, var_371_cast_fp16, var_371_cast_fp16, var_387_cast_fp16, var_387_cast_fp16))[name = string("key_heads_3_cast_fp16")];
+            bool value_heads_3_interleave_0 = const()[name = string("value_heads_3_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 16]> value_heads_3_cast_fp16 = concat(axis = var_117, interleave = value_heads_3_interleave_0, values = (var_279_cast_fp16, var_279_cast_fp16, var_295_cast_fp16, var_295_cast_fp16, var_311_cast_fp16, var_311_cast_fp16, var_327_cast_fp16, var_327_cast_fp16, var_343_cast_fp16, var_343_cast_fp16, var_359_cast_fp16, var_359_cast_fp16, var_375_cast_fp16, var_375_cast_fp16, var_391_cast_fp16, var_391_cast_fp16))[name = string("value_heads_3_cast_fp16")];
+            fp16 var_414_to_fp16 = const()[name = string("op_414_to_fp16"), val = fp16(0x1.6ap-4)];
+            tensor<fp16, [1, 16, 128, 1]> var_415_cast_fp16 = mul(x = mh_q_3_cast_fp16, y = var_414_to_fp16)[name = string("op_415_cast_fp16")];
+            bool mh_w_1_transpose_x_0 = const()[name = string("mh_w_1_transpose_x_0"), val = bool(true)];
+            bool mh_w_1_transpose_y_0 = const()[name = string("mh_w_1_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 1, 16]> mh_w_1_cast_fp16 = matmul(transpose_x = mh_w_1_transpose_x_0, transpose_y = mh_w_1_transpose_y_0, x = var_415_cast_fp16, y = key_heads_3_cast_fp16)[name = string("mh_w_1_cast_fp16")];
+            tensor<int32, [1]> var_423_axes_0 = const()[name = string("op_423_axes_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [1, 1, 16]> var_423_cast_fp16 = expand_dims(axes = var_423_axes_0, x = key_padding_mask)[name = string("op_423_cast_fp16")];
+            tensor<int32, [1]> var_424_axes_0 = const()[name = string("op_424_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 1, 1, 16]> var_424_cast_fp16 = expand_dims(axes = var_424_axes_0, x = var_423_cast_fp16)[name = string("op_424_cast_fp16")];
+            tensor<fp16, [1, 16, 1, 16]> mh_w_3_cast_fp16 = add(x = mh_w_1_cast_fp16, y = var_424_cast_fp16)[name = string("mh_w_3_cast_fp16")];
+            tensor<fp16, [1, 16, 1, 16]> var_427_cast_fp16 = softmax(axis = var_99, x = mh_w_3_cast_fp16)[name = string("op_427_cast_fp16")];
+            bool attn_1_transpose_x_0 = const()[name = string("attn_1_transpose_x_0"), val = bool(false)];
+            bool attn_1_transpose_y_0 = const()[name = string("attn_1_transpose_y_0"), val = bool(true)];
+            tensor<fp16, [1, 16, 128, 1]> attn_1_cast_fp16 = matmul(transpose_x = attn_1_transpose_x_0, transpose_y = attn_1_transpose_y_0, x = value_heads_3_cast_fp16, y = var_427_cast_fp16)[name = string("attn_1_cast_fp16")];
+            tensor<int32, [4]> var_432 = const()[name = string("op_432"), val = tensor<int32, [4]>([1, -1, 1, 1])];
+            tensor<fp16, [1, 2048, 1, 1]> input_1_cast_fp16 = reshape(shape = var_432, x = attn_1_cast_fp16)[name = string("input_1_cast_fp16")];
+            string obj_11_pad_type_0 = const()[name = string("obj_11_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> obj_11_strides_0 = const()[name = string("obj_11_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> obj_11_pad_0 = const()[name = string("obj_11_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> obj_11_dilations_0 = const()[name = string("obj_11_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 obj_11_groups_0 = const()[name = string("obj_11_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 2048, 1, 1]> layers_0_self_attn_o_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(4213632))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(6310848))))[name = string("layers_0_self_attn_o_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> obj_11_cast_fp16 = conv(bias = layers_0_self_attn_v_proj_bias_to_fp16, dilations = obj_11_dilations_0, groups = obj_11_groups_0, pad = obj_11_pad_0, pad_type = obj_11_pad_type_0, strides = obj_11_strides_0, weight = layers_0_self_attn_o_proj_weight_to_fp16_palettized, x = input_1_cast_fp16)[name = string("obj_11_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> inputs_5_cast_fp16 = add(x = input_embeds, y = obj_11_cast_fp16)[name = string("inputs_5_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> inputs_sq_7_cast_fp16 = mul(x = inputs_5_cast_fp16, y = inputs_5_cast_fp16)[name = string("inputs_sq_7_cast_fp16")];
+            tensor<int32, [1]> variance_7_axes_0 = const()[name = string("variance_7_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_7_keep_dims_0 = const()[name = string("variance_7_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 1, 1, 1]> variance_7_cast_fp16 = reduce_mean(axes = variance_7_axes_0, keep_dims = variance_7_keep_dims_0, x = inputs_sq_7_cast_fp16)[name = string("variance_7_cast_fp16")];
+            fp16 var_450_to_fp16 = const()[name = string("op_450_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 1, 1]> var_451_cast_fp16 = add(x = variance_7_cast_fp16, y = var_450_to_fp16)[name = string("op_451_cast_fp16")];
+            fp32 var_452_epsilon_0 = const()[name = string("op_452_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 1, 1, 1]> var_452_cast_fp16 = rsqrt(epsilon = var_452_epsilon_0, x = var_451_cast_fp16)[name = string("op_452_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> hidden_states_7_cast_fp16 = mul(x = inputs_5_cast_fp16, y = var_452_cast_fp16)[name = string("hidden_states_7_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> w_7_to_fp16 = const()[name = string("w_7_to_fp16"), val = tensor<fp16, [1, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(6311424)))];
+            tensor<fp16, [1, 1024, 1, 1]> input_3_cast_fp16 = mul(x = w_7_to_fp16, y = hidden_states_7_cast_fp16)[name = string("input_3_cast_fp16")];
+            string input_5_pad_type_0 = const()[name = string("input_5_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> input_5_strides_0 = const()[name = string("input_5_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> input_5_pad_0 = const()[name = string("input_5_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> input_5_dilations_0 = const()[name = string("input_5_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 input_5_groups_0 = const()[name = string("input_5_groups_0"), val = int32(1)];
+            tensor<fp16, [3072, 1024, 1, 1]> layers_0_mlp_gate_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [3072, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(6313536))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(9459328))))[name = string("layers_0_mlp_gate_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 3072, 1, 1]> input_5_cast_fp16 = conv(dilations = input_5_dilations_0, groups = input_5_groups_0, pad = input_5_pad_0, pad_type = input_5_pad_type_0, strides = input_5_strides_0, weight = layers_0_mlp_gate_proj_weight_to_fp16_palettized, x = input_3_cast_fp16)[name = string("input_5_cast_fp16")];
+            tensor<fp16, [1, 3072, 1, 1]> var_466_cast_fp16 = silu(x = input_5_cast_fp16)[name = string("op_466_cast_fp16")];
+            string var_472_pad_type_0 = const()[name = string("op_472_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_472_strides_0 = const()[name = string("op_472_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_472_pad_0 = const()[name = string("op_472_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_472_dilations_0 = const()[name = string("op_472_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_472_groups_0 = const()[name = string("op_472_groups_0"), val = int32(1)];
+            tensor<fp16, [3072, 1024, 1, 1]> layers_0_mlp_up_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [3072, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(9459904))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(12605696))))[name = string("layers_0_mlp_up_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 3072, 1, 1]> var_472_cast_fp16 = conv(dilations = var_472_dilations_0, groups = var_472_groups_0, pad = var_472_pad_0, pad_type = var_472_pad_type_0, strides = var_472_strides_0, weight = layers_0_mlp_up_proj_weight_to_fp16_palettized, x = input_3_cast_fp16)[name = string("op_472_cast_fp16")];
+            tensor<fp16, [1, 3072, 1, 1]> input_7_cast_fp16 = mul(x = var_466_cast_fp16, y = var_472_cast_fp16)[name = string("input_7_cast_fp16")];
+            string hidden_states_9_pad_type_0 = const()[name = string("hidden_states_9_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> hidden_states_9_strides_0 = const()[name = string("hidden_states_9_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> hidden_states_9_pad_0 = const()[name = string("hidden_states_9_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> hidden_states_9_dilations_0 = const()[name = string("hidden_states_9_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 hidden_states_9_groups_0 = const()[name = string("hidden_states_9_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 3072, 1, 1]> layers_0_mlp_down_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 3072, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(12606272))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(15752064))))[name = string("layers_0_mlp_down_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> hidden_states_9_cast_fp16 = conv(dilations = hidden_states_9_dilations_0, groups = hidden_states_9_groups_0, pad = hidden_states_9_pad_0, pad_type = hidden_states_9_pad_type_0, strides = hidden_states_9_strides_0, weight = layers_0_mlp_down_proj_weight_to_fp16_palettized, x = input_7_cast_fp16)[name = string("hidden_states_9_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> inputs_7_cast_fp16 = add(x = inputs_5_cast_fp16, y = hidden_states_9_cast_fp16)[name = string("inputs_7_cast_fp16")];
+            int32 var_486 = const()[name = string("op_486"), val = int32(3)];
+            int32 var_496 = const()[name = string("op_496"), val = int32(-2)];
+            int32 var_504 = const()[name = string("op_504"), val = int32(1)];
+            tensor<fp16, [1, 1024, 1, 1]> inputs_sq_9_cast_fp16 = mul(x = inputs_7_cast_fp16, y = inputs_7_cast_fp16)[name = string("inputs_sq_9_cast_fp16")];
+            tensor<int32, [1]> variance_9_axes_0 = const()[name = string("variance_9_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_9_keep_dims_0 = const()[name = string("variance_9_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 1, 1, 1]> variance_9_cast_fp16 = reduce_mean(axes = variance_9_axes_0, keep_dims = variance_9_keep_dims_0, x = inputs_sq_9_cast_fp16)[name = string("variance_9_cast_fp16")];
+            fp16 var_516_to_fp16 = const()[name = string("op_516_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 1, 1]> var_517_cast_fp16 = add(x = variance_9_cast_fp16, y = var_516_to_fp16)[name = string("op_517_cast_fp16")];
+            fp32 var_518_epsilon_0 = const()[name = string("op_518_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 1, 1, 1]> var_518_cast_fp16 = rsqrt(epsilon = var_518_epsilon_0, x = var_517_cast_fp16)[name = string("op_518_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> hidden_states_11_cast_fp16 = mul(x = inputs_7_cast_fp16, y = var_518_cast_fp16)[name = string("hidden_states_11_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> w_9_to_fp16 = const()[name = string("w_9_to_fp16"), val = tensor<fp16, [1, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(15752640)))];
+            tensor<fp16, [1, 1024, 1, 1]> obj_13_cast_fp16 = mul(x = w_9_to_fp16, y = hidden_states_11_cast_fp16)[name = string("obj_13_cast_fp16")];
+            string query_7_pad_type_0 = const()[name = string("query_7_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> query_7_strides_0 = const()[name = string("query_7_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> query_7_pad_0 = const()[name = string("query_7_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> query_7_dilations_0 = const()[name = string("query_7_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 query_7_groups_0 = const()[name = string("query_7_groups_0"), val = int32(1)];
+            tensor<fp16, [2048, 1024, 1, 1]> layers_1_self_attn_q_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [2048, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(15754752))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(17851968))))[name = string("layers_1_self_attn_q_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1, 1]> query_7_cast_fp16 = conv(bias = layers_0_self_attn_q_proj_bias_to_fp16, dilations = query_7_dilations_0, groups = query_7_groups_0, pad = query_7_pad_0, pad_type = query_7_pad_type_0, strides = query_7_strides_0, weight = layers_1_self_attn_q_proj_weight_to_fp16_palettized, x = obj_13_cast_fp16)[name = string("query_7_cast_fp16")];
+            string current_key_5_pad_type_0 = const()[name = string("current_key_5_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> current_key_5_strides_0 = const()[name = string("current_key_5_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> current_key_5_pad_0 = const()[name = string("current_key_5_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> current_key_5_dilations_0 = const()[name = string("current_key_5_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 current_key_5_groups_0 = const()[name = string("current_key_5_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_1_self_attn_k_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(17852544))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(18901184))))[name = string("layers_1_self_attn_k_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> current_key_5_cast_fp16 = conv(dilations = current_key_5_dilations_0, groups = current_key_5_groups_0, pad = current_key_5_pad_0, pad_type = current_key_5_pad_type_0, strides = current_key_5_strides_0, weight = layers_1_self_attn_k_proj_weight_to_fp16_palettized, x = obj_13_cast_fp16)[name = string("current_key_5_cast_fp16")];
+            string current_value_3_pad_type_0 = const()[name = string("current_value_3_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> current_value_3_strides_0 = const()[name = string("current_value_3_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> current_value_3_pad_0 = const()[name = string("current_value_3_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> current_value_3_dilations_0 = const()[name = string("current_value_3_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 current_value_3_groups_0 = const()[name = string("current_value_3_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_1_self_attn_v_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(18901760))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(19950400))))[name = string("layers_1_self_attn_v_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> current_value_3_cast_fp16 = conv(bias = layers_0_self_attn_v_proj_bias_to_fp16, dilations = current_value_3_dilations_0, groups = current_value_3_groups_0, pad = current_value_3_pad_0, pad_type = current_value_3_pad_type_0, strides = current_value_3_strides_0, weight = layers_1_self_attn_v_proj_weight_to_fp16_palettized, x = obj_13_cast_fp16)[name = string("current_value_3_cast_fp16")];
+            tensor<int32, [4]> var_555 = const()[name = string("op_555"), val = tensor<int32, [4]>([16, 128, 1, 1])];
+            tensor<fp16, [16, 128, 1, 1]> inputs_9_cast_fp16 = reshape(shape = var_555, x = query_7_cast_fp16)[name = string("inputs_9_cast_fp16")];
+            tensor<fp16, [16, 128, 1, 1]> inputs_sq_11_cast_fp16 = mul(x = inputs_9_cast_fp16, y = inputs_9_cast_fp16)[name = string("inputs_sq_11_cast_fp16")];
+            tensor<int32, [1]> variance_11_axes_0 = const()[name = string("variance_11_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_11_keep_dims_0 = const()[name = string("variance_11_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [16, 1, 1, 1]> variance_11_cast_fp16 = reduce_mean(axes = variance_11_axes_0, keep_dims = variance_11_keep_dims_0, x = inputs_sq_11_cast_fp16)[name = string("variance_11_cast_fp16")];
+            fp16 var_561_to_fp16 = const()[name = string("op_561_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [16, 1, 1, 1]> var_562_cast_fp16 = add(x = variance_11_cast_fp16, y = var_561_to_fp16)[name = string("op_562_cast_fp16")];
+            fp32 var_563_epsilon_0 = const()[name = string("op_563_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [16, 1, 1, 1]> var_563_cast_fp16 = rsqrt(epsilon = var_563_epsilon_0, x = var_562_cast_fp16)[name = string("op_563_cast_fp16")];
+            tensor<fp16, [16, 128, 1, 1]> hidden_states_13_cast_fp16 = mul(x = inputs_9_cast_fp16, y = var_563_cast_fp16)[name = string("hidden_states_13_cast_fp16")];
+            tensor<fp16, [1, 128, 1, 1]> w_11_to_fp16 = const()[name = string("w_11_to_fp16"), val = tensor<fp16, [1, 128, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(19950976)))];
+            tensor<fp16, [16, 128, 1, 1]> query_normed_3_cast_fp16 = mul(x = w_11_to_fp16, y = hidden_states_13_cast_fp16)[name = string("query_normed_3_cast_fp16")];
+            tensor<int32, [4]> var_571 = const()[name = string("op_571"), val = tensor<int32, [4]>([8, 128, 1, 1])];
+            tensor<fp16, [8, 128, 1, 1]> inputs_11_cast_fp16 = reshape(shape = var_571, x = current_key_5_cast_fp16)[name = string("inputs_11_cast_fp16")];
+            tensor<fp16, [8, 128, 1, 1]> inputs_sq_13_cast_fp16 = mul(x = inputs_11_cast_fp16, y = inputs_11_cast_fp16)[name = string("inputs_sq_13_cast_fp16")];
+            tensor<int32, [1]> variance_13_axes_0 = const()[name = string("variance_13_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_13_keep_dims_0 = const()[name = string("variance_13_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [8, 1, 1, 1]> variance_13_cast_fp16 = reduce_mean(axes = variance_13_axes_0, keep_dims = variance_13_keep_dims_0, x = inputs_sq_13_cast_fp16)[name = string("variance_13_cast_fp16")];
+            fp16 var_577_to_fp16 = const()[name = string("op_577_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [8, 1, 1, 1]> var_578_cast_fp16 = add(x = variance_13_cast_fp16, y = var_577_to_fp16)[name = string("op_578_cast_fp16")];
+            fp32 var_579_epsilon_0 = const()[name = string("op_579_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [8, 1, 1, 1]> var_579_cast_fp16 = rsqrt(epsilon = var_579_epsilon_0, x = var_578_cast_fp16)[name = string("op_579_cast_fp16")];
+            tensor<fp16, [8, 128, 1, 1]> hidden_states_15_cast_fp16 = mul(x = inputs_11_cast_fp16, y = var_579_cast_fp16)[name = string("hidden_states_15_cast_fp16")];
+            tensor<fp16, [1, 128, 1, 1]> w_13_to_fp16 = const()[name = string("w_13_to_fp16"), val = tensor<fp16, [1, 128, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(19951296)))];
+            tensor<fp16, [8, 128, 1, 1]> current_key_normed_3_cast_fp16 = mul(x = w_13_to_fp16, y = hidden_states_15_cast_fp16)[name = string("current_key_normed_3_cast_fp16")];
+            tensor<int32, [4]> var_597 = const()[name = string("op_597"), val = tensor<int32, [4]>([1, 16, 128, -1])];
+            tensor<fp16, [1, 16, 128, 1]> mh_q_7_cast_fp16 = reshape(shape = var_597, x = query_normed_3_cast_fp16)[name = string("mh_q_7_cast_fp16")];
+            tensor<int32, [4]> var_599 = const()[name = string("op_599"), val = tensor<int32, [4]>([1, 8, 128, -1])];
+            tensor<fp16, [1, 8, 128, 1]> mh_k_5_cast_fp16 = reshape(shape = var_599, x = current_key_normed_3_cast_fp16)[name = string("mh_k_5_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> var_603_cast_fp16 = mul(x = mh_q_7_cast_fp16, y = cos_1_cast_fp16)[name = string("op_603_cast_fp16")];
+            tensor<int32, [4]> var_608_begin_0 = const()[name = string("op_608_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_608_end_0 = const()[name = string("op_608_end_0"), val = tensor<int32, [4]>([1, 16, 64, 1])];
+            tensor<bool, [4]> var_608_end_mask_0 = const()[name = string("op_608_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 16, 64, 1]> var_608_cast_fp16 = slice_by_index(begin = var_608_begin_0, end = var_608_end_0, end_mask = var_608_end_mask_0, x = mh_q_7_cast_fp16)[name = string("op_608_cast_fp16")];
+            tensor<int32, [4]> var_614_begin_0 = const()[name = string("op_614_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_614_end_0 = const()[name = string("op_614_end_0"), val = tensor<int32, [4]>([1, 16, 128, 1])];
+            tensor<bool, [4]> var_614_end_mask_0 = const()[name = string("op_614_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 16, 64, 1]> var_614_cast_fp16 = slice_by_index(begin = var_614_begin_0, end = var_614_end_0, end_mask = var_614_end_mask_0, x = mh_q_7_cast_fp16)[name = string("op_614_cast_fp16")];
+            fp16 const_40_promoted_to_fp16 = const()[name = string("const_40_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 16, 64, 1]> var_616_cast_fp16 = mul(x = var_614_cast_fp16, y = const_40_promoted_to_fp16)[name = string("op_616_cast_fp16")];
+            bool var_618_interleave_0 = const()[name = string("op_618_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 1]> var_618_cast_fp16 = concat(axis = var_496, interleave = var_618_interleave_0, values = (var_616_cast_fp16, var_608_cast_fp16))[name = string("op_618_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> var_619_cast_fp16 = mul(x = var_618_cast_fp16, y = sin_1_cast_fp16)[name = string("op_619_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> mh_q_9_cast_fp16 = add(x = var_603_cast_fp16, y = var_619_cast_fp16)[name = string("mh_q_9_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> var_621_cast_fp16 = mul(x = mh_k_5_cast_fp16, y = cos_1_cast_fp16)[name = string("op_621_cast_fp16")];
+            tensor<int32, [4]> var_626_begin_0 = const()[name = string("op_626_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_626_end_0 = const()[name = string("op_626_end_0"), val = tensor<int32, [4]>([1, 8, 64, 1])];
+            tensor<bool, [4]> var_626_end_mask_0 = const()[name = string("op_626_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 8, 64, 1]> var_626_cast_fp16 = slice_by_index(begin = var_626_begin_0, end = var_626_end_0, end_mask = var_626_end_mask_0, x = mh_k_5_cast_fp16)[name = string("op_626_cast_fp16")];
+            tensor<int32, [4]> var_632_begin_0 = const()[name = string("op_632_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_632_end_0 = const()[name = string("op_632_end_0"), val = tensor<int32, [4]>([1, 8, 128, 1])];
+            tensor<bool, [4]> var_632_end_mask_0 = const()[name = string("op_632_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 8, 64, 1]> var_632_cast_fp16 = slice_by_index(begin = var_632_begin_0, end = var_632_end_0, end_mask = var_632_end_mask_0, x = mh_k_5_cast_fp16)[name = string("op_632_cast_fp16")];
+            fp16 const_43_promoted_to_fp16 = const()[name = string("const_43_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 64, 1]> var_634_cast_fp16 = mul(x = var_632_cast_fp16, y = const_43_promoted_to_fp16)[name = string("op_634_cast_fp16")];
+            bool var_636_interleave_0 = const()[name = string("op_636_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 128, 1]> var_636_cast_fp16 = concat(axis = var_496, interleave = var_636_interleave_0, values = (var_634_cast_fp16, var_626_cast_fp16))[name = string("op_636_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> var_637_cast_fp16 = mul(x = var_636_cast_fp16, y = sin_1_cast_fp16)[name = string("op_637_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> mh_k_7_cast_fp16 = add(x = var_621_cast_fp16, y = var_637_cast_fp16)[name = string("mh_k_7_cast_fp16")];
+            tensor<int32, [4]> var_641 = const()[name = string("op_641"), val = tensor<int32, [4]>([1, 1024, 1, 1])];
+            tensor<fp16, [1, 1024, 1, 1]> current_key_7_cast_fp16 = reshape(shape = var_641, x = mh_k_7_cast_fp16)[name = string("current_key_7_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 16]> var_648_cast_fp16 = mul(x = var_84_cast_fp16_1, y = var_260_cast_fp16)[name = string("op_648_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 16]> var_649_cast_fp16 = mul(x = current_key_7_cast_fp16, y = var_258_cast_fp16)[name = string("op_649_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 16]> key_9_cast_fp16 = add(x = var_648_cast_fp16, y = var_649_cast_fp16)[name = string("key_9_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 16]> var_652_cast_fp16 = mul(x = var_92_cast_fp16_1, y = var_260_cast_fp16)[name = string("op_652_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 16]> var_653_cast_fp16 = mul(x = current_value_3_cast_fp16, y = var_258_cast_fp16)[name = string("op_653_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 16]> value_5_cast_fp16 = add(x = var_652_cast_fp16, y = var_653_cast_fp16)[name = string("value_5_cast_fp16")];
+            tensor<int32, [4]> var_657 = const()[name = string("op_657"), val = tensor<int32, [4]>([1, 8, 128, 16])];
+            tensor<fp16, [1, 8, 128, 16]> key_heads_5_cast_fp16 = reshape(shape = var_657, x = key_9_cast_fp16)[name = string("key_heads_5_cast_fp16")];
+            tensor<int32, [4]> var_659 = const()[name = string("op_659"), val = tensor<int32, [4]>([1, 8, 128, 16])];
+            tensor<fp16, [1, 8, 128, 16]> value_heads_5_cast_fp16 = reshape(shape = var_659, x = value_5_cast_fp16)[name = string("value_heads_5_cast_fp16")];
+            tensor<int32, [4]> var_662_begin_0 = const()[name = string("op_662_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_662_end_0 = const()[name = string("op_662_end_0"), val = tensor<int32, [4]>([1, 1, 128, 16])];
+            tensor<bool, [4]> var_662_end_mask_0 = const()[name = string("op_662_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_662_cast_fp16 = slice_by_index(begin = var_662_begin_0, end = var_662_end_0, end_mask = var_662_end_mask_0, x = key_heads_5_cast_fp16)[name = string("op_662_cast_fp16")];
+            tensor<int32, [4]> var_666_begin_0 = const()[name = string("op_666_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_666_end_0 = const()[name = string("op_666_end_0"), val = tensor<int32, [4]>([1, 1, 128, 16])];
+            tensor<bool, [4]> var_666_end_mask_0 = const()[name = string("op_666_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_666_cast_fp16 = slice_by_index(begin = var_666_begin_0, end = var_666_end_0, end_mask = var_666_end_mask_0, x = value_heads_5_cast_fp16)[name = string("op_666_cast_fp16")];
+            tensor<int32, [4]> var_678_begin_0 = const()[name = string("op_678_begin_0"), val = tensor<int32, [4]>([0, 1, 0, 0])];
+            tensor<int32, [4]> var_678_end_0 = const()[name = string("op_678_end_0"), val = tensor<int32, [4]>([1, 2, 128, 16])];
+            tensor<bool, [4]> var_678_end_mask_0 = const()[name = string("op_678_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_678_cast_fp16 = slice_by_index(begin = var_678_begin_0, end = var_678_end_0, end_mask = var_678_end_mask_0, x = key_heads_5_cast_fp16)[name = string("op_678_cast_fp16")];
+            tensor<int32, [4]> var_682_begin_0 = const()[name = string("op_682_begin_0"), val = tensor<int32, [4]>([0, 1, 0, 0])];
+            tensor<int32, [4]> var_682_end_0 = const()[name = string("op_682_end_0"), val = tensor<int32, [4]>([1, 2, 128, 16])];
+            tensor<bool, [4]> var_682_end_mask_0 = const()[name = string("op_682_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_682_cast_fp16 = slice_by_index(begin = var_682_begin_0, end = var_682_end_0, end_mask = var_682_end_mask_0, x = value_heads_5_cast_fp16)[name = string("op_682_cast_fp16")];
+            tensor<int32, [4]> var_694_begin_0 = const()[name = string("op_694_begin_0"), val = tensor<int32, [4]>([0, 2, 0, 0])];
+            tensor<int32, [4]> var_694_end_0 = const()[name = string("op_694_end_0"), val = tensor<int32, [4]>([1, 3, 128, 16])];
+            tensor<bool, [4]> var_694_end_mask_0 = const()[name = string("op_694_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_694_cast_fp16 = slice_by_index(begin = var_694_begin_0, end = var_694_end_0, end_mask = var_694_end_mask_0, x = key_heads_5_cast_fp16)[name = string("op_694_cast_fp16")];
+            tensor<int32, [4]> var_698_begin_0 = const()[name = string("op_698_begin_0"), val = tensor<int32, [4]>([0, 2, 0, 0])];
+            tensor<int32, [4]> var_698_end_0 = const()[name = string("op_698_end_0"), val = tensor<int32, [4]>([1, 3, 128, 16])];
+            tensor<bool, [4]> var_698_end_mask_0 = const()[name = string("op_698_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_698_cast_fp16 = slice_by_index(begin = var_698_begin_0, end = var_698_end_0, end_mask = var_698_end_mask_0, x = value_heads_5_cast_fp16)[name = string("op_698_cast_fp16")];
+            tensor<int32, [4]> var_710_begin_0 = const()[name = string("op_710_begin_0"), val = tensor<int32, [4]>([0, 3, 0, 0])];
+            tensor<int32, [4]> var_710_end_0 = const()[name = string("op_710_end_0"), val = tensor<int32, [4]>([1, 4, 128, 16])];
+            tensor<bool, [4]> var_710_end_mask_0 = const()[name = string("op_710_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_710_cast_fp16 = slice_by_index(begin = var_710_begin_0, end = var_710_end_0, end_mask = var_710_end_mask_0, x = key_heads_5_cast_fp16)[name = string("op_710_cast_fp16")];
+            tensor<int32, [4]> var_714_begin_0 = const()[name = string("op_714_begin_0"), val = tensor<int32, [4]>([0, 3, 0, 0])];
+            tensor<int32, [4]> var_714_end_0 = const()[name = string("op_714_end_0"), val = tensor<int32, [4]>([1, 4, 128, 16])];
+            tensor<bool, [4]> var_714_end_mask_0 = const()[name = string("op_714_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_714_cast_fp16 = slice_by_index(begin = var_714_begin_0, end = var_714_end_0, end_mask = var_714_end_mask_0, x = value_heads_5_cast_fp16)[name = string("op_714_cast_fp16")];
+            tensor<int32, [4]> var_726_begin_0 = const()[name = string("op_726_begin_0"), val = tensor<int32, [4]>([0, 4, 0, 0])];
+            tensor<int32, [4]> var_726_end_0 = const()[name = string("op_726_end_0"), val = tensor<int32, [4]>([1, 5, 128, 16])];
+            tensor<bool, [4]> var_726_end_mask_0 = const()[name = string("op_726_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_726_cast_fp16 = slice_by_index(begin = var_726_begin_0, end = var_726_end_0, end_mask = var_726_end_mask_0, x = key_heads_5_cast_fp16)[name = string("op_726_cast_fp16")];
+            tensor<int32, [4]> var_730_begin_0 = const()[name = string("op_730_begin_0"), val = tensor<int32, [4]>([0, 4, 0, 0])];
+            tensor<int32, [4]> var_730_end_0 = const()[name = string("op_730_end_0"), val = tensor<int32, [4]>([1, 5, 128, 16])];
+            tensor<bool, [4]> var_730_end_mask_0 = const()[name = string("op_730_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_730_cast_fp16 = slice_by_index(begin = var_730_begin_0, end = var_730_end_0, end_mask = var_730_end_mask_0, x = value_heads_5_cast_fp16)[name = string("op_730_cast_fp16")];
+            tensor<int32, [4]> var_742_begin_0 = const()[name = string("op_742_begin_0"), val = tensor<int32, [4]>([0, 5, 0, 0])];
+            tensor<int32, [4]> var_742_end_0 = const()[name = string("op_742_end_0"), val = tensor<int32, [4]>([1, 6, 128, 16])];
+            tensor<bool, [4]> var_742_end_mask_0 = const()[name = string("op_742_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_742_cast_fp16 = slice_by_index(begin = var_742_begin_0, end = var_742_end_0, end_mask = var_742_end_mask_0, x = key_heads_5_cast_fp16)[name = string("op_742_cast_fp16")];
+            tensor<int32, [4]> var_746_begin_0 = const()[name = string("op_746_begin_0"), val = tensor<int32, [4]>([0, 5, 0, 0])];
+            tensor<int32, [4]> var_746_end_0 = const()[name = string("op_746_end_0"), val = tensor<int32, [4]>([1, 6, 128, 16])];
+            tensor<bool, [4]> var_746_end_mask_0 = const()[name = string("op_746_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_746_cast_fp16 = slice_by_index(begin = var_746_begin_0, end = var_746_end_0, end_mask = var_746_end_mask_0, x = value_heads_5_cast_fp16)[name = string("op_746_cast_fp16")];
+            tensor<int32, [4]> var_758_begin_0 = const()[name = string("op_758_begin_0"), val = tensor<int32, [4]>([0, 6, 0, 0])];
+            tensor<int32, [4]> var_758_end_0 = const()[name = string("op_758_end_0"), val = tensor<int32, [4]>([1, 7, 128, 16])];
+            tensor<bool, [4]> var_758_end_mask_0 = const()[name = string("op_758_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_758_cast_fp16 = slice_by_index(begin = var_758_begin_0, end = var_758_end_0, end_mask = var_758_end_mask_0, x = key_heads_5_cast_fp16)[name = string("op_758_cast_fp16")];
+            tensor<int32, [4]> var_762_begin_0 = const()[name = string("op_762_begin_0"), val = tensor<int32, [4]>([0, 6, 0, 0])];
+            tensor<int32, [4]> var_762_end_0 = const()[name = string("op_762_end_0"), val = tensor<int32, [4]>([1, 7, 128, 16])];
+            tensor<bool, [4]> var_762_end_mask_0 = const()[name = string("op_762_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_762_cast_fp16 = slice_by_index(begin = var_762_begin_0, end = var_762_end_0, end_mask = var_762_end_mask_0, x = value_heads_5_cast_fp16)[name = string("op_762_cast_fp16")];
+            tensor<int32, [4]> var_774_begin_0 = const()[name = string("op_774_begin_0"), val = tensor<int32, [4]>([0, 7, 0, 0])];
+            tensor<int32, [4]> var_774_end_0 = const()[name = string("op_774_end_0"), val = tensor<int32, [4]>([1, 1, 128, 16])];
+            tensor<bool, [4]> var_774_end_mask_0 = const()[name = string("op_774_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_774_cast_fp16 = slice_by_index(begin = var_774_begin_0, end = var_774_end_0, end_mask = var_774_end_mask_0, x = key_heads_5_cast_fp16)[name = string("op_774_cast_fp16")];
+            tensor<int32, [4]> var_778_begin_0 = const()[name = string("op_778_begin_0"), val = tensor<int32, [4]>([0, 7, 0, 0])];
+            tensor<int32, [4]> var_778_end_0 = const()[name = string("op_778_end_0"), val = tensor<int32, [4]>([1, 1, 128, 16])];
+            tensor<bool, [4]> var_778_end_mask_0 = const()[name = string("op_778_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_778_cast_fp16 = slice_by_index(begin = var_778_begin_0, end = var_778_end_0, end_mask = var_778_end_mask_0, x = value_heads_5_cast_fp16)[name = string("op_778_cast_fp16")];
+            bool key_heads_7_interleave_0 = const()[name = string("key_heads_7_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 16]> key_heads_7_cast_fp16 = concat(axis = var_504, interleave = key_heads_7_interleave_0, values = (var_662_cast_fp16, var_662_cast_fp16, var_678_cast_fp16, var_678_cast_fp16, var_694_cast_fp16, var_694_cast_fp16, var_710_cast_fp16, var_710_cast_fp16, var_726_cast_fp16, var_726_cast_fp16, var_742_cast_fp16, var_742_cast_fp16, var_758_cast_fp16, var_758_cast_fp16, var_774_cast_fp16, var_774_cast_fp16))[name = string("key_heads_7_cast_fp16")];
+            bool value_heads_7_interleave_0 = const()[name = string("value_heads_7_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 16]> value_heads_7_cast_fp16 = concat(axis = var_504, interleave = value_heads_7_interleave_0, values = (var_666_cast_fp16, var_666_cast_fp16, var_682_cast_fp16, var_682_cast_fp16, var_698_cast_fp16, var_698_cast_fp16, var_714_cast_fp16, var_714_cast_fp16, var_730_cast_fp16, var_730_cast_fp16, var_746_cast_fp16, var_746_cast_fp16, var_762_cast_fp16, var_762_cast_fp16, var_778_cast_fp16, var_778_cast_fp16))[name = string("value_heads_7_cast_fp16")];
+            fp16 var_801_to_fp16 = const()[name = string("op_801_to_fp16"), val = fp16(0x1.6ap-4)];
+            tensor<fp16, [1, 16, 128, 1]> var_802_cast_fp16 = mul(x = mh_q_9_cast_fp16, y = var_801_to_fp16)[name = string("op_802_cast_fp16")];
+            bool mh_w_5_transpose_x_0 = const()[name = string("mh_w_5_transpose_x_0"), val = bool(true)];
+            bool mh_w_5_transpose_y_0 = const()[name = string("mh_w_5_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 1, 16]> mh_w_5_cast_fp16 = matmul(transpose_x = mh_w_5_transpose_x_0, transpose_y = mh_w_5_transpose_y_0, x = var_802_cast_fp16, y = key_heads_7_cast_fp16)[name = string("mh_w_5_cast_fp16")];
+            tensor<fp16, [1, 16, 1, 16]> mh_w_7_cast_fp16 = add(x = mh_w_5_cast_fp16, y = var_424_cast_fp16)[name = string("mh_w_7_cast_fp16")];
+            tensor<fp16, [1, 16, 1, 16]> var_814_cast_fp16 = softmax(axis = var_486, x = mh_w_7_cast_fp16)[name = string("op_814_cast_fp16")];
+            bool attn_3_transpose_x_0 = const()[name = string("attn_3_transpose_x_0"), val = bool(false)];
+            bool attn_3_transpose_y_0 = const()[name = string("attn_3_transpose_y_0"), val = bool(true)];
+            tensor<fp16, [1, 16, 128, 1]> attn_3_cast_fp16 = matmul(transpose_x = attn_3_transpose_x_0, transpose_y = attn_3_transpose_y_0, x = value_heads_7_cast_fp16, y = var_814_cast_fp16)[name = string("attn_3_cast_fp16")];
+            tensor<int32, [4]> var_819 = const()[name = string("op_819"), val = tensor<int32, [4]>([1, -1, 1, 1])];
+            tensor<fp16, [1, 2048, 1, 1]> input_9_cast_fp16 = reshape(shape = var_819, x = attn_3_cast_fp16)[name = string("input_9_cast_fp16")];
+            string obj_19_pad_type_0 = const()[name = string("obj_19_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> obj_19_strides_0 = const()[name = string("obj_19_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> obj_19_pad_0 = const()[name = string("obj_19_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> obj_19_dilations_0 = const()[name = string("obj_19_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 obj_19_groups_0 = const()[name = string("obj_19_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 2048, 1, 1]> layers_1_self_attn_o_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(19951616))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(22048832))))[name = string("layers_1_self_attn_o_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> obj_19_cast_fp16 = conv(bias = layers_0_self_attn_v_proj_bias_to_fp16, dilations = obj_19_dilations_0, groups = obj_19_groups_0, pad = obj_19_pad_0, pad_type = obj_19_pad_type_0, strides = obj_19_strides_0, weight = layers_1_self_attn_o_proj_weight_to_fp16_palettized, x = input_9_cast_fp16)[name = string("obj_19_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> inputs_13_cast_fp16 = add(x = inputs_7_cast_fp16, y = obj_19_cast_fp16)[name = string("inputs_13_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> inputs_sq_15_cast_fp16 = mul(x = inputs_13_cast_fp16, y = inputs_13_cast_fp16)[name = string("inputs_sq_15_cast_fp16")];
+            tensor<int32, [1]> variance_15_axes_0 = const()[name = string("variance_15_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_15_keep_dims_0 = const()[name = string("variance_15_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 1, 1, 1]> variance_15_cast_fp16 = reduce_mean(axes = variance_15_axes_0, keep_dims = variance_15_keep_dims_0, x = inputs_sq_15_cast_fp16)[name = string("variance_15_cast_fp16")];
+            fp16 var_837_to_fp16 = const()[name = string("op_837_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 1, 1]> var_838_cast_fp16 = add(x = variance_15_cast_fp16, y = var_837_to_fp16)[name = string("op_838_cast_fp16")];
+            fp32 var_839_epsilon_0 = const()[name = string("op_839_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 1, 1, 1]> var_839_cast_fp16 = rsqrt(epsilon = var_839_epsilon_0, x = var_838_cast_fp16)[name = string("op_839_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> hidden_states_17_cast_fp16 = mul(x = inputs_13_cast_fp16, y = var_839_cast_fp16)[name = string("hidden_states_17_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> w_15_to_fp16 = const()[name = string("w_15_to_fp16"), val = tensor<fp16, [1, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(22049408)))];
+            tensor<fp16, [1, 1024, 1, 1]> input_11_cast_fp16 = mul(x = w_15_to_fp16, y = hidden_states_17_cast_fp16)[name = string("input_11_cast_fp16")];
+            string input_13_pad_type_0 = const()[name = string("input_13_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> input_13_strides_0 = const()[name = string("input_13_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> input_13_pad_0 = const()[name = string("input_13_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> input_13_dilations_0 = const()[name = string("input_13_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 input_13_groups_0 = const()[name = string("input_13_groups_0"), val = int32(1)];
+            tensor<fp16, [3072, 1024, 1, 1]> layers_1_mlp_gate_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [3072, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(22051520))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(25197312))))[name = string("layers_1_mlp_gate_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 3072, 1, 1]> input_13_cast_fp16 = conv(dilations = input_13_dilations_0, groups = input_13_groups_0, pad = input_13_pad_0, pad_type = input_13_pad_type_0, strides = input_13_strides_0, weight = layers_1_mlp_gate_proj_weight_to_fp16_palettized, x = input_11_cast_fp16)[name = string("input_13_cast_fp16")];
+            tensor<fp16, [1, 3072, 1, 1]> var_853_cast_fp16 = silu(x = input_13_cast_fp16)[name = string("op_853_cast_fp16")];
+            string var_859_pad_type_0 = const()[name = string("op_859_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_859_strides_0 = const()[name = string("op_859_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_859_pad_0 = const()[name = string("op_859_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_859_dilations_0 = const()[name = string("op_859_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_859_groups_0 = const()[name = string("op_859_groups_0"), val = int32(1)];
+            tensor<fp16, [3072, 1024, 1, 1]> layers_1_mlp_up_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [3072, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(25197888))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(28343680))))[name = string("layers_1_mlp_up_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 3072, 1, 1]> var_859_cast_fp16 = conv(dilations = var_859_dilations_0, groups = var_859_groups_0, pad = var_859_pad_0, pad_type = var_859_pad_type_0, strides = var_859_strides_0, weight = layers_1_mlp_up_proj_weight_to_fp16_palettized, x = input_11_cast_fp16)[name = string("op_859_cast_fp16")];
+            tensor<fp16, [1, 3072, 1, 1]> input_15_cast_fp16 = mul(x = var_853_cast_fp16, y = var_859_cast_fp16)[name = string("input_15_cast_fp16")];
+            string hidden_states_19_pad_type_0 = const()[name = string("hidden_states_19_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> hidden_states_19_strides_0 = const()[name = string("hidden_states_19_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> hidden_states_19_pad_0 = const()[name = string("hidden_states_19_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> hidden_states_19_dilations_0 = const()[name = string("hidden_states_19_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 hidden_states_19_groups_0 = const()[name = string("hidden_states_19_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 3072, 1, 1]> layers_1_mlp_down_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 3072, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(28344256))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(31490048))))[name = string("layers_1_mlp_down_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> hidden_states_19_cast_fp16 = conv(dilations = hidden_states_19_dilations_0, groups = hidden_states_19_groups_0, pad = hidden_states_19_pad_0, pad_type = hidden_states_19_pad_type_0, strides = hidden_states_19_strides_0, weight = layers_1_mlp_down_proj_weight_to_fp16_palettized, x = input_15_cast_fp16)[name = string("hidden_states_19_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> inputs_15_cast_fp16 = add(x = inputs_13_cast_fp16, y = hidden_states_19_cast_fp16)[name = string("inputs_15_cast_fp16")];
+            int32 var_873 = const()[name = string("op_873"), val = int32(3)];
+            int32 var_883 = const()[name = string("op_883"), val = int32(-2)];
+            int32 var_891 = const()[name = string("op_891"), val = int32(1)];
+            tensor<fp16, [1, 1024, 1, 1]> inputs_sq_17_cast_fp16 = mul(x = inputs_15_cast_fp16, y = inputs_15_cast_fp16)[name = string("inputs_sq_17_cast_fp16")];
+            tensor<int32, [1]> variance_17_axes_0 = const()[name = string("variance_17_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_17_keep_dims_0 = const()[name = string("variance_17_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 1, 1, 1]> variance_17_cast_fp16 = reduce_mean(axes = variance_17_axes_0, keep_dims = variance_17_keep_dims_0, x = inputs_sq_17_cast_fp16)[name = string("variance_17_cast_fp16")];
+            fp16 var_903_to_fp16 = const()[name = string("op_903_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 1, 1]> var_904_cast_fp16 = add(x = variance_17_cast_fp16, y = var_903_to_fp16)[name = string("op_904_cast_fp16")];
+            fp32 var_905_epsilon_0 = const()[name = string("op_905_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 1, 1, 1]> var_905_cast_fp16 = rsqrt(epsilon = var_905_epsilon_0, x = var_904_cast_fp16)[name = string("op_905_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> hidden_states_21_cast_fp16 = mul(x = inputs_15_cast_fp16, y = var_905_cast_fp16)[name = string("hidden_states_21_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> w_17_to_fp16 = const()[name = string("w_17_to_fp16"), val = tensor<fp16, [1, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(31490624)))];
+            tensor<fp16, [1, 1024, 1, 1]> obj_21_cast_fp16 = mul(x = w_17_to_fp16, y = hidden_states_21_cast_fp16)[name = string("obj_21_cast_fp16")];
+            string query_13_pad_type_0 = const()[name = string("query_13_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> query_13_strides_0 = const()[name = string("query_13_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> query_13_pad_0 = const()[name = string("query_13_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> query_13_dilations_0 = const()[name = string("query_13_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 query_13_groups_0 = const()[name = string("query_13_groups_0"), val = int32(1)];
+            tensor<fp16, [2048, 1024, 1, 1]> layers_2_self_attn_q_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [2048, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(31492736))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(33589952))))[name = string("layers_2_self_attn_q_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1, 1]> query_13_cast_fp16 = conv(bias = layers_0_self_attn_q_proj_bias_to_fp16, dilations = query_13_dilations_0, groups = query_13_groups_0, pad = query_13_pad_0, pad_type = query_13_pad_type_0, strides = query_13_strides_0, weight = layers_2_self_attn_q_proj_weight_to_fp16_palettized, x = obj_21_cast_fp16)[name = string("query_13_cast_fp16")];
+            string current_key_9_pad_type_0 = const()[name = string("current_key_9_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> current_key_9_strides_0 = const()[name = string("current_key_9_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> current_key_9_pad_0 = const()[name = string("current_key_9_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> current_key_9_dilations_0 = const()[name = string("current_key_9_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 current_key_9_groups_0 = const()[name = string("current_key_9_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_2_self_attn_k_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(33590528))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(34639168))))[name = string("layers_2_self_attn_k_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> current_key_9_cast_fp16 = conv(dilations = current_key_9_dilations_0, groups = current_key_9_groups_0, pad = current_key_9_pad_0, pad_type = current_key_9_pad_type_0, strides = current_key_9_strides_0, weight = layers_2_self_attn_k_proj_weight_to_fp16_palettized, x = obj_21_cast_fp16)[name = string("current_key_9_cast_fp16")];
+            string current_value_5_pad_type_0 = const()[name = string("current_value_5_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> current_value_5_strides_0 = const()[name = string("current_value_5_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> current_value_5_pad_0 = const()[name = string("current_value_5_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> current_value_5_dilations_0 = const()[name = string("current_value_5_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 current_value_5_groups_0 = const()[name = string("current_value_5_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_2_self_attn_v_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(34639744))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(35688384))))[name = string("layers_2_self_attn_v_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> current_value_5_cast_fp16 = conv(bias = layers_0_self_attn_v_proj_bias_to_fp16, dilations = current_value_5_dilations_0, groups = current_value_5_groups_0, pad = current_value_5_pad_0, pad_type = current_value_5_pad_type_0, strides = current_value_5_strides_0, weight = layers_2_self_attn_v_proj_weight_to_fp16_palettized, x = obj_21_cast_fp16)[name = string("current_value_5_cast_fp16")];
+            tensor<int32, [4]> var_942 = const()[name = string("op_942"), val = tensor<int32, [4]>([16, 128, 1, 1])];
+            tensor<fp16, [16, 128, 1, 1]> inputs_17_cast_fp16 = reshape(shape = var_942, x = query_13_cast_fp16)[name = string("inputs_17_cast_fp16")];
+            tensor<fp16, [16, 128, 1, 1]> inputs_sq_19_cast_fp16 = mul(x = inputs_17_cast_fp16, y = inputs_17_cast_fp16)[name = string("inputs_sq_19_cast_fp16")];
+            tensor<int32, [1]> variance_19_axes_0 = const()[name = string("variance_19_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_19_keep_dims_0 = const()[name = string("variance_19_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [16, 1, 1, 1]> variance_19_cast_fp16 = reduce_mean(axes = variance_19_axes_0, keep_dims = variance_19_keep_dims_0, x = inputs_sq_19_cast_fp16)[name = string("variance_19_cast_fp16")];
+            fp16 var_948_to_fp16 = const()[name = string("op_948_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [16, 1, 1, 1]> var_949_cast_fp16 = add(x = variance_19_cast_fp16, y = var_948_to_fp16)[name = string("op_949_cast_fp16")];
+            fp32 var_950_epsilon_0 = const()[name = string("op_950_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [16, 1, 1, 1]> var_950_cast_fp16 = rsqrt(epsilon = var_950_epsilon_0, x = var_949_cast_fp16)[name = string("op_950_cast_fp16")];
+            tensor<fp16, [16, 128, 1, 1]> hidden_states_23_cast_fp16 = mul(x = inputs_17_cast_fp16, y = var_950_cast_fp16)[name = string("hidden_states_23_cast_fp16")];
+            tensor<fp16, [1, 128, 1, 1]> w_19_to_fp16 = const()[name = string("w_19_to_fp16"), val = tensor<fp16, [1, 128, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(35688960)))];
+            tensor<fp16, [16, 128, 1, 1]> query_normed_5_cast_fp16 = mul(x = w_19_to_fp16, y = hidden_states_23_cast_fp16)[name = string("query_normed_5_cast_fp16")];
+            tensor<int32, [4]> var_958 = const()[name = string("op_958"), val = tensor<int32, [4]>([8, 128, 1, 1])];
+            tensor<fp16, [8, 128, 1, 1]> inputs_19_cast_fp16 = reshape(shape = var_958, x = current_key_9_cast_fp16)[name = string("inputs_19_cast_fp16")];
+            tensor<fp16, [8, 128, 1, 1]> inputs_sq_21_cast_fp16 = mul(x = inputs_19_cast_fp16, y = inputs_19_cast_fp16)[name = string("inputs_sq_21_cast_fp16")];
+            tensor<int32, [1]> variance_21_axes_0 = const()[name = string("variance_21_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_21_keep_dims_0 = const()[name = string("variance_21_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [8, 1, 1, 1]> variance_21_cast_fp16 = reduce_mean(axes = variance_21_axes_0, keep_dims = variance_21_keep_dims_0, x = inputs_sq_21_cast_fp16)[name = string("variance_21_cast_fp16")];
+            fp16 var_964_to_fp16 = const()[name = string("op_964_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [8, 1, 1, 1]> var_965_cast_fp16 = add(x = variance_21_cast_fp16, y = var_964_to_fp16)[name = string("op_965_cast_fp16")];
+            fp32 var_966_epsilon_0 = const()[name = string("op_966_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [8, 1, 1, 1]> var_966_cast_fp16 = rsqrt(epsilon = var_966_epsilon_0, x = var_965_cast_fp16)[name = string("op_966_cast_fp16")];
+            tensor<fp16, [8, 128, 1, 1]> hidden_states_25_cast_fp16 = mul(x = inputs_19_cast_fp16, y = var_966_cast_fp16)[name = string("hidden_states_25_cast_fp16")];
+            tensor<fp16, [1, 128, 1, 1]> w_21_to_fp16 = const()[name = string("w_21_to_fp16"), val = tensor<fp16, [1, 128, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(35689280)))];
+            tensor<fp16, [8, 128, 1, 1]> current_key_normed_5_cast_fp16 = mul(x = w_21_to_fp16, y = hidden_states_25_cast_fp16)[name = string("current_key_normed_5_cast_fp16")];
+            tensor<int32, [4]> var_984 = const()[name = string("op_984"), val = tensor<int32, [4]>([1, 16, 128, -1])];
+            tensor<fp16, [1, 16, 128, 1]> mh_q_13_cast_fp16 = reshape(shape = var_984, x = query_normed_5_cast_fp16)[name = string("mh_q_13_cast_fp16")];
+            tensor<int32, [4]> var_986 = const()[name = string("op_986"), val = tensor<int32, [4]>([1, 8, 128, -1])];
+            tensor<fp16, [1, 8, 128, 1]> mh_k_9_cast_fp16 = reshape(shape = var_986, x = current_key_normed_5_cast_fp16)[name = string("mh_k_9_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> var_990_cast_fp16 = mul(x = mh_q_13_cast_fp16, y = cos_1_cast_fp16)[name = string("op_990_cast_fp16")];
+            tensor<int32, [4]> var_995_begin_0 = const()[name = string("op_995_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_995_end_0 = const()[name = string("op_995_end_0"), val = tensor<int32, [4]>([1, 16, 64, 1])];
+            tensor<bool, [4]> var_995_end_mask_0 = const()[name = string("op_995_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 16, 64, 1]> var_995_cast_fp16 = slice_by_index(begin = var_995_begin_0, end = var_995_end_0, end_mask = var_995_end_mask_0, x = mh_q_13_cast_fp16)[name = string("op_995_cast_fp16")];
+            tensor<int32, [4]> var_1001_begin_0 = const()[name = string("op_1001_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_1001_end_0 = const()[name = string("op_1001_end_0"), val = tensor<int32, [4]>([1, 16, 128, 1])];
+            tensor<bool, [4]> var_1001_end_mask_0 = const()[name = string("op_1001_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 16, 64, 1]> var_1001_cast_fp16 = slice_by_index(begin = var_1001_begin_0, end = var_1001_end_0, end_mask = var_1001_end_mask_0, x = mh_q_13_cast_fp16)[name = string("op_1001_cast_fp16")];
+            fp16 const_63_promoted_to_fp16 = const()[name = string("const_63_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 16, 64, 1]> var_1003_cast_fp16 = mul(x = var_1001_cast_fp16, y = const_63_promoted_to_fp16)[name = string("op_1003_cast_fp16")];
+            bool var_1005_interleave_0 = const()[name = string("op_1005_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 1]> var_1005_cast_fp16 = concat(axis = var_883, interleave = var_1005_interleave_0, values = (var_1003_cast_fp16, var_995_cast_fp16))[name = string("op_1005_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> var_1006_cast_fp16 = mul(x = var_1005_cast_fp16, y = sin_1_cast_fp16)[name = string("op_1006_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> mh_q_15_cast_fp16 = add(x = var_990_cast_fp16, y = var_1006_cast_fp16)[name = string("mh_q_15_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> var_1008_cast_fp16 = mul(x = mh_k_9_cast_fp16, y = cos_1_cast_fp16)[name = string("op_1008_cast_fp16")];
+            tensor<int32, [4]> var_1013_begin_0 = const()[name = string("op_1013_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_1013_end_0 = const()[name = string("op_1013_end_0"), val = tensor<int32, [4]>([1, 8, 64, 1])];
+            tensor<bool, [4]> var_1013_end_mask_0 = const()[name = string("op_1013_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 8, 64, 1]> var_1013_cast_fp16 = slice_by_index(begin = var_1013_begin_0, end = var_1013_end_0, end_mask = var_1013_end_mask_0, x = mh_k_9_cast_fp16)[name = string("op_1013_cast_fp16")];
+            tensor<int32, [4]> var_1019_begin_0 = const()[name = string("op_1019_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_1019_end_0 = const()[name = string("op_1019_end_0"), val = tensor<int32, [4]>([1, 8, 128, 1])];
+            tensor<bool, [4]> var_1019_end_mask_0 = const()[name = string("op_1019_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 8, 64, 1]> var_1019_cast_fp16 = slice_by_index(begin = var_1019_begin_0, end = var_1019_end_0, end_mask = var_1019_end_mask_0, x = mh_k_9_cast_fp16)[name = string("op_1019_cast_fp16")];
+            fp16 const_66_promoted_to_fp16 = const()[name = string("const_66_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 64, 1]> var_1021_cast_fp16 = mul(x = var_1019_cast_fp16, y = const_66_promoted_to_fp16)[name = string("op_1021_cast_fp16")];
+            bool var_1023_interleave_0 = const()[name = string("op_1023_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 128, 1]> var_1023_cast_fp16 = concat(axis = var_883, interleave = var_1023_interleave_0, values = (var_1021_cast_fp16, var_1013_cast_fp16))[name = string("op_1023_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> var_1024_cast_fp16 = mul(x = var_1023_cast_fp16, y = sin_1_cast_fp16)[name = string("op_1024_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> mh_k_11_cast_fp16 = add(x = var_1008_cast_fp16, y = var_1024_cast_fp16)[name = string("mh_k_11_cast_fp16")];
+            tensor<int32, [4]> var_1028 = const()[name = string("op_1028"), val = tensor<int32, [4]>([1, 1024, 1, 1])];
+            tensor<fp16, [1, 1024, 1, 1]> current_key_11_cast_fp16 = reshape(shape = var_1028, x = mh_k_11_cast_fp16)[name = string("current_key_11_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 16]> var_1035_cast_fp16 = mul(x = var_84_cast_fp16_2, y = var_260_cast_fp16)[name = string("op_1035_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 16]> var_1036_cast_fp16 = mul(x = current_key_11_cast_fp16, y = var_258_cast_fp16)[name = string("op_1036_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 16]> key_15_cast_fp16 = add(x = var_1035_cast_fp16, y = var_1036_cast_fp16)[name = string("key_15_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 16]> var_1039_cast_fp16 = mul(x = var_92_cast_fp16_2, y = var_260_cast_fp16)[name = string("op_1039_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 16]> var_1040_cast_fp16 = mul(x = current_value_5_cast_fp16, y = var_258_cast_fp16)[name = string("op_1040_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 16]> value_9_cast_fp16 = add(x = var_1039_cast_fp16, y = var_1040_cast_fp16)[name = string("value_9_cast_fp16")];
+            tensor<int32, [4]> var_1044 = const()[name = string("op_1044"), val = tensor<int32, [4]>([1, 8, 128, 16])];
+            tensor<fp16, [1, 8, 128, 16]> key_heads_9_cast_fp16 = reshape(shape = var_1044, x = key_15_cast_fp16)[name = string("key_heads_9_cast_fp16")];
+            tensor<int32, [4]> var_1046 = const()[name = string("op_1046"), val = tensor<int32, [4]>([1, 8, 128, 16])];
+            tensor<fp16, [1, 8, 128, 16]> value_heads_9_cast_fp16 = reshape(shape = var_1046, x = value_9_cast_fp16)[name = string("value_heads_9_cast_fp16")];
+            tensor<int32, [4]> var_1049_begin_0 = const()[name = string("op_1049_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_1049_end_0 = const()[name = string("op_1049_end_0"), val = tensor<int32, [4]>([1, 1, 128, 16])];
+            tensor<bool, [4]> var_1049_end_mask_0 = const()[name = string("op_1049_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_1049_cast_fp16 = slice_by_index(begin = var_1049_begin_0, end = var_1049_end_0, end_mask = var_1049_end_mask_0, x = key_heads_9_cast_fp16)[name = string("op_1049_cast_fp16")];
+            tensor<int32, [4]> var_1053_begin_0 = const()[name = string("op_1053_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_1053_end_0 = const()[name = string("op_1053_end_0"), val = tensor<int32, [4]>([1, 1, 128, 16])];
+            tensor<bool, [4]> var_1053_end_mask_0 = const()[name = string("op_1053_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_1053_cast_fp16 = slice_by_index(begin = var_1053_begin_0, end = var_1053_end_0, end_mask = var_1053_end_mask_0, x = value_heads_9_cast_fp16)[name = string("op_1053_cast_fp16")];
+            tensor<int32, [4]> var_1065_begin_0 = const()[name = string("op_1065_begin_0"), val = tensor<int32, [4]>([0, 1, 0, 0])];
+            tensor<int32, [4]> var_1065_end_0 = const()[name = string("op_1065_end_0"), val = tensor<int32, [4]>([1, 2, 128, 16])];
+            tensor<bool, [4]> var_1065_end_mask_0 = const()[name = string("op_1065_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_1065_cast_fp16 = slice_by_index(begin = var_1065_begin_0, end = var_1065_end_0, end_mask = var_1065_end_mask_0, x = key_heads_9_cast_fp16)[name = string("op_1065_cast_fp16")];
+            tensor<int32, [4]> var_1069_begin_0 = const()[name = string("op_1069_begin_0"), val = tensor<int32, [4]>([0, 1, 0, 0])];
+            tensor<int32, [4]> var_1069_end_0 = const()[name = string("op_1069_end_0"), val = tensor<int32, [4]>([1, 2, 128, 16])];
+            tensor<bool, [4]> var_1069_end_mask_0 = const()[name = string("op_1069_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_1069_cast_fp16 = slice_by_index(begin = var_1069_begin_0, end = var_1069_end_0, end_mask = var_1069_end_mask_0, x = value_heads_9_cast_fp16)[name = string("op_1069_cast_fp16")];
+            tensor<int32, [4]> var_1081_begin_0 = const()[name = string("op_1081_begin_0"), val = tensor<int32, [4]>([0, 2, 0, 0])];
+            tensor<int32, [4]> var_1081_end_0 = const()[name = string("op_1081_end_0"), val = tensor<int32, [4]>([1, 3, 128, 16])];
+            tensor<bool, [4]> var_1081_end_mask_0 = const()[name = string("op_1081_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_1081_cast_fp16 = slice_by_index(begin = var_1081_begin_0, end = var_1081_end_0, end_mask = var_1081_end_mask_0, x = key_heads_9_cast_fp16)[name = string("op_1081_cast_fp16")];
+            tensor<int32, [4]> var_1085_begin_0 = const()[name = string("op_1085_begin_0"), val = tensor<int32, [4]>([0, 2, 0, 0])];
+            tensor<int32, [4]> var_1085_end_0 = const()[name = string("op_1085_end_0"), val = tensor<int32, [4]>([1, 3, 128, 16])];
+            tensor<bool, [4]> var_1085_end_mask_0 = const()[name = string("op_1085_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_1085_cast_fp16 = slice_by_index(begin = var_1085_begin_0, end = var_1085_end_0, end_mask = var_1085_end_mask_0, x = value_heads_9_cast_fp16)[name = string("op_1085_cast_fp16")];
+            tensor<int32, [4]> var_1097_begin_0 = const()[name = string("op_1097_begin_0"), val = tensor<int32, [4]>([0, 3, 0, 0])];
+            tensor<int32, [4]> var_1097_end_0 = const()[name = string("op_1097_end_0"), val = tensor<int32, [4]>([1, 4, 128, 16])];
+            tensor<bool, [4]> var_1097_end_mask_0 = const()[name = string("op_1097_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_1097_cast_fp16 = slice_by_index(begin = var_1097_begin_0, end = var_1097_end_0, end_mask = var_1097_end_mask_0, x = key_heads_9_cast_fp16)[name = string("op_1097_cast_fp16")];
+            tensor<int32, [4]> var_1101_begin_0 = const()[name = string("op_1101_begin_0"), val = tensor<int32, [4]>([0, 3, 0, 0])];
+            tensor<int32, [4]> var_1101_end_0 = const()[name = string("op_1101_end_0"), val = tensor<int32, [4]>([1, 4, 128, 16])];
+            tensor<bool, [4]> var_1101_end_mask_0 = const()[name = string("op_1101_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_1101_cast_fp16 = slice_by_index(begin = var_1101_begin_0, end = var_1101_end_0, end_mask = var_1101_end_mask_0, x = value_heads_9_cast_fp16)[name = string("op_1101_cast_fp16")];
+            tensor<int32, [4]> var_1113_begin_0 = const()[name = string("op_1113_begin_0"), val = tensor<int32, [4]>([0, 4, 0, 0])];
+            tensor<int32, [4]> var_1113_end_0 = const()[name = string("op_1113_end_0"), val = tensor<int32, [4]>([1, 5, 128, 16])];
+            tensor<bool, [4]> var_1113_end_mask_0 = const()[name = string("op_1113_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_1113_cast_fp16 = slice_by_index(begin = var_1113_begin_0, end = var_1113_end_0, end_mask = var_1113_end_mask_0, x = key_heads_9_cast_fp16)[name = string("op_1113_cast_fp16")];
+            tensor<int32, [4]> var_1117_begin_0 = const()[name = string("op_1117_begin_0"), val = tensor<int32, [4]>([0, 4, 0, 0])];
+            tensor<int32, [4]> var_1117_end_0 = const()[name = string("op_1117_end_0"), val = tensor<int32, [4]>([1, 5, 128, 16])];
+            tensor<bool, [4]> var_1117_end_mask_0 = const()[name = string("op_1117_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_1117_cast_fp16 = slice_by_index(begin = var_1117_begin_0, end = var_1117_end_0, end_mask = var_1117_end_mask_0, x = value_heads_9_cast_fp16)[name = string("op_1117_cast_fp16")];
+            tensor<int32, [4]> var_1129_begin_0 = const()[name = string("op_1129_begin_0"), val = tensor<int32, [4]>([0, 5, 0, 0])];
+            tensor<int32, [4]> var_1129_end_0 = const()[name = string("op_1129_end_0"), val = tensor<int32, [4]>([1, 6, 128, 16])];
+            tensor<bool, [4]> var_1129_end_mask_0 = const()[name = string("op_1129_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_1129_cast_fp16 = slice_by_index(begin = var_1129_begin_0, end = var_1129_end_0, end_mask = var_1129_end_mask_0, x = key_heads_9_cast_fp16)[name = string("op_1129_cast_fp16")];
+            tensor<int32, [4]> var_1133_begin_0 = const()[name = string("op_1133_begin_0"), val = tensor<int32, [4]>([0, 5, 0, 0])];
+            tensor<int32, [4]> var_1133_end_0 = const()[name = string("op_1133_end_0"), val = tensor<int32, [4]>([1, 6, 128, 16])];
+            tensor<bool, [4]> var_1133_end_mask_0 = const()[name = string("op_1133_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_1133_cast_fp16 = slice_by_index(begin = var_1133_begin_0, end = var_1133_end_0, end_mask = var_1133_end_mask_0, x = value_heads_9_cast_fp16)[name = string("op_1133_cast_fp16")];
+            tensor<int32, [4]> var_1145_begin_0 = const()[name = string("op_1145_begin_0"), val = tensor<int32, [4]>([0, 6, 0, 0])];
+            tensor<int32, [4]> var_1145_end_0 = const()[name = string("op_1145_end_0"), val = tensor<int32, [4]>([1, 7, 128, 16])];
+            tensor<bool, [4]> var_1145_end_mask_0 = const()[name = string("op_1145_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_1145_cast_fp16 = slice_by_index(begin = var_1145_begin_0, end = var_1145_end_0, end_mask = var_1145_end_mask_0, x = key_heads_9_cast_fp16)[name = string("op_1145_cast_fp16")];
+            tensor<int32, [4]> var_1149_begin_0 = const()[name = string("op_1149_begin_0"), val = tensor<int32, [4]>([0, 6, 0, 0])];
+            tensor<int32, [4]> var_1149_end_0 = const()[name = string("op_1149_end_0"), val = tensor<int32, [4]>([1, 7, 128, 16])];
+            tensor<bool, [4]> var_1149_end_mask_0 = const()[name = string("op_1149_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_1149_cast_fp16 = slice_by_index(begin = var_1149_begin_0, end = var_1149_end_0, end_mask = var_1149_end_mask_0, x = value_heads_9_cast_fp16)[name = string("op_1149_cast_fp16")];
+            tensor<int32, [4]> var_1161_begin_0 = const()[name = string("op_1161_begin_0"), val = tensor<int32, [4]>([0, 7, 0, 0])];
+            tensor<int32, [4]> var_1161_end_0 = const()[name = string("op_1161_end_0"), val = tensor<int32, [4]>([1, 1, 128, 16])];
+            tensor<bool, [4]> var_1161_end_mask_0 = const()[name = string("op_1161_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_1161_cast_fp16 = slice_by_index(begin = var_1161_begin_0, end = var_1161_end_0, end_mask = var_1161_end_mask_0, x = key_heads_9_cast_fp16)[name = string("op_1161_cast_fp16")];
+            tensor<int32, [4]> var_1165_begin_0 = const()[name = string("op_1165_begin_0"), val = tensor<int32, [4]>([0, 7, 0, 0])];
+            tensor<int32, [4]> var_1165_end_0 = const()[name = string("op_1165_end_0"), val = tensor<int32, [4]>([1, 1, 128, 16])];
+            tensor<bool, [4]> var_1165_end_mask_0 = const()[name = string("op_1165_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_1165_cast_fp16 = slice_by_index(begin = var_1165_begin_0, end = var_1165_end_0, end_mask = var_1165_end_mask_0, x = value_heads_9_cast_fp16)[name = string("op_1165_cast_fp16")];
+            bool key_heads_11_interleave_0 = const()[name = string("key_heads_11_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 16]> key_heads_11_cast_fp16 = concat(axis = var_891, interleave = key_heads_11_interleave_0, values = (var_1049_cast_fp16, var_1049_cast_fp16, var_1065_cast_fp16, var_1065_cast_fp16, var_1081_cast_fp16, var_1081_cast_fp16, var_1097_cast_fp16, var_1097_cast_fp16, var_1113_cast_fp16, var_1113_cast_fp16, var_1129_cast_fp16, var_1129_cast_fp16, var_1145_cast_fp16, var_1145_cast_fp16, var_1161_cast_fp16, var_1161_cast_fp16))[name = string("key_heads_11_cast_fp16")];
+            bool value_heads_11_interleave_0 = const()[name = string("value_heads_11_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 16]> value_heads_11_cast_fp16 = concat(axis = var_891, interleave = value_heads_11_interleave_0, values = (var_1053_cast_fp16, var_1053_cast_fp16, var_1069_cast_fp16, var_1069_cast_fp16, var_1085_cast_fp16, var_1085_cast_fp16, var_1101_cast_fp16, var_1101_cast_fp16, var_1117_cast_fp16, var_1117_cast_fp16, var_1133_cast_fp16, var_1133_cast_fp16, var_1149_cast_fp16, var_1149_cast_fp16, var_1165_cast_fp16, var_1165_cast_fp16))[name = string("value_heads_11_cast_fp16")];
+            fp16 var_1188_to_fp16 = const()[name = string("op_1188_to_fp16"), val = fp16(0x1.6ap-4)];
+            tensor<fp16, [1, 16, 128, 1]> var_1189_cast_fp16 = mul(x = mh_q_15_cast_fp16, y = var_1188_to_fp16)[name = string("op_1189_cast_fp16")];
+            bool mh_w_9_transpose_x_0 = const()[name = string("mh_w_9_transpose_x_0"), val = bool(true)];
+            bool mh_w_9_transpose_y_0 = const()[name = string("mh_w_9_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 1, 16]> mh_w_9_cast_fp16 = matmul(transpose_x = mh_w_9_transpose_x_0, transpose_y = mh_w_9_transpose_y_0, x = var_1189_cast_fp16, y = key_heads_11_cast_fp16)[name = string("mh_w_9_cast_fp16")];
+            tensor<fp16, [1, 16, 1, 16]> mh_w_11_cast_fp16 = add(x = mh_w_9_cast_fp16, y = var_424_cast_fp16)[name = string("mh_w_11_cast_fp16")];
+            tensor<fp16, [1, 16, 1, 16]> var_1201_cast_fp16 = softmax(axis = var_873, x = mh_w_11_cast_fp16)[name = string("op_1201_cast_fp16")];
+            bool attn_5_transpose_x_0 = const()[name = string("attn_5_transpose_x_0"), val = bool(false)];
+            bool attn_5_transpose_y_0 = const()[name = string("attn_5_transpose_y_0"), val = bool(true)];
+            tensor<fp16, [1, 16, 128, 1]> attn_5_cast_fp16 = matmul(transpose_x = attn_5_transpose_x_0, transpose_y = attn_5_transpose_y_0, x = value_heads_11_cast_fp16, y = var_1201_cast_fp16)[name = string("attn_5_cast_fp16")];
+            tensor<int32, [4]> var_1206 = const()[name = string("op_1206"), val = tensor<int32, [4]>([1, -1, 1, 1])];
+            tensor<fp16, [1, 2048, 1, 1]> input_17_cast_fp16 = reshape(shape = var_1206, x = attn_5_cast_fp16)[name = string("input_17_cast_fp16")];
+            string obj_27_pad_type_0 = const()[name = string("obj_27_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> obj_27_strides_0 = const()[name = string("obj_27_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> obj_27_pad_0 = const()[name = string("obj_27_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> obj_27_dilations_0 = const()[name = string("obj_27_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 obj_27_groups_0 = const()[name = string("obj_27_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 2048, 1, 1]> layers_2_self_attn_o_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(35689600))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(37786816))))[name = string("layers_2_self_attn_o_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> obj_27_cast_fp16 = conv(bias = layers_0_self_attn_v_proj_bias_to_fp16, dilations = obj_27_dilations_0, groups = obj_27_groups_0, pad = obj_27_pad_0, pad_type = obj_27_pad_type_0, strides = obj_27_strides_0, weight = layers_2_self_attn_o_proj_weight_to_fp16_palettized, x = input_17_cast_fp16)[name = string("obj_27_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> inputs_21_cast_fp16 = add(x = inputs_15_cast_fp16, y = obj_27_cast_fp16)[name = string("inputs_21_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> inputs_sq_23_cast_fp16 = mul(x = inputs_21_cast_fp16, y = inputs_21_cast_fp16)[name = string("inputs_sq_23_cast_fp16")];
+            tensor<int32, [1]> variance_23_axes_0 = const()[name = string("variance_23_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_23_keep_dims_0 = const()[name = string("variance_23_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 1, 1, 1]> variance_23_cast_fp16 = reduce_mean(axes = variance_23_axes_0, keep_dims = variance_23_keep_dims_0, x = inputs_sq_23_cast_fp16)[name = string("variance_23_cast_fp16")];
+            fp16 var_1224_to_fp16 = const()[name = string("op_1224_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 1, 1]> var_1225_cast_fp16 = add(x = variance_23_cast_fp16, y = var_1224_to_fp16)[name = string("op_1225_cast_fp16")];
+            fp32 var_1226_epsilon_0 = const()[name = string("op_1226_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 1, 1, 1]> var_1226_cast_fp16 = rsqrt(epsilon = var_1226_epsilon_0, x = var_1225_cast_fp16)[name = string("op_1226_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> hidden_states_27_cast_fp16 = mul(x = inputs_21_cast_fp16, y = var_1226_cast_fp16)[name = string("hidden_states_27_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> w_23_to_fp16 = const()[name = string("w_23_to_fp16"), val = tensor<fp16, [1, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(37787392)))];
+            tensor<fp16, [1, 1024, 1, 1]> input_19_cast_fp16 = mul(x = w_23_to_fp16, y = hidden_states_27_cast_fp16)[name = string("input_19_cast_fp16")];
+            string input_21_pad_type_0 = const()[name = string("input_21_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> input_21_strides_0 = const()[name = string("input_21_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> input_21_pad_0 = const()[name = string("input_21_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> input_21_dilations_0 = const()[name = string("input_21_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 input_21_groups_0 = const()[name = string("input_21_groups_0"), val = int32(1)];
+            tensor<fp16, [3072, 1024, 1, 1]> layers_2_mlp_gate_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [3072, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(37789504))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(40935296))))[name = string("layers_2_mlp_gate_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 3072, 1, 1]> input_21_cast_fp16 = conv(dilations = input_21_dilations_0, groups = input_21_groups_0, pad = input_21_pad_0, pad_type = input_21_pad_type_0, strides = input_21_strides_0, weight = layers_2_mlp_gate_proj_weight_to_fp16_palettized, x = input_19_cast_fp16)[name = string("input_21_cast_fp16")];
+            tensor<fp16, [1, 3072, 1, 1]> var_1240_cast_fp16 = silu(x = input_21_cast_fp16)[name = string("op_1240_cast_fp16")];
+            string var_1246_pad_type_0 = const()[name = string("op_1246_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_1246_strides_0 = const()[name = string("op_1246_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_1246_pad_0 = const()[name = string("op_1246_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_1246_dilations_0 = const()[name = string("op_1246_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_1246_groups_0 = const()[name = string("op_1246_groups_0"), val = int32(1)];
+            tensor<fp16, [3072, 1024, 1, 1]> layers_2_mlp_up_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [3072, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(40935872))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(44081664))))[name = string("layers_2_mlp_up_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 3072, 1, 1]> var_1246_cast_fp16 = conv(dilations = var_1246_dilations_0, groups = var_1246_groups_0, pad = var_1246_pad_0, pad_type = var_1246_pad_type_0, strides = var_1246_strides_0, weight = layers_2_mlp_up_proj_weight_to_fp16_palettized, x = input_19_cast_fp16)[name = string("op_1246_cast_fp16")];
+            tensor<fp16, [1, 3072, 1, 1]> input_23_cast_fp16 = mul(x = var_1240_cast_fp16, y = var_1246_cast_fp16)[name = string("input_23_cast_fp16")];
+            string hidden_states_29_pad_type_0 = const()[name = string("hidden_states_29_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> hidden_states_29_strides_0 = const()[name = string("hidden_states_29_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> hidden_states_29_pad_0 = const()[name = string("hidden_states_29_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> hidden_states_29_dilations_0 = const()[name = string("hidden_states_29_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 hidden_states_29_groups_0 = const()[name = string("hidden_states_29_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 3072, 1, 1]> layers_2_mlp_down_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 3072, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(44082240))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(47228032))))[name = string("layers_2_mlp_down_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> hidden_states_29_cast_fp16 = conv(dilations = hidden_states_29_dilations_0, groups = hidden_states_29_groups_0, pad = hidden_states_29_pad_0, pad_type = hidden_states_29_pad_type_0, strides = hidden_states_29_strides_0, weight = layers_2_mlp_down_proj_weight_to_fp16_palettized, x = input_23_cast_fp16)[name = string("hidden_states_29_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> inputs_23_cast_fp16 = add(x = inputs_21_cast_fp16, y = hidden_states_29_cast_fp16)[name = string("inputs_23_cast_fp16")];
+            int32 var_1260 = const()[name = string("op_1260"), val = int32(3)];
+            int32 var_1270 = const()[name = string("op_1270"), val = int32(-2)];
+            int32 var_1278 = const()[name = string("op_1278"), val = int32(1)];
+            tensor<fp16, [1, 1024, 1, 1]> inputs_sq_25_cast_fp16 = mul(x = inputs_23_cast_fp16, y = inputs_23_cast_fp16)[name = string("inputs_sq_25_cast_fp16")];
+            tensor<int32, [1]> variance_25_axes_0 = const()[name = string("variance_25_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_25_keep_dims_0 = const()[name = string("variance_25_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 1, 1, 1]> variance_25_cast_fp16 = reduce_mean(axes = variance_25_axes_0, keep_dims = variance_25_keep_dims_0, x = inputs_sq_25_cast_fp16)[name = string("variance_25_cast_fp16")];
+            fp16 var_1290_to_fp16 = const()[name = string("op_1290_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 1, 1]> var_1291_cast_fp16 = add(x = variance_25_cast_fp16, y = var_1290_to_fp16)[name = string("op_1291_cast_fp16")];
+            fp32 var_1292_epsilon_0 = const()[name = string("op_1292_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 1, 1, 1]> var_1292_cast_fp16 = rsqrt(epsilon = var_1292_epsilon_0, x = var_1291_cast_fp16)[name = string("op_1292_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> hidden_states_31_cast_fp16 = mul(x = inputs_23_cast_fp16, y = var_1292_cast_fp16)[name = string("hidden_states_31_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> w_25_to_fp16 = const()[name = string("w_25_to_fp16"), val = tensor<fp16, [1, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(47228608)))];
+            tensor<fp16, [1, 1024, 1, 1]> obj_29_cast_fp16 = mul(x = w_25_to_fp16, y = hidden_states_31_cast_fp16)[name = string("obj_29_cast_fp16")];
+            string query_19_pad_type_0 = const()[name = string("query_19_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> query_19_strides_0 = const()[name = string("query_19_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> query_19_pad_0 = const()[name = string("query_19_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> query_19_dilations_0 = const()[name = string("query_19_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 query_19_groups_0 = const()[name = string("query_19_groups_0"), val = int32(1)];
+            tensor<fp16, [2048, 1024, 1, 1]> layers_3_self_attn_q_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [2048, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(47230720))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(49327936))))[name = string("layers_3_self_attn_q_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1, 1]> query_19_cast_fp16 = conv(bias = layers_0_self_attn_q_proj_bias_to_fp16, dilations = query_19_dilations_0, groups = query_19_groups_0, pad = query_19_pad_0, pad_type = query_19_pad_type_0, strides = query_19_strides_0, weight = layers_3_self_attn_q_proj_weight_to_fp16_palettized, x = obj_29_cast_fp16)[name = string("query_19_cast_fp16")];
+            string current_key_13_pad_type_0 = const()[name = string("current_key_13_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> current_key_13_strides_0 = const()[name = string("current_key_13_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> current_key_13_pad_0 = const()[name = string("current_key_13_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> current_key_13_dilations_0 = const()[name = string("current_key_13_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 current_key_13_groups_0 = const()[name = string("current_key_13_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_3_self_attn_k_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(49328512))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(50377152))))[name = string("layers_3_self_attn_k_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> current_key_13_cast_fp16 = conv(dilations = current_key_13_dilations_0, groups = current_key_13_groups_0, pad = current_key_13_pad_0, pad_type = current_key_13_pad_type_0, strides = current_key_13_strides_0, weight = layers_3_self_attn_k_proj_weight_to_fp16_palettized, x = obj_29_cast_fp16)[name = string("current_key_13_cast_fp16")];
+            string current_value_7_pad_type_0 = const()[name = string("current_value_7_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> current_value_7_strides_0 = const()[name = string("current_value_7_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> current_value_7_pad_0 = const()[name = string("current_value_7_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> current_value_7_dilations_0 = const()[name = string("current_value_7_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 current_value_7_groups_0 = const()[name = string("current_value_7_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_3_self_attn_v_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(50377728))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(51426368))))[name = string("layers_3_self_attn_v_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> current_value_7_cast_fp16 = conv(bias = layers_0_self_attn_v_proj_bias_to_fp16, dilations = current_value_7_dilations_0, groups = current_value_7_groups_0, pad = current_value_7_pad_0, pad_type = current_value_7_pad_type_0, strides = current_value_7_strides_0, weight = layers_3_self_attn_v_proj_weight_to_fp16_palettized, x = obj_29_cast_fp16)[name = string("current_value_7_cast_fp16")];
+            tensor<int32, [4]> var_1329 = const()[name = string("op_1329"), val = tensor<int32, [4]>([16, 128, 1, 1])];
+            tensor<fp16, [16, 128, 1, 1]> inputs_25_cast_fp16 = reshape(shape = var_1329, x = query_19_cast_fp16)[name = string("inputs_25_cast_fp16")];
+            tensor<fp16, [16, 128, 1, 1]> inputs_sq_27_cast_fp16 = mul(x = inputs_25_cast_fp16, y = inputs_25_cast_fp16)[name = string("inputs_sq_27_cast_fp16")];
+            tensor<int32, [1]> variance_27_axes_0 = const()[name = string("variance_27_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_27_keep_dims_0 = const()[name = string("variance_27_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [16, 1, 1, 1]> variance_27_cast_fp16 = reduce_mean(axes = variance_27_axes_0, keep_dims = variance_27_keep_dims_0, x = inputs_sq_27_cast_fp16)[name = string("variance_27_cast_fp16")];
+            fp16 var_1335_to_fp16 = const()[name = string("op_1335_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [16, 1, 1, 1]> var_1336_cast_fp16 = add(x = variance_27_cast_fp16, y = var_1335_to_fp16)[name = string("op_1336_cast_fp16")];
+            fp32 var_1337_epsilon_0 = const()[name = string("op_1337_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [16, 1, 1, 1]> var_1337_cast_fp16 = rsqrt(epsilon = var_1337_epsilon_0, x = var_1336_cast_fp16)[name = string("op_1337_cast_fp16")];
+            tensor<fp16, [16, 128, 1, 1]> hidden_states_33_cast_fp16 = mul(x = inputs_25_cast_fp16, y = var_1337_cast_fp16)[name = string("hidden_states_33_cast_fp16")];
+            tensor<fp16, [1, 128, 1, 1]> w_27_to_fp16 = const()[name = string("w_27_to_fp16"), val = tensor<fp16, [1, 128, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(51426944)))];
+            tensor<fp16, [16, 128, 1, 1]> query_normed_7_cast_fp16 = mul(x = w_27_to_fp16, y = hidden_states_33_cast_fp16)[name = string("query_normed_7_cast_fp16")];
+            tensor<int32, [4]> var_1345 = const()[name = string("op_1345"), val = tensor<int32, [4]>([8, 128, 1, 1])];
+            tensor<fp16, [8, 128, 1, 1]> inputs_27_cast_fp16 = reshape(shape = var_1345, x = current_key_13_cast_fp16)[name = string("inputs_27_cast_fp16")];
+            tensor<fp16, [8, 128, 1, 1]> inputs_sq_29_cast_fp16 = mul(x = inputs_27_cast_fp16, y = inputs_27_cast_fp16)[name = string("inputs_sq_29_cast_fp16")];
+            tensor<int32, [1]> variance_29_axes_0 = const()[name = string("variance_29_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_29_keep_dims_0 = const()[name = string("variance_29_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [8, 1, 1, 1]> variance_29_cast_fp16 = reduce_mean(axes = variance_29_axes_0, keep_dims = variance_29_keep_dims_0, x = inputs_sq_29_cast_fp16)[name = string("variance_29_cast_fp16")];
+            fp16 var_1351_to_fp16 = const()[name = string("op_1351_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [8, 1, 1, 1]> var_1352_cast_fp16 = add(x = variance_29_cast_fp16, y = var_1351_to_fp16)[name = string("op_1352_cast_fp16")];
+            fp32 var_1353_epsilon_0 = const()[name = string("op_1353_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [8, 1, 1, 1]> var_1353_cast_fp16 = rsqrt(epsilon = var_1353_epsilon_0, x = var_1352_cast_fp16)[name = string("op_1353_cast_fp16")];
+            tensor<fp16, [8, 128, 1, 1]> hidden_states_35_cast_fp16 = mul(x = inputs_27_cast_fp16, y = var_1353_cast_fp16)[name = string("hidden_states_35_cast_fp16")];
+            tensor<fp16, [1, 128, 1, 1]> w_29_to_fp16 = const()[name = string("w_29_to_fp16"), val = tensor<fp16, [1, 128, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(51427264)))];
+            tensor<fp16, [8, 128, 1, 1]> current_key_normed_7_cast_fp16 = mul(x = w_29_to_fp16, y = hidden_states_35_cast_fp16)[name = string("current_key_normed_7_cast_fp16")];
+            tensor<int32, [4]> var_1371 = const()[name = string("op_1371"), val = tensor<int32, [4]>([1, 16, 128, -1])];
+            tensor<fp16, [1, 16, 128, 1]> mh_q_19_cast_fp16 = reshape(shape = var_1371, x = query_normed_7_cast_fp16)[name = string("mh_q_19_cast_fp16")];
+            tensor<int32, [4]> var_1373 = const()[name = string("op_1373"), val = tensor<int32, [4]>([1, 8, 128, -1])];
+            tensor<fp16, [1, 8, 128, 1]> mh_k_13_cast_fp16 = reshape(shape = var_1373, x = current_key_normed_7_cast_fp16)[name = string("mh_k_13_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> var_1377_cast_fp16 = mul(x = mh_q_19_cast_fp16, y = cos_1_cast_fp16)[name = string("op_1377_cast_fp16")];
+            tensor<int32, [4]> var_1382_begin_0 = const()[name = string("op_1382_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_1382_end_0 = const()[name = string("op_1382_end_0"), val = tensor<int32, [4]>([1, 16, 64, 1])];
+            tensor<bool, [4]> var_1382_end_mask_0 = const()[name = string("op_1382_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 16, 64, 1]> var_1382_cast_fp16 = slice_by_index(begin = var_1382_begin_0, end = var_1382_end_0, end_mask = var_1382_end_mask_0, x = mh_q_19_cast_fp16)[name = string("op_1382_cast_fp16")];
+            tensor<int32, [4]> var_1388_begin_0 = const()[name = string("op_1388_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_1388_end_0 = const()[name = string("op_1388_end_0"), val = tensor<int32, [4]>([1, 16, 128, 1])];
+            tensor<bool, [4]> var_1388_end_mask_0 = const()[name = string("op_1388_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 16, 64, 1]> var_1388_cast_fp16 = slice_by_index(begin = var_1388_begin_0, end = var_1388_end_0, end_mask = var_1388_end_mask_0, x = mh_q_19_cast_fp16)[name = string("op_1388_cast_fp16")];
+            fp16 const_86_promoted_to_fp16 = const()[name = string("const_86_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 16, 64, 1]> var_1390_cast_fp16 = mul(x = var_1388_cast_fp16, y = const_86_promoted_to_fp16)[name = string("op_1390_cast_fp16")];
+            bool var_1392_interleave_0 = const()[name = string("op_1392_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 1]> var_1392_cast_fp16 = concat(axis = var_1270, interleave = var_1392_interleave_0, values = (var_1390_cast_fp16, var_1382_cast_fp16))[name = string("op_1392_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> var_1393_cast_fp16 = mul(x = var_1392_cast_fp16, y = sin_1_cast_fp16)[name = string("op_1393_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> mh_q_21_cast_fp16 = add(x = var_1377_cast_fp16, y = var_1393_cast_fp16)[name = string("mh_q_21_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> var_1395_cast_fp16 = mul(x = mh_k_13_cast_fp16, y = cos_1_cast_fp16)[name = string("op_1395_cast_fp16")];
+            tensor<int32, [4]> var_1400_begin_0 = const()[name = string("op_1400_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_1400_end_0 = const()[name = string("op_1400_end_0"), val = tensor<int32, [4]>([1, 8, 64, 1])];
+            tensor<bool, [4]> var_1400_end_mask_0 = const()[name = string("op_1400_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 8, 64, 1]> var_1400_cast_fp16 = slice_by_index(begin = var_1400_begin_0, end = var_1400_end_0, end_mask = var_1400_end_mask_0, x = mh_k_13_cast_fp16)[name = string("op_1400_cast_fp16")];
+            tensor<int32, [4]> var_1406_begin_0 = const()[name = string("op_1406_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_1406_end_0 = const()[name = string("op_1406_end_0"), val = tensor<int32, [4]>([1, 8, 128, 1])];
+            tensor<bool, [4]> var_1406_end_mask_0 = const()[name = string("op_1406_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 8, 64, 1]> var_1406_cast_fp16 = slice_by_index(begin = var_1406_begin_0, end = var_1406_end_0, end_mask = var_1406_end_mask_0, x = mh_k_13_cast_fp16)[name = string("op_1406_cast_fp16")];
+            fp16 const_89_promoted_to_fp16 = const()[name = string("const_89_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 64, 1]> var_1408_cast_fp16 = mul(x = var_1406_cast_fp16, y = const_89_promoted_to_fp16)[name = string("op_1408_cast_fp16")];
+            bool var_1410_interleave_0 = const()[name = string("op_1410_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 128, 1]> var_1410_cast_fp16 = concat(axis = var_1270, interleave = var_1410_interleave_0, values = (var_1408_cast_fp16, var_1400_cast_fp16))[name = string("op_1410_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> var_1411_cast_fp16 = mul(x = var_1410_cast_fp16, y = sin_1_cast_fp16)[name = string("op_1411_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> mh_k_15_cast_fp16 = add(x = var_1395_cast_fp16, y = var_1411_cast_fp16)[name = string("mh_k_15_cast_fp16")];
+            tensor<int32, [4]> var_1415 = const()[name = string("op_1415"), val = tensor<int32, [4]>([1, 1024, 1, 1])];
+            tensor<fp16, [1, 1024, 1, 1]> current_key_15_cast_fp16 = reshape(shape = var_1415, x = mh_k_15_cast_fp16)[name = string("current_key_15_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 16]> var_1422_cast_fp16 = mul(x = var_84_cast_fp16_3, y = var_260_cast_fp16)[name = string("op_1422_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 16]> var_1423_cast_fp16 = mul(x = current_key_15_cast_fp16, y = var_258_cast_fp16)[name = string("op_1423_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 16]> key_21_cast_fp16 = add(x = var_1422_cast_fp16, y = var_1423_cast_fp16)[name = string("key_21_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 16]> var_1426_cast_fp16 = mul(x = var_92_cast_fp16_3, y = var_260_cast_fp16)[name = string("op_1426_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 16]> var_1427_cast_fp16 = mul(x = current_value_7_cast_fp16, y = var_258_cast_fp16)[name = string("op_1427_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 16]> value_13_cast_fp16 = add(x = var_1426_cast_fp16, y = var_1427_cast_fp16)[name = string("value_13_cast_fp16")];
+            tensor<int32, [4]> var_1431 = const()[name = string("op_1431"), val = tensor<int32, [4]>([1, 8, 128, 16])];
+            tensor<fp16, [1, 8, 128, 16]> key_heads_13_cast_fp16 = reshape(shape = var_1431, x = key_21_cast_fp16)[name = string("key_heads_13_cast_fp16")];
+            tensor<int32, [4]> var_1433 = const()[name = string("op_1433"), val = tensor<int32, [4]>([1, 8, 128, 16])];
+            tensor<fp16, [1, 8, 128, 16]> value_heads_13_cast_fp16 = reshape(shape = var_1433, x = value_13_cast_fp16)[name = string("value_heads_13_cast_fp16")];
+            tensor<int32, [4]> var_1436_begin_0 = const()[name = string("op_1436_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_1436_end_0 = const()[name = string("op_1436_end_0"), val = tensor<int32, [4]>([1, 1, 128, 16])];
+            tensor<bool, [4]> var_1436_end_mask_0 = const()[name = string("op_1436_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_1436_cast_fp16 = slice_by_index(begin = var_1436_begin_0, end = var_1436_end_0, end_mask = var_1436_end_mask_0, x = key_heads_13_cast_fp16)[name = string("op_1436_cast_fp16")];
+            tensor<int32, [4]> var_1440_begin_0 = const()[name = string("op_1440_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_1440_end_0 = const()[name = string("op_1440_end_0"), val = tensor<int32, [4]>([1, 1, 128, 16])];
+            tensor<bool, [4]> var_1440_end_mask_0 = const()[name = string("op_1440_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_1440_cast_fp16 = slice_by_index(begin = var_1440_begin_0, end = var_1440_end_0, end_mask = var_1440_end_mask_0, x = value_heads_13_cast_fp16)[name = string("op_1440_cast_fp16")];
+            tensor<int32, [4]> var_1452_begin_0 = const()[name = string("op_1452_begin_0"), val = tensor<int32, [4]>([0, 1, 0, 0])];
+            tensor<int32, [4]> var_1452_end_0 = const()[name = string("op_1452_end_0"), val = tensor<int32, [4]>([1, 2, 128, 16])];
+            tensor<bool, [4]> var_1452_end_mask_0 = const()[name = string("op_1452_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_1452_cast_fp16 = slice_by_index(begin = var_1452_begin_0, end = var_1452_end_0, end_mask = var_1452_end_mask_0, x = key_heads_13_cast_fp16)[name = string("op_1452_cast_fp16")];
+            tensor<int32, [4]> var_1456_begin_0 = const()[name = string("op_1456_begin_0"), val = tensor<int32, [4]>([0, 1, 0, 0])];
+            tensor<int32, [4]> var_1456_end_0 = const()[name = string("op_1456_end_0"), val = tensor<int32, [4]>([1, 2, 128, 16])];
+            tensor<bool, [4]> var_1456_end_mask_0 = const()[name = string("op_1456_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_1456_cast_fp16 = slice_by_index(begin = var_1456_begin_0, end = var_1456_end_0, end_mask = var_1456_end_mask_0, x = value_heads_13_cast_fp16)[name = string("op_1456_cast_fp16")];
+            tensor<int32, [4]> var_1468_begin_0 = const()[name = string("op_1468_begin_0"), val = tensor<int32, [4]>([0, 2, 0, 0])];
+            tensor<int32, [4]> var_1468_end_0 = const()[name = string("op_1468_end_0"), val = tensor<int32, [4]>([1, 3, 128, 16])];
+            tensor<bool, [4]> var_1468_end_mask_0 = const()[name = string("op_1468_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_1468_cast_fp16 = slice_by_index(begin = var_1468_begin_0, end = var_1468_end_0, end_mask = var_1468_end_mask_0, x = key_heads_13_cast_fp16)[name = string("op_1468_cast_fp16")];
+            tensor<int32, [4]> var_1472_begin_0 = const()[name = string("op_1472_begin_0"), val = tensor<int32, [4]>([0, 2, 0, 0])];
+            tensor<int32, [4]> var_1472_end_0 = const()[name = string("op_1472_end_0"), val = tensor<int32, [4]>([1, 3, 128, 16])];
+            tensor<bool, [4]> var_1472_end_mask_0 = const()[name = string("op_1472_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_1472_cast_fp16 = slice_by_index(begin = var_1472_begin_0, end = var_1472_end_0, end_mask = var_1472_end_mask_0, x = value_heads_13_cast_fp16)[name = string("op_1472_cast_fp16")];
+            tensor<int32, [4]> var_1484_begin_0 = const()[name = string("op_1484_begin_0"), val = tensor<int32, [4]>([0, 3, 0, 0])];
+            tensor<int32, [4]> var_1484_end_0 = const()[name = string("op_1484_end_0"), val = tensor<int32, [4]>([1, 4, 128, 16])];
+            tensor<bool, [4]> var_1484_end_mask_0 = const()[name = string("op_1484_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_1484_cast_fp16 = slice_by_index(begin = var_1484_begin_0, end = var_1484_end_0, end_mask = var_1484_end_mask_0, x = key_heads_13_cast_fp16)[name = string("op_1484_cast_fp16")];
+            tensor<int32, [4]> var_1488_begin_0 = const()[name = string("op_1488_begin_0"), val = tensor<int32, [4]>([0, 3, 0, 0])];
+            tensor<int32, [4]> var_1488_end_0 = const()[name = string("op_1488_end_0"), val = tensor<int32, [4]>([1, 4, 128, 16])];
+            tensor<bool, [4]> var_1488_end_mask_0 = const()[name = string("op_1488_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_1488_cast_fp16 = slice_by_index(begin = var_1488_begin_0, end = var_1488_end_0, end_mask = var_1488_end_mask_0, x = value_heads_13_cast_fp16)[name = string("op_1488_cast_fp16")];
+            tensor<int32, [4]> var_1500_begin_0 = const()[name = string("op_1500_begin_0"), val = tensor<int32, [4]>([0, 4, 0, 0])];
+            tensor<int32, [4]> var_1500_end_0 = const()[name = string("op_1500_end_0"), val = tensor<int32, [4]>([1, 5, 128, 16])];
+            tensor<bool, [4]> var_1500_end_mask_0 = const()[name = string("op_1500_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_1500_cast_fp16 = slice_by_index(begin = var_1500_begin_0, end = var_1500_end_0, end_mask = var_1500_end_mask_0, x = key_heads_13_cast_fp16)[name = string("op_1500_cast_fp16")];
+            tensor<int32, [4]> var_1504_begin_0 = const()[name = string("op_1504_begin_0"), val = tensor<int32, [4]>([0, 4, 0, 0])];
+            tensor<int32, [4]> var_1504_end_0 = const()[name = string("op_1504_end_0"), val = tensor<int32, [4]>([1, 5, 128, 16])];
+            tensor<bool, [4]> var_1504_end_mask_0 = const()[name = string("op_1504_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_1504_cast_fp16 = slice_by_index(begin = var_1504_begin_0, end = var_1504_end_0, end_mask = var_1504_end_mask_0, x = value_heads_13_cast_fp16)[name = string("op_1504_cast_fp16")];
+            tensor<int32, [4]> var_1516_begin_0 = const()[name = string("op_1516_begin_0"), val = tensor<int32, [4]>([0, 5, 0, 0])];
+            tensor<int32, [4]> var_1516_end_0 = const()[name = string("op_1516_end_0"), val = tensor<int32, [4]>([1, 6, 128, 16])];
+            tensor<bool, [4]> var_1516_end_mask_0 = const()[name = string("op_1516_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_1516_cast_fp16 = slice_by_index(begin = var_1516_begin_0, end = var_1516_end_0, end_mask = var_1516_end_mask_0, x = key_heads_13_cast_fp16)[name = string("op_1516_cast_fp16")];
+            tensor<int32, [4]> var_1520_begin_0 = const()[name = string("op_1520_begin_0"), val = tensor<int32, [4]>([0, 5, 0, 0])];
+            tensor<int32, [4]> var_1520_end_0 = const()[name = string("op_1520_end_0"), val = tensor<int32, [4]>([1, 6, 128, 16])];
+            tensor<bool, [4]> var_1520_end_mask_0 = const()[name = string("op_1520_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_1520_cast_fp16 = slice_by_index(begin = var_1520_begin_0, end = var_1520_end_0, end_mask = var_1520_end_mask_0, x = value_heads_13_cast_fp16)[name = string("op_1520_cast_fp16")];
+            tensor<int32, [4]> var_1532_begin_0 = const()[name = string("op_1532_begin_0"), val = tensor<int32, [4]>([0, 6, 0, 0])];
+            tensor<int32, [4]> var_1532_end_0 = const()[name = string("op_1532_end_0"), val = tensor<int32, [4]>([1, 7, 128, 16])];
+            tensor<bool, [4]> var_1532_end_mask_0 = const()[name = string("op_1532_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_1532_cast_fp16 = slice_by_index(begin = var_1532_begin_0, end = var_1532_end_0, end_mask = var_1532_end_mask_0, x = key_heads_13_cast_fp16)[name = string("op_1532_cast_fp16")];
+            tensor<int32, [4]> var_1536_begin_0 = const()[name = string("op_1536_begin_0"), val = tensor<int32, [4]>([0, 6, 0, 0])];
+            tensor<int32, [4]> var_1536_end_0 = const()[name = string("op_1536_end_0"), val = tensor<int32, [4]>([1, 7, 128, 16])];
+            tensor<bool, [4]> var_1536_end_mask_0 = const()[name = string("op_1536_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_1536_cast_fp16 = slice_by_index(begin = var_1536_begin_0, end = var_1536_end_0, end_mask = var_1536_end_mask_0, x = value_heads_13_cast_fp16)[name = string("op_1536_cast_fp16")];
+            tensor<int32, [4]> var_1548_begin_0 = const()[name = string("op_1548_begin_0"), val = tensor<int32, [4]>([0, 7, 0, 0])];
+            tensor<int32, [4]> var_1548_end_0 = const()[name = string("op_1548_end_0"), val = tensor<int32, [4]>([1, 1, 128, 16])];
+            tensor<bool, [4]> var_1548_end_mask_0 = const()[name = string("op_1548_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_1548_cast_fp16 = slice_by_index(begin = var_1548_begin_0, end = var_1548_end_0, end_mask = var_1548_end_mask_0, x = key_heads_13_cast_fp16)[name = string("op_1548_cast_fp16")];
+            tensor<int32, [4]> var_1552_begin_0 = const()[name = string("op_1552_begin_0"), val = tensor<int32, [4]>([0, 7, 0, 0])];
+            tensor<int32, [4]> var_1552_end_0 = const()[name = string("op_1552_end_0"), val = tensor<int32, [4]>([1, 1, 128, 16])];
+            tensor<bool, [4]> var_1552_end_mask_0 = const()[name = string("op_1552_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_1552_cast_fp16 = slice_by_index(begin = var_1552_begin_0, end = var_1552_end_0, end_mask = var_1552_end_mask_0, x = value_heads_13_cast_fp16)[name = string("op_1552_cast_fp16")];
+            bool key_heads_15_interleave_0 = const()[name = string("key_heads_15_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 16]> key_heads_15_cast_fp16 = concat(axis = var_1278, interleave = key_heads_15_interleave_0, values = (var_1436_cast_fp16, var_1436_cast_fp16, var_1452_cast_fp16, var_1452_cast_fp16, var_1468_cast_fp16, var_1468_cast_fp16, var_1484_cast_fp16, var_1484_cast_fp16, var_1500_cast_fp16, var_1500_cast_fp16, var_1516_cast_fp16, var_1516_cast_fp16, var_1532_cast_fp16, var_1532_cast_fp16, var_1548_cast_fp16, var_1548_cast_fp16))[name = string("key_heads_15_cast_fp16")];
+            bool value_heads_15_interleave_0 = const()[name = string("value_heads_15_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 16]> value_heads_15_cast_fp16 = concat(axis = var_1278, interleave = value_heads_15_interleave_0, values = (var_1440_cast_fp16, var_1440_cast_fp16, var_1456_cast_fp16, var_1456_cast_fp16, var_1472_cast_fp16, var_1472_cast_fp16, var_1488_cast_fp16, var_1488_cast_fp16, var_1504_cast_fp16, var_1504_cast_fp16, var_1520_cast_fp16, var_1520_cast_fp16, var_1536_cast_fp16, var_1536_cast_fp16, var_1552_cast_fp16, var_1552_cast_fp16))[name = string("value_heads_15_cast_fp16")];
+            fp16 var_1575_to_fp16 = const()[name = string("op_1575_to_fp16"), val = fp16(0x1.6ap-4)];
+            tensor<fp16, [1, 16, 128, 1]> var_1576_cast_fp16 = mul(x = mh_q_21_cast_fp16, y = var_1575_to_fp16)[name = string("op_1576_cast_fp16")];
+            bool mh_w_13_transpose_x_0 = const()[name = string("mh_w_13_transpose_x_0"), val = bool(true)];
+            bool mh_w_13_transpose_y_0 = const()[name = string("mh_w_13_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 1, 16]> mh_w_13_cast_fp16 = matmul(transpose_x = mh_w_13_transpose_x_0, transpose_y = mh_w_13_transpose_y_0, x = var_1576_cast_fp16, y = key_heads_15_cast_fp16)[name = string("mh_w_13_cast_fp16")];
+            tensor<fp16, [1, 16, 1, 16]> mh_w_15_cast_fp16 = add(x = mh_w_13_cast_fp16, y = var_424_cast_fp16)[name = string("mh_w_15_cast_fp16")];
+            tensor<fp16, [1, 16, 1, 16]> var_1588_cast_fp16 = softmax(axis = var_1260, x = mh_w_15_cast_fp16)[name = string("op_1588_cast_fp16")];
+            bool attn_7_transpose_x_0 = const()[name = string("attn_7_transpose_x_0"), val = bool(false)];
+            bool attn_7_transpose_y_0 = const()[name = string("attn_7_transpose_y_0"), val = bool(true)];
+            tensor<fp16, [1, 16, 128, 1]> attn_7_cast_fp16 = matmul(transpose_x = attn_7_transpose_x_0, transpose_y = attn_7_transpose_y_0, x = value_heads_15_cast_fp16, y = var_1588_cast_fp16)[name = string("attn_7_cast_fp16")];
+            tensor<int32, [4]> var_1593 = const()[name = string("op_1593"), val = tensor<int32, [4]>([1, -1, 1, 1])];
+            tensor<fp16, [1, 2048, 1, 1]> input_25_cast_fp16 = reshape(shape = var_1593, x = attn_7_cast_fp16)[name = string("input_25_cast_fp16")];
+            string obj_35_pad_type_0 = const()[name = string("obj_35_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> obj_35_strides_0 = const()[name = string("obj_35_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> obj_35_pad_0 = const()[name = string("obj_35_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> obj_35_dilations_0 = const()[name = string("obj_35_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 obj_35_groups_0 = const()[name = string("obj_35_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 2048, 1, 1]> layers_3_self_attn_o_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(51427584))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(53524800))))[name = string("layers_3_self_attn_o_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> obj_35_cast_fp16 = conv(bias = layers_0_self_attn_v_proj_bias_to_fp16, dilations = obj_35_dilations_0, groups = obj_35_groups_0, pad = obj_35_pad_0, pad_type = obj_35_pad_type_0, strides = obj_35_strides_0, weight = layers_3_self_attn_o_proj_weight_to_fp16_palettized, x = input_25_cast_fp16)[name = string("obj_35_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> inputs_29_cast_fp16 = add(x = inputs_23_cast_fp16, y = obj_35_cast_fp16)[name = string("inputs_29_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> inputs_sq_31_cast_fp16 = mul(x = inputs_29_cast_fp16, y = inputs_29_cast_fp16)[name = string("inputs_sq_31_cast_fp16")];
+            tensor<int32, [1]> variance_31_axes_0 = const()[name = string("variance_31_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_31_keep_dims_0 = const()[name = string("variance_31_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 1, 1, 1]> variance_31_cast_fp16 = reduce_mean(axes = variance_31_axes_0, keep_dims = variance_31_keep_dims_0, x = inputs_sq_31_cast_fp16)[name = string("variance_31_cast_fp16")];
+            fp16 var_1611_to_fp16 = const()[name = string("op_1611_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 1, 1]> var_1612_cast_fp16 = add(x = variance_31_cast_fp16, y = var_1611_to_fp16)[name = string("op_1612_cast_fp16")];
+            fp32 var_1613_epsilon_0 = const()[name = string("op_1613_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 1, 1, 1]> var_1613_cast_fp16 = rsqrt(epsilon = var_1613_epsilon_0, x = var_1612_cast_fp16)[name = string("op_1613_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> hidden_states_37_cast_fp16 = mul(x = inputs_29_cast_fp16, y = var_1613_cast_fp16)[name = string("hidden_states_37_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> w_31_to_fp16 = const()[name = string("w_31_to_fp16"), val = tensor<fp16, [1, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(53525376)))];
+            tensor<fp16, [1, 1024, 1, 1]> input_27_cast_fp16 = mul(x = w_31_to_fp16, y = hidden_states_37_cast_fp16)[name = string("input_27_cast_fp16")];
+            string input_29_pad_type_0 = const()[name = string("input_29_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> input_29_strides_0 = const()[name = string("input_29_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> input_29_pad_0 = const()[name = string("input_29_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> input_29_dilations_0 = const()[name = string("input_29_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 input_29_groups_0 = const()[name = string("input_29_groups_0"), val = int32(1)];
+            tensor<fp16, [3072, 1024, 1, 1]> layers_3_mlp_gate_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [3072, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(53527488))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(56673280))))[name = string("layers_3_mlp_gate_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 3072, 1, 1]> input_29_cast_fp16 = conv(dilations = input_29_dilations_0, groups = input_29_groups_0, pad = input_29_pad_0, pad_type = input_29_pad_type_0, strides = input_29_strides_0, weight = layers_3_mlp_gate_proj_weight_to_fp16_palettized, x = input_27_cast_fp16)[name = string("input_29_cast_fp16")];
+            tensor<fp16, [1, 3072, 1, 1]> var_1627_cast_fp16 = silu(x = input_29_cast_fp16)[name = string("op_1627_cast_fp16")];
+            string var_1633_pad_type_0 = const()[name = string("op_1633_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_1633_strides_0 = const()[name = string("op_1633_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_1633_pad_0 = const()[name = string("op_1633_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_1633_dilations_0 = const()[name = string("op_1633_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_1633_groups_0 = const()[name = string("op_1633_groups_0"), val = int32(1)];
+            tensor<fp16, [3072, 1024, 1, 1]> layers_3_mlp_up_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [3072, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(56673856))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(59819648))))[name = string("layers_3_mlp_up_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 3072, 1, 1]> var_1633_cast_fp16 = conv(dilations = var_1633_dilations_0, groups = var_1633_groups_0, pad = var_1633_pad_0, pad_type = var_1633_pad_type_0, strides = var_1633_strides_0, weight = layers_3_mlp_up_proj_weight_to_fp16_palettized, x = input_27_cast_fp16)[name = string("op_1633_cast_fp16")];
+            tensor<fp16, [1, 3072, 1, 1]> input_31_cast_fp16 = mul(x = var_1627_cast_fp16, y = var_1633_cast_fp16)[name = string("input_31_cast_fp16")];
+            string hidden_states_39_pad_type_0 = const()[name = string("hidden_states_39_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> hidden_states_39_strides_0 = const()[name = string("hidden_states_39_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> hidden_states_39_pad_0 = const()[name = string("hidden_states_39_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> hidden_states_39_dilations_0 = const()[name = string("hidden_states_39_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 hidden_states_39_groups_0 = const()[name = string("hidden_states_39_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 3072, 1, 1]> layers_3_mlp_down_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 3072, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(59820224))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(62966016))))[name = string("layers_3_mlp_down_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> hidden_states_39_cast_fp16 = conv(dilations = hidden_states_39_dilations_0, groups = hidden_states_39_groups_0, pad = hidden_states_39_pad_0, pad_type = hidden_states_39_pad_type_0, strides = hidden_states_39_strides_0, weight = layers_3_mlp_down_proj_weight_to_fp16_palettized, x = input_31_cast_fp16)[name = string("hidden_states_39_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> inputs_31_cast_fp16 = add(x = inputs_29_cast_fp16, y = hidden_states_39_cast_fp16)[name = string("inputs_31_cast_fp16")];
+            int32 var_1647 = const()[name = string("op_1647"), val = int32(3)];
+            int32 var_1657 = const()[name = string("op_1657"), val = int32(-2)];
+            int32 var_1665 = const()[name = string("op_1665"), val = int32(1)];
+            tensor<fp16, [1, 1024, 1, 1]> inputs_sq_33_cast_fp16 = mul(x = inputs_31_cast_fp16, y = inputs_31_cast_fp16)[name = string("inputs_sq_33_cast_fp16")];
+            tensor<int32, [1]> variance_33_axes_0 = const()[name = string("variance_33_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_33_keep_dims_0 = const()[name = string("variance_33_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 1, 1, 1]> variance_33_cast_fp16 = reduce_mean(axes = variance_33_axes_0, keep_dims = variance_33_keep_dims_0, x = inputs_sq_33_cast_fp16)[name = string("variance_33_cast_fp16")];
+            fp16 var_1677_to_fp16 = const()[name = string("op_1677_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 1, 1]> var_1678_cast_fp16 = add(x = variance_33_cast_fp16, y = var_1677_to_fp16)[name = string("op_1678_cast_fp16")];
+            fp32 var_1679_epsilon_0 = const()[name = string("op_1679_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 1, 1, 1]> var_1679_cast_fp16 = rsqrt(epsilon = var_1679_epsilon_0, x = var_1678_cast_fp16)[name = string("op_1679_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> hidden_states_41_cast_fp16 = mul(x = inputs_31_cast_fp16, y = var_1679_cast_fp16)[name = string("hidden_states_41_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> w_33_to_fp16 = const()[name = string("w_33_to_fp16"), val = tensor<fp16, [1, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(62966592)))];
+            tensor<fp16, [1, 1024, 1, 1]> obj_37_cast_fp16 = mul(x = w_33_to_fp16, y = hidden_states_41_cast_fp16)[name = string("obj_37_cast_fp16")];
+            string query_25_pad_type_0 = const()[name = string("query_25_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> query_25_strides_0 = const()[name = string("query_25_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> query_25_pad_0 = const()[name = string("query_25_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> query_25_dilations_0 = const()[name = string("query_25_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 query_25_groups_0 = const()[name = string("query_25_groups_0"), val = int32(1)];
+            tensor<fp16, [2048, 1024, 1, 1]> layers_4_self_attn_q_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [2048, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(62968704))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(65065920))))[name = string("layers_4_self_attn_q_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1, 1]> query_25_cast_fp16 = conv(bias = layers_0_self_attn_q_proj_bias_to_fp16, dilations = query_25_dilations_0, groups = query_25_groups_0, pad = query_25_pad_0, pad_type = query_25_pad_type_0, strides = query_25_strides_0, weight = layers_4_self_attn_q_proj_weight_to_fp16_palettized, x = obj_37_cast_fp16)[name = string("query_25_cast_fp16")];
+            string current_key_17_pad_type_0 = const()[name = string("current_key_17_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> current_key_17_strides_0 = const()[name = string("current_key_17_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> current_key_17_pad_0 = const()[name = string("current_key_17_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> current_key_17_dilations_0 = const()[name = string("current_key_17_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 current_key_17_groups_0 = const()[name = string("current_key_17_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_4_self_attn_k_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(65066496))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(66115136))))[name = string("layers_4_self_attn_k_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> current_key_17_cast_fp16 = conv(dilations = current_key_17_dilations_0, groups = current_key_17_groups_0, pad = current_key_17_pad_0, pad_type = current_key_17_pad_type_0, strides = current_key_17_strides_0, weight = layers_4_self_attn_k_proj_weight_to_fp16_palettized, x = obj_37_cast_fp16)[name = string("current_key_17_cast_fp16")];
+            string current_value_pad_type_0 = const()[name = string("current_value_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> current_value_strides_0 = const()[name = string("current_value_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> current_value_pad_0 = const()[name = string("current_value_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> current_value_dilations_0 = const()[name = string("current_value_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 current_value_groups_0 = const()[name = string("current_value_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_4_self_attn_v_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(66115712))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(67164352))))[name = string("layers_4_self_attn_v_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> current_value_cast_fp16 = conv(bias = layers_0_self_attn_v_proj_bias_to_fp16, dilations = current_value_dilations_0, groups = current_value_groups_0, pad = current_value_pad_0, pad_type = current_value_pad_type_0, strides = current_value_strides_0, weight = layers_4_self_attn_v_proj_weight_to_fp16_palettized, x = obj_37_cast_fp16)[name = string("current_value_cast_fp16")];
+            tensor<int32, [4]> var_1716 = const()[name = string("op_1716"), val = tensor<int32, [4]>([16, 128, 1, 1])];
+            tensor<fp16, [16, 128, 1, 1]> inputs_33_cast_fp16 = reshape(shape = var_1716, x = query_25_cast_fp16)[name = string("inputs_33_cast_fp16")];
+            tensor<fp16, [16, 128, 1, 1]> inputs_sq_35_cast_fp16 = mul(x = inputs_33_cast_fp16, y = inputs_33_cast_fp16)[name = string("inputs_sq_35_cast_fp16")];
+            tensor<int32, [1]> variance_35_axes_0 = const()[name = string("variance_35_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_35_keep_dims_0 = const()[name = string("variance_35_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [16, 1, 1, 1]> variance_35_cast_fp16 = reduce_mean(axes = variance_35_axes_0, keep_dims = variance_35_keep_dims_0, x = inputs_sq_35_cast_fp16)[name = string("variance_35_cast_fp16")];
+            fp16 var_1722_to_fp16 = const()[name = string("op_1722_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [16, 1, 1, 1]> var_1723_cast_fp16 = add(x = variance_35_cast_fp16, y = var_1722_to_fp16)[name = string("op_1723_cast_fp16")];
+            fp32 var_1724_epsilon_0 = const()[name = string("op_1724_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [16, 1, 1, 1]> var_1724_cast_fp16 = rsqrt(epsilon = var_1724_epsilon_0, x = var_1723_cast_fp16)[name = string("op_1724_cast_fp16")];
+            tensor<fp16, [16, 128, 1, 1]> hidden_states_43_cast_fp16 = mul(x = inputs_33_cast_fp16, y = var_1724_cast_fp16)[name = string("hidden_states_43_cast_fp16")];
+            tensor<fp16, [1, 128, 1, 1]> w_35_to_fp16 = const()[name = string("w_35_to_fp16"), val = tensor<fp16, [1, 128, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(67164928)))];
+            tensor<fp16, [16, 128, 1, 1]> query_normed_cast_fp16 = mul(x = w_35_to_fp16, y = hidden_states_43_cast_fp16)[name = string("query_normed_cast_fp16")];
+            tensor<int32, [4]> var_1732 = const()[name = string("op_1732"), val = tensor<int32, [4]>([8, 128, 1, 1])];
+            tensor<fp16, [8, 128, 1, 1]> inputs_35_cast_fp16 = reshape(shape = var_1732, x = current_key_17_cast_fp16)[name = string("inputs_35_cast_fp16")];
+            tensor<fp16, [8, 128, 1, 1]> inputs_sq_37_cast_fp16 = mul(x = inputs_35_cast_fp16, y = inputs_35_cast_fp16)[name = string("inputs_sq_37_cast_fp16")];
+            tensor<int32, [1]> variance_37_axes_0 = const()[name = string("variance_37_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_37_keep_dims_0 = const()[name = string("variance_37_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [8, 1, 1, 1]> variance_37_cast_fp16 = reduce_mean(axes = variance_37_axes_0, keep_dims = variance_37_keep_dims_0, x = inputs_sq_37_cast_fp16)[name = string("variance_37_cast_fp16")];
+            fp16 var_1738_to_fp16 = const()[name = string("op_1738_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [8, 1, 1, 1]> var_1739_cast_fp16 = add(x = variance_37_cast_fp16, y = var_1738_to_fp16)[name = string("op_1739_cast_fp16")];
+            fp32 var_1740_epsilon_0 = const()[name = string("op_1740_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [8, 1, 1, 1]> var_1740_cast_fp16 = rsqrt(epsilon = var_1740_epsilon_0, x = var_1739_cast_fp16)[name = string("op_1740_cast_fp16")];
+            tensor<fp16, [8, 128, 1, 1]> hidden_states_45_cast_fp16 = mul(x = inputs_35_cast_fp16, y = var_1740_cast_fp16)[name = string("hidden_states_45_cast_fp16")];
+            tensor<fp16, [1, 128, 1, 1]> w_37_to_fp16 = const()[name = string("w_37_to_fp16"), val = tensor<fp16, [1, 128, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(67165248)))];
+            tensor<fp16, [8, 128, 1, 1]> current_key_normed_cast_fp16 = mul(x = w_37_to_fp16, y = hidden_states_45_cast_fp16)[name = string("current_key_normed_cast_fp16")];
+            tensor<int32, [4]> var_1758 = const()[name = string("op_1758"), val = tensor<int32, [4]>([1, 16, 128, -1])];
+            tensor<fp16, [1, 16, 128, 1]> mh_q_25_cast_fp16 = reshape(shape = var_1758, x = query_normed_cast_fp16)[name = string("mh_q_25_cast_fp16")];
+            tensor<int32, [4]> var_1760 = const()[name = string("op_1760"), val = tensor<int32, [4]>([1, 8, 128, -1])];
+            tensor<fp16, [1, 8, 128, 1]> mh_k_17_cast_fp16 = reshape(shape = var_1760, x = current_key_normed_cast_fp16)[name = string("mh_k_17_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> var_1764_cast_fp16 = mul(x = mh_q_25_cast_fp16, y = cos_1_cast_fp16)[name = string("op_1764_cast_fp16")];
+            tensor<int32, [4]> var_1769_begin_0 = const()[name = string("op_1769_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_1769_end_0 = const()[name = string("op_1769_end_0"), val = tensor<int32, [4]>([1, 16, 64, 1])];
+            tensor<bool, [4]> var_1769_end_mask_0 = const()[name = string("op_1769_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 16, 64, 1]> var_1769_cast_fp16 = slice_by_index(begin = var_1769_begin_0, end = var_1769_end_0, end_mask = var_1769_end_mask_0, x = mh_q_25_cast_fp16)[name = string("op_1769_cast_fp16")];
+            tensor<int32, [4]> var_1775_begin_0 = const()[name = string("op_1775_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_1775_end_0 = const()[name = string("op_1775_end_0"), val = tensor<int32, [4]>([1, 16, 128, 1])];
+            tensor<bool, [4]> var_1775_end_mask_0 = const()[name = string("op_1775_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 16, 64, 1]> var_1775_cast_fp16 = slice_by_index(begin = var_1775_begin_0, end = var_1775_end_0, end_mask = var_1775_end_mask_0, x = mh_q_25_cast_fp16)[name = string("op_1775_cast_fp16")];
+            fp16 const_109_promoted_to_fp16 = const()[name = string("const_109_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 16, 64, 1]> var_1777_cast_fp16 = mul(x = var_1775_cast_fp16, y = const_109_promoted_to_fp16)[name = string("op_1777_cast_fp16")];
+            bool var_1779_interleave_0 = const()[name = string("op_1779_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 1]> var_1779_cast_fp16 = concat(axis = var_1657, interleave = var_1779_interleave_0, values = (var_1777_cast_fp16, var_1769_cast_fp16))[name = string("op_1779_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> var_1780_cast_fp16 = mul(x = var_1779_cast_fp16, y = sin_1_cast_fp16)[name = string("op_1780_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> mh_q_27_cast_fp16 = add(x = var_1764_cast_fp16, y = var_1780_cast_fp16)[name = string("mh_q_27_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> var_1782_cast_fp16 = mul(x = mh_k_17_cast_fp16, y = cos_1_cast_fp16)[name = string("op_1782_cast_fp16")];
+            tensor<int32, [4]> var_1787_begin_0 = const()[name = string("op_1787_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_1787_end_0 = const()[name = string("op_1787_end_0"), val = tensor<int32, [4]>([1, 8, 64, 1])];
+            tensor<bool, [4]> var_1787_end_mask_0 = const()[name = string("op_1787_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 8, 64, 1]> var_1787_cast_fp16 = slice_by_index(begin = var_1787_begin_0, end = var_1787_end_0, end_mask = var_1787_end_mask_0, x = mh_k_17_cast_fp16)[name = string("op_1787_cast_fp16")];
+            tensor<int32, [4]> var_1793_begin_0 = const()[name = string("op_1793_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_1793_end_0 = const()[name = string("op_1793_end_0"), val = tensor<int32, [4]>([1, 8, 128, 1])];
+            tensor<bool, [4]> var_1793_end_mask_0 = const()[name = string("op_1793_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 8, 64, 1]> var_1793_cast_fp16 = slice_by_index(begin = var_1793_begin_0, end = var_1793_end_0, end_mask = var_1793_end_mask_0, x = mh_k_17_cast_fp16)[name = string("op_1793_cast_fp16")];
+            fp16 const_112_promoted_to_fp16 = const()[name = string("const_112_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 64, 1]> var_1795_cast_fp16 = mul(x = var_1793_cast_fp16, y = const_112_promoted_to_fp16)[name = string("op_1795_cast_fp16")];
+            bool var_1797_interleave_0 = const()[name = string("op_1797_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 128, 1]> var_1797_cast_fp16 = concat(axis = var_1657, interleave = var_1797_interleave_0, values = (var_1795_cast_fp16, var_1787_cast_fp16))[name = string("op_1797_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> var_1798_cast_fp16 = mul(x = var_1797_cast_fp16, y = sin_1_cast_fp16)[name = string("op_1798_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> mh_k_cast_fp16 = add(x = var_1782_cast_fp16, y = var_1798_cast_fp16)[name = string("mh_k_cast_fp16")];
+            tensor<int32, [4]> var_1802 = const()[name = string("op_1802"), val = tensor<int32, [4]>([1, 1024, 1, 1])];
+            tensor<fp16, [1, 1024, 1, 1]> current_key_cast_fp16 = reshape(shape = var_1802, x = mh_k_cast_fp16)[name = string("current_key_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 16]> var_1809_cast_fp16 = mul(x = var_84_cast_fp16_4, y = var_260_cast_fp16)[name = string("op_1809_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 16]> var_1810_cast_fp16 = mul(x = current_key_cast_fp16, y = var_258_cast_fp16)[name = string("op_1810_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 16]> key_27_cast_fp16 = add(x = var_1809_cast_fp16, y = var_1810_cast_fp16)[name = string("key_27_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 16]> var_1813_cast_fp16 = mul(x = var_92_cast_fp16_4, y = var_260_cast_fp16)[name = string("op_1813_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 16]> var_1814_cast_fp16 = mul(x = current_value_cast_fp16, y = var_258_cast_fp16)[name = string("op_1814_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 16]> value_17_cast_fp16 = add(x = var_1813_cast_fp16, y = var_1814_cast_fp16)[name = string("value_17_cast_fp16")];
+            tensor<int32, [4]> var_1818 = const()[name = string("op_1818"), val = tensor<int32, [4]>([1, 8, 128, 16])];
+            tensor<fp16, [1, 8, 128, 16]> key_heads_17_cast_fp16 = reshape(shape = var_1818, x = key_27_cast_fp16)[name = string("key_heads_17_cast_fp16")];
+            tensor<int32, [4]> var_1820 = const()[name = string("op_1820"), val = tensor<int32, [4]>([1, 8, 128, 16])];
+            tensor<fp16, [1, 8, 128, 16]> value_heads_17_cast_fp16 = reshape(shape = var_1820, x = value_17_cast_fp16)[name = string("value_heads_17_cast_fp16")];
+            tensor<int32, [4]> var_1823_begin_0 = const()[name = string("op_1823_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_1823_end_0 = const()[name = string("op_1823_end_0"), val = tensor<int32, [4]>([1, 1, 128, 16])];
+            tensor<bool, [4]> var_1823_end_mask_0 = const()[name = string("op_1823_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_1823_cast_fp16 = slice_by_index(begin = var_1823_begin_0, end = var_1823_end_0, end_mask = var_1823_end_mask_0, x = key_heads_17_cast_fp16)[name = string("op_1823_cast_fp16")];
+            tensor<int32, [4]> var_1827_begin_0 = const()[name = string("op_1827_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_1827_end_0 = const()[name = string("op_1827_end_0"), val = tensor<int32, [4]>([1, 1, 128, 16])];
+            tensor<bool, [4]> var_1827_end_mask_0 = const()[name = string("op_1827_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_1827_cast_fp16 = slice_by_index(begin = var_1827_begin_0, end = var_1827_end_0, end_mask = var_1827_end_mask_0, x = value_heads_17_cast_fp16)[name = string("op_1827_cast_fp16")];
+            tensor<int32, [4]> var_1839_begin_0 = const()[name = string("op_1839_begin_0"), val = tensor<int32, [4]>([0, 1, 0, 0])];
+            tensor<int32, [4]> var_1839_end_0 = const()[name = string("op_1839_end_0"), val = tensor<int32, [4]>([1, 2, 128, 16])];
+            tensor<bool, [4]> var_1839_end_mask_0 = const()[name = string("op_1839_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_1839_cast_fp16 = slice_by_index(begin = var_1839_begin_0, end = var_1839_end_0, end_mask = var_1839_end_mask_0, x = key_heads_17_cast_fp16)[name = string("op_1839_cast_fp16")];
+            tensor<int32, [4]> var_1843_begin_0 = const()[name = string("op_1843_begin_0"), val = tensor<int32, [4]>([0, 1, 0, 0])];
+            tensor<int32, [4]> var_1843_end_0 = const()[name = string("op_1843_end_0"), val = tensor<int32, [4]>([1, 2, 128, 16])];
+            tensor<bool, [4]> var_1843_end_mask_0 = const()[name = string("op_1843_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_1843_cast_fp16 = slice_by_index(begin = var_1843_begin_0, end = var_1843_end_0, end_mask = var_1843_end_mask_0, x = value_heads_17_cast_fp16)[name = string("op_1843_cast_fp16")];
+            tensor<int32, [4]> var_1855_begin_0 = const()[name = string("op_1855_begin_0"), val = tensor<int32, [4]>([0, 2, 0, 0])];
+            tensor<int32, [4]> var_1855_end_0 = const()[name = string("op_1855_end_0"), val = tensor<int32, [4]>([1, 3, 128, 16])];
+            tensor<bool, [4]> var_1855_end_mask_0 = const()[name = string("op_1855_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_1855_cast_fp16 = slice_by_index(begin = var_1855_begin_0, end = var_1855_end_0, end_mask = var_1855_end_mask_0, x = key_heads_17_cast_fp16)[name = string("op_1855_cast_fp16")];
+            tensor<int32, [4]> var_1859_begin_0 = const()[name = string("op_1859_begin_0"), val = tensor<int32, [4]>([0, 2, 0, 0])];
+            tensor<int32, [4]> var_1859_end_0 = const()[name = string("op_1859_end_0"), val = tensor<int32, [4]>([1, 3, 128, 16])];
+            tensor<bool, [4]> var_1859_end_mask_0 = const()[name = string("op_1859_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_1859_cast_fp16 = slice_by_index(begin = var_1859_begin_0, end = var_1859_end_0, end_mask = var_1859_end_mask_0, x = value_heads_17_cast_fp16)[name = string("op_1859_cast_fp16")];
+            tensor<int32, [4]> var_1871_begin_0 = const()[name = string("op_1871_begin_0"), val = tensor<int32, [4]>([0, 3, 0, 0])];
+            tensor<int32, [4]> var_1871_end_0 = const()[name = string("op_1871_end_0"), val = tensor<int32, [4]>([1, 4, 128, 16])];
+            tensor<bool, [4]> var_1871_end_mask_0 = const()[name = string("op_1871_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_1871_cast_fp16 = slice_by_index(begin = var_1871_begin_0, end = var_1871_end_0, end_mask = var_1871_end_mask_0, x = key_heads_17_cast_fp16)[name = string("op_1871_cast_fp16")];
+            tensor<int32, [4]> var_1875_begin_0 = const()[name = string("op_1875_begin_0"), val = tensor<int32, [4]>([0, 3, 0, 0])];
+            tensor<int32, [4]> var_1875_end_0 = const()[name = string("op_1875_end_0"), val = tensor<int32, [4]>([1, 4, 128, 16])];
+            tensor<bool, [4]> var_1875_end_mask_0 = const()[name = string("op_1875_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_1875_cast_fp16 = slice_by_index(begin = var_1875_begin_0, end = var_1875_end_0, end_mask = var_1875_end_mask_0, x = value_heads_17_cast_fp16)[name = string("op_1875_cast_fp16")];
+            tensor<int32, [4]> var_1887_begin_0 = const()[name = string("op_1887_begin_0"), val = tensor<int32, [4]>([0, 4, 0, 0])];
+            tensor<int32, [4]> var_1887_end_0 = const()[name = string("op_1887_end_0"), val = tensor<int32, [4]>([1, 5, 128, 16])];
+            tensor<bool, [4]> var_1887_end_mask_0 = const()[name = string("op_1887_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_1887_cast_fp16 = slice_by_index(begin = var_1887_begin_0, end = var_1887_end_0, end_mask = var_1887_end_mask_0, x = key_heads_17_cast_fp16)[name = string("op_1887_cast_fp16")];
+            tensor<int32, [4]> var_1891_begin_0 = const()[name = string("op_1891_begin_0"), val = tensor<int32, [4]>([0, 4, 0, 0])];
+            tensor<int32, [4]> var_1891_end_0 = const()[name = string("op_1891_end_0"), val = tensor<int32, [4]>([1, 5, 128, 16])];
+            tensor<bool, [4]> var_1891_end_mask_0 = const()[name = string("op_1891_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_1891_cast_fp16 = slice_by_index(begin = var_1891_begin_0, end = var_1891_end_0, end_mask = var_1891_end_mask_0, x = value_heads_17_cast_fp16)[name = string("op_1891_cast_fp16")];
+            tensor<int32, [4]> var_1903_begin_0 = const()[name = string("op_1903_begin_0"), val = tensor<int32, [4]>([0, 5, 0, 0])];
+            tensor<int32, [4]> var_1903_end_0 = const()[name = string("op_1903_end_0"), val = tensor<int32, [4]>([1, 6, 128, 16])];
+            tensor<bool, [4]> var_1903_end_mask_0 = const()[name = string("op_1903_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_1903_cast_fp16 = slice_by_index(begin = var_1903_begin_0, end = var_1903_end_0, end_mask = var_1903_end_mask_0, x = key_heads_17_cast_fp16)[name = string("op_1903_cast_fp16")];
+            tensor<int32, [4]> var_1907_begin_0 = const()[name = string("op_1907_begin_0"), val = tensor<int32, [4]>([0, 5, 0, 0])];
+            tensor<int32, [4]> var_1907_end_0 = const()[name = string("op_1907_end_0"), val = tensor<int32, [4]>([1, 6, 128, 16])];
+            tensor<bool, [4]> var_1907_end_mask_0 = const()[name = string("op_1907_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_1907_cast_fp16 = slice_by_index(begin = var_1907_begin_0, end = var_1907_end_0, end_mask = var_1907_end_mask_0, x = value_heads_17_cast_fp16)[name = string("op_1907_cast_fp16")];
+            tensor<int32, [4]> var_1919_begin_0 = const()[name = string("op_1919_begin_0"), val = tensor<int32, [4]>([0, 6, 0, 0])];
+            tensor<int32, [4]> var_1919_end_0 = const()[name = string("op_1919_end_0"), val = tensor<int32, [4]>([1, 7, 128, 16])];
+            tensor<bool, [4]> var_1919_end_mask_0 = const()[name = string("op_1919_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_1919_cast_fp16 = slice_by_index(begin = var_1919_begin_0, end = var_1919_end_0, end_mask = var_1919_end_mask_0, x = key_heads_17_cast_fp16)[name = string("op_1919_cast_fp16")];
+            tensor<int32, [4]> var_1923_begin_0 = const()[name = string("op_1923_begin_0"), val = tensor<int32, [4]>([0, 6, 0, 0])];
+            tensor<int32, [4]> var_1923_end_0 = const()[name = string("op_1923_end_0"), val = tensor<int32, [4]>([1, 7, 128, 16])];
+            tensor<bool, [4]> var_1923_end_mask_0 = const()[name = string("op_1923_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_1923_cast_fp16 = slice_by_index(begin = var_1923_begin_0, end = var_1923_end_0, end_mask = var_1923_end_mask_0, x = value_heads_17_cast_fp16)[name = string("op_1923_cast_fp16")];
+            tensor<int32, [4]> var_1935_begin_0 = const()[name = string("op_1935_begin_0"), val = tensor<int32, [4]>([0, 7, 0, 0])];
+            tensor<int32, [4]> var_1935_end_0 = const()[name = string("op_1935_end_0"), val = tensor<int32, [4]>([1, 1, 128, 16])];
+            tensor<bool, [4]> var_1935_end_mask_0 = const()[name = string("op_1935_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_1935_cast_fp16 = slice_by_index(begin = var_1935_begin_0, end = var_1935_end_0, end_mask = var_1935_end_mask_0, x = key_heads_17_cast_fp16)[name = string("op_1935_cast_fp16")];
+            tensor<int32, [4]> var_1939_begin_0 = const()[name = string("op_1939_begin_0"), val = tensor<int32, [4]>([0, 7, 0, 0])];
+            tensor<int32, [4]> var_1939_end_0 = const()[name = string("op_1939_end_0"), val = tensor<int32, [4]>([1, 1, 128, 16])];
+            tensor<bool, [4]> var_1939_end_mask_0 = const()[name = string("op_1939_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_1939_cast_fp16 = slice_by_index(begin = var_1939_begin_0, end = var_1939_end_0, end_mask = var_1939_end_mask_0, x = value_heads_17_cast_fp16)[name = string("op_1939_cast_fp16")];
+            bool key_heads_interleave_0 = const()[name = string("key_heads_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 16]> key_heads_cast_fp16 = concat(axis = var_1665, interleave = key_heads_interleave_0, values = (var_1823_cast_fp16, var_1823_cast_fp16, var_1839_cast_fp16, var_1839_cast_fp16, var_1855_cast_fp16, var_1855_cast_fp16, var_1871_cast_fp16, var_1871_cast_fp16, var_1887_cast_fp16, var_1887_cast_fp16, var_1903_cast_fp16, var_1903_cast_fp16, var_1919_cast_fp16, var_1919_cast_fp16, var_1935_cast_fp16, var_1935_cast_fp16))[name = string("key_heads_cast_fp16")];
+            bool value_heads_interleave_0 = const()[name = string("value_heads_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 16]> value_heads_cast_fp16 = concat(axis = var_1665, interleave = value_heads_interleave_0, values = (var_1827_cast_fp16, var_1827_cast_fp16, var_1843_cast_fp16, var_1843_cast_fp16, var_1859_cast_fp16, var_1859_cast_fp16, var_1875_cast_fp16, var_1875_cast_fp16, var_1891_cast_fp16, var_1891_cast_fp16, var_1907_cast_fp16, var_1907_cast_fp16, var_1923_cast_fp16, var_1923_cast_fp16, var_1939_cast_fp16, var_1939_cast_fp16))[name = string("value_heads_cast_fp16")];
+            fp16 var_1962_to_fp16 = const()[name = string("op_1962_to_fp16"), val = fp16(0x1.6ap-4)];
+            tensor<fp16, [1, 16, 128, 1]> var_1963_cast_fp16 = mul(x = mh_q_27_cast_fp16, y = var_1962_to_fp16)[name = string("op_1963_cast_fp16")];
+            bool mh_w_17_transpose_x_0 = const()[name = string("mh_w_17_transpose_x_0"), val = bool(true)];
+            bool mh_w_17_transpose_y_0 = const()[name = string("mh_w_17_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 1, 16]> mh_w_17_cast_fp16 = matmul(transpose_x = mh_w_17_transpose_x_0, transpose_y = mh_w_17_transpose_y_0, x = var_1963_cast_fp16, y = key_heads_cast_fp16)[name = string("mh_w_17_cast_fp16")];
+            tensor<fp16, [1, 16, 1, 16]> mh_w_cast_fp16 = add(x = mh_w_17_cast_fp16, y = var_424_cast_fp16)[name = string("mh_w_cast_fp16")];
+            tensor<fp16, [1, 16, 1, 16]> var_1975_cast_fp16 = softmax(axis = var_1647, x = mh_w_cast_fp16)[name = string("op_1975_cast_fp16")];
+            bool attn_transpose_x_0 = const()[name = string("attn_transpose_x_0"), val = bool(false)];
+            bool attn_transpose_y_0 = const()[name = string("attn_transpose_y_0"), val = bool(true)];
+            tensor<fp16, [1, 16, 128, 1]> attn_cast_fp16 = matmul(transpose_x = attn_transpose_x_0, transpose_y = attn_transpose_y_0, x = value_heads_cast_fp16, y = var_1975_cast_fp16)[name = string("attn_cast_fp16")];
+            tensor<int32, [4]> var_1980 = const()[name = string("op_1980"), val = tensor<int32, [4]>([1, -1, 1, 1])];
+            tensor<fp16, [1, 2048, 1, 1]> input_33_cast_fp16 = reshape(shape = var_1980, x = attn_cast_fp16)[name = string("input_33_cast_fp16")];
+            string obj_pad_type_0 = const()[name = string("obj_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> obj_strides_0 = const()[name = string("obj_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> obj_pad_0 = const()[name = string("obj_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> obj_dilations_0 = const()[name = string("obj_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 obj_groups_0 = const()[name = string("obj_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 2048, 1, 1]> layers_4_self_attn_o_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(67165568))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(69262784))))[name = string("layers_4_self_attn_o_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> obj_cast_fp16 = conv(bias = layers_0_self_attn_v_proj_bias_to_fp16, dilations = obj_dilations_0, groups = obj_groups_0, pad = obj_pad_0, pad_type = obj_pad_type_0, strides = obj_strides_0, weight = layers_4_self_attn_o_proj_weight_to_fp16_palettized, x = input_33_cast_fp16)[name = string("obj_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> inputs_37_cast_fp16 = add(x = inputs_31_cast_fp16, y = obj_cast_fp16)[name = string("inputs_37_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> inputs_sq_39_cast_fp16 = mul(x = inputs_37_cast_fp16, y = inputs_37_cast_fp16)[name = string("inputs_sq_39_cast_fp16")];
+            tensor<int32, [1]> variance_39_axes_0 = const()[name = string("variance_39_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_39_keep_dims_0 = const()[name = string("variance_39_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 1, 1, 1]> variance_39_cast_fp16 = reduce_mean(axes = variance_39_axes_0, keep_dims = variance_39_keep_dims_0, x = inputs_sq_39_cast_fp16)[name = string("variance_39_cast_fp16")];
+            fp16 var_1998_to_fp16 = const()[name = string("op_1998_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 1, 1]> var_1999_cast_fp16 = add(x = variance_39_cast_fp16, y = var_1998_to_fp16)[name = string("op_1999_cast_fp16")];
+            fp32 var_2000_epsilon_0 = const()[name = string("op_2000_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 1, 1, 1]> var_2000_cast_fp16 = rsqrt(epsilon = var_2000_epsilon_0, x = var_1999_cast_fp16)[name = string("op_2000_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> hidden_states_47_cast_fp16 = mul(x = inputs_37_cast_fp16, y = var_2000_cast_fp16)[name = string("hidden_states_47_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> w_39_to_fp16 = const()[name = string("w_39_to_fp16"), val = tensor<fp16, [1, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(69263360)))];
+            tensor<fp16, [1, 1024, 1, 1]> input_35_cast_fp16 = mul(x = w_39_to_fp16, y = hidden_states_47_cast_fp16)[name = string("input_35_cast_fp16")];
+            string input_37_pad_type_0 = const()[name = string("input_37_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> input_37_strides_0 = const()[name = string("input_37_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> input_37_pad_0 = const()[name = string("input_37_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> input_37_dilations_0 = const()[name = string("input_37_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 input_37_groups_0 = const()[name = string("input_37_groups_0"), val = int32(1)];
+            tensor<fp16, [3072, 1024, 1, 1]> layers_4_mlp_gate_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [3072, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(69265472))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(72411264))))[name = string("layers_4_mlp_gate_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 3072, 1, 1]> input_37_cast_fp16 = conv(dilations = input_37_dilations_0, groups = input_37_groups_0, pad = input_37_pad_0, pad_type = input_37_pad_type_0, strides = input_37_strides_0, weight = layers_4_mlp_gate_proj_weight_to_fp16_palettized, x = input_35_cast_fp16)[name = string("input_37_cast_fp16")];
+            tensor<fp16, [1, 3072, 1, 1]> var_2014_cast_fp16 = silu(x = input_37_cast_fp16)[name = string("op_2014_cast_fp16")];
+            string var_2020_pad_type_0 = const()[name = string("op_2020_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_2020_strides_0 = const()[name = string("op_2020_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_2020_pad_0 = const()[name = string("op_2020_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_2020_dilations_0 = const()[name = string("op_2020_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_2020_groups_0 = const()[name = string("op_2020_groups_0"), val = int32(1)];
+            tensor<fp16, [3072, 1024, 1, 1]> layers_4_mlp_up_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [3072, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(72411840))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(75557632))))[name = string("layers_4_mlp_up_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 3072, 1, 1]> var_2020_cast_fp16 = conv(dilations = var_2020_dilations_0, groups = var_2020_groups_0, pad = var_2020_pad_0, pad_type = var_2020_pad_type_0, strides = var_2020_strides_0, weight = layers_4_mlp_up_proj_weight_to_fp16_palettized, x = input_35_cast_fp16)[name = string("op_2020_cast_fp16")];
+            tensor<fp16, [1, 3072, 1, 1]> input_39_cast_fp16 = mul(x = var_2014_cast_fp16, y = var_2020_cast_fp16)[name = string("input_39_cast_fp16")];
+            string hidden_states_49_pad_type_0 = const()[name = string("hidden_states_49_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> hidden_states_49_strides_0 = const()[name = string("hidden_states_49_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> hidden_states_49_pad_0 = const()[name = string("hidden_states_49_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> hidden_states_49_dilations_0 = const()[name = string("hidden_states_49_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 hidden_states_49_groups_0 = const()[name = string("hidden_states_49_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 3072, 1, 1]> layers_4_mlp_down_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 3072, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(75558208))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(78704000))))[name = string("layers_4_mlp_down_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> hidden_states_49_cast_fp16 = conv(dilations = hidden_states_49_dilations_0, groups = hidden_states_49_groups_0, pad = hidden_states_49_pad_0, pad_type = hidden_states_49_pad_type_0, strides = hidden_states_49_strides_0, weight = layers_4_mlp_down_proj_weight_to_fp16_palettized, x = input_39_cast_fp16)[name = string("hidden_states_49_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> inputs_cast_fp16 = add(x = inputs_37_cast_fp16, y = hidden_states_49_cast_fp16)[name = string("inputs_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> inputs_sq_cast_fp16 = mul(x = inputs_cast_fp16, y = inputs_cast_fp16)[name = string("inputs_sq_cast_fp16")];
+            tensor<int32, [1]> variance_axes_0 = const()[name = string("variance_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_keep_dims_0 = const()[name = string("variance_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 1, 1, 1]> variance_cast_fp16 = reduce_mean(axes = variance_axes_0, keep_dims = variance_keep_dims_0, x = inputs_sq_cast_fp16)[name = string("variance_cast_fp16")];
+            fp16 var_2041_to_fp16 = const()[name = string("op_2041_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 1, 1]> var_2042_cast_fp16 = add(x = variance_cast_fp16, y = var_2041_to_fp16)[name = string("op_2042_cast_fp16")];
+            fp32 var_2043_epsilon_0 = const()[name = string("op_2043_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 1, 1, 1]> var_2043_cast_fp16 = rsqrt(epsilon = var_2043_epsilon_0, x = var_2042_cast_fp16)[name = string("op_2043_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> hidden_states_cast_fp16 = mul(x = inputs_cast_fp16, y = var_2043_cast_fp16)[name = string("hidden_states_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> w_to_fp16 = const()[name = string("w_to_fp16"), val = tensor<fp16, [1, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(78704576)))];
+            tensor<fp16, [1, 1024, 1, 1]> input_cast_fp16 = mul(x = w_to_fp16, y = hidden_states_cast_fp16)[name = string("input_cast_fp16")];
+            string logits_1_pad_type_0 = const()[name = string("logits_1_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> logits_1_strides_0 = const()[name = string("logits_1_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> logits_1_pad_0 = const()[name = string("logits_1_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> logits_1_dilations_0 = const()[name = string("logits_1_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 logits_1_groups_0 = const()[name = string("logits_1_groups_0"), val = int32(1)];
+            tensor<fp16, [2048, 1024, 1, 1]> lm_heads_0_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [2048, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(78706688))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(80803904))))[name = string("lm_heads_0_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1, 1]> logits_1_cast_fp16 = conv(dilations = logits_1_dilations_0, groups = logits_1_groups_0, pad = logits_1_pad_0, pad_type = logits_1_pad_type_0, strides = logits_1_strides_0, weight = lm_heads_0_weight_to_fp16_palettized, x = input_cast_fp16)[name = string("logits_1_cast_fp16")];
+            tensor<int32, [1]> var_2060_axes_0 = const()[name = string("op_2060_axes_0"), val = tensor<int32, [1]>([3])];
+            tensor<fp16, [1, 2048, 1]> var_2060_cast_fp16 = squeeze(axes = var_2060_axes_0, x = logits_1_cast_fp16)[name = string("op_2060_cast_fp16")];
+            string logits_3_pad_type_0 = const()[name = string("logits_3_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> logits_3_strides_0 = const()[name = string("logits_3_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> logits_3_pad_0 = const()[name = string("logits_3_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> logits_3_dilations_0 = const()[name = string("logits_3_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 logits_3_groups_0 = const()[name = string("logits_3_groups_0"), val = int32(1)];
+            tensor<fp16, [2048, 1024, 1, 1]> lm_heads_1_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [2048, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(80804480))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(82901696))))[name = string("lm_heads_1_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1, 1]> logits_3_cast_fp16 = conv(dilations = logits_3_dilations_0, groups = logits_3_groups_0, pad = logits_3_pad_0, pad_type = logits_3_pad_type_0, strides = logits_3_strides_0, weight = lm_heads_1_weight_to_fp16_palettized, x = input_cast_fp16)[name = string("logits_3_cast_fp16")];
+            tensor<int32, [1]> var_2076_axes_0 = const()[name = string("op_2076_axes_0"), val = tensor<int32, [1]>([3])];
+            tensor<fp16, [1, 2048, 1]> var_2076_cast_fp16 = squeeze(axes = var_2076_axes_0, x = logits_3_cast_fp16)[name = string("op_2076_cast_fp16")];
+            string logits_5_pad_type_0 = const()[name = string("logits_5_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> logits_5_strides_0 = const()[name = string("logits_5_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> logits_5_pad_0 = const()[name = string("logits_5_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> logits_5_dilations_0 = const()[name = string("logits_5_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 logits_5_groups_0 = const()[name = string("logits_5_groups_0"), val = int32(1)];
+            tensor<fp16, [2048, 1024, 1, 1]> lm_heads_2_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [2048, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(82902272))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(84999488))))[name = string("lm_heads_2_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1, 1]> logits_5_cast_fp16 = conv(dilations = logits_5_dilations_0, groups = logits_5_groups_0, pad = logits_5_pad_0, pad_type = logits_5_pad_type_0, strides = logits_5_strides_0, weight = lm_heads_2_weight_to_fp16_palettized, x = input_cast_fp16)[name = string("logits_5_cast_fp16")];
+            tensor<int32, [1]> var_2092_axes_0 = const()[name = string("op_2092_axes_0"), val = tensor<int32, [1]>([3])];
+            tensor<fp16, [1, 2048, 1]> var_2092_cast_fp16 = squeeze(axes = var_2092_axes_0, x = logits_5_cast_fp16)[name = string("op_2092_cast_fp16")];
+            string logits_7_pad_type_0 = const()[name = string("logits_7_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> logits_7_strides_0 = const()[name = string("logits_7_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> logits_7_pad_0 = const()[name = string("logits_7_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> logits_7_dilations_0 = const()[name = string("logits_7_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 logits_7_groups_0 = const()[name = string("logits_7_groups_0"), val = int32(1)];
+            tensor<fp16, [2048, 1024, 1, 1]> lm_heads_3_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [2048, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(85000064))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(87097280))))[name = string("lm_heads_3_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1, 1]> logits_7_cast_fp16 = conv(dilations = logits_7_dilations_0, groups = logits_7_groups_0, pad = logits_7_pad_0, pad_type = logits_7_pad_type_0, strides = logits_7_strides_0, weight = lm_heads_3_weight_to_fp16_palettized, x = input_cast_fp16)[name = string("logits_7_cast_fp16")];
+            tensor<int32, [1]> var_2108_axes_0 = const()[name = string("op_2108_axes_0"), val = tensor<int32, [1]>([3])];
+            tensor<fp16, [1, 2048, 1]> var_2108_cast_fp16 = squeeze(axes = var_2108_axes_0, x = logits_7_cast_fp16)[name = string("op_2108_cast_fp16")];
+            string logits_9_pad_type_0 = const()[name = string("logits_9_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> logits_9_strides_0 = const()[name = string("logits_9_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> logits_9_pad_0 = const()[name = string("logits_9_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> logits_9_dilations_0 = const()[name = string("logits_9_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 logits_9_groups_0 = const()[name = string("logits_9_groups_0"), val = int32(1)];
+            tensor<fp16, [2048, 1024, 1, 1]> lm_heads_4_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [2048, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(87097856))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(89195072))))[name = string("lm_heads_4_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1, 1]> logits_9_cast_fp16 = conv(dilations = logits_9_dilations_0, groups = logits_9_groups_0, pad = logits_9_pad_0, pad_type = logits_9_pad_type_0, strides = logits_9_strides_0, weight = lm_heads_4_weight_to_fp16_palettized, x = input_cast_fp16)[name = string("logits_9_cast_fp16")];
+            tensor<int32, [1]> var_2124_axes_0 = const()[name = string("op_2124_axes_0"), val = tensor<int32, [1]>([3])];
+            tensor<fp16, [1, 2048, 1]> var_2124_cast_fp16 = squeeze(axes = var_2124_axes_0, x = logits_9_cast_fp16)[name = string("op_2124_cast_fp16")];
+            string logits_11_pad_type_0 = const()[name = string("logits_11_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> logits_11_strides_0 = const()[name = string("logits_11_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> logits_11_pad_0 = const()[name = string("logits_11_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> logits_11_dilations_0 = const()[name = string("logits_11_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 logits_11_groups_0 = const()[name = string("logits_11_groups_0"), val = int32(1)];
+            tensor<fp16, [2048, 1024, 1, 1]> lm_heads_5_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [2048, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(89195648))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(91292864))))[name = string("lm_heads_5_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1, 1]> logits_11_cast_fp16 = conv(dilations = logits_11_dilations_0, groups = logits_11_groups_0, pad = logits_11_pad_0, pad_type = logits_11_pad_type_0, strides = logits_11_strides_0, weight = lm_heads_5_weight_to_fp16_palettized, x = input_cast_fp16)[name = string("logits_11_cast_fp16")];
+            tensor<int32, [1]> var_2140_axes_0 = const()[name = string("op_2140_axes_0"), val = tensor<int32, [1]>([3])];
+            tensor<fp16, [1, 2048, 1]> var_2140_cast_fp16 = squeeze(axes = var_2140_axes_0, x = logits_11_cast_fp16)[name = string("op_2140_cast_fp16")];
+            string logits_13_pad_type_0 = const()[name = string("logits_13_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> logits_13_strides_0 = const()[name = string("logits_13_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> logits_13_pad_0 = const()[name = string("logits_13_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> logits_13_dilations_0 = const()[name = string("logits_13_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 logits_13_groups_0 = const()[name = string("logits_13_groups_0"), val = int32(1)];
+            tensor<fp16, [2048, 1024, 1, 1]> lm_heads_6_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [2048, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(91293440))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(93390656))))[name = string("lm_heads_6_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1, 1]> logits_13_cast_fp16 = conv(dilations = logits_13_dilations_0, groups = logits_13_groups_0, pad = logits_13_pad_0, pad_type = logits_13_pad_type_0, strides = logits_13_strides_0, weight = lm_heads_6_weight_to_fp16_palettized, x = input_cast_fp16)[name = string("logits_13_cast_fp16")];
+            tensor<int32, [1]> var_2156_axes_0 = const()[name = string("op_2156_axes_0"), val = tensor<int32, [1]>([3])];
+            tensor<fp16, [1, 2048, 1]> var_2156_cast_fp16 = squeeze(axes = var_2156_axes_0, x = logits_13_cast_fp16)[name = string("op_2156_cast_fp16")];
+            string logits_15_pad_type_0 = const()[name = string("logits_15_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> logits_15_strides_0 = const()[name = string("logits_15_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> logits_15_pad_0 = const()[name = string("logits_15_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> logits_15_dilations_0 = const()[name = string("logits_15_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 logits_15_groups_0 = const()[name = string("logits_15_groups_0"), val = int32(1)];
+            tensor<fp16, [2048, 1024, 1, 1]> lm_heads_7_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [2048, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(93391232))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(95488448))))[name = string("lm_heads_7_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1, 1]> logits_15_cast_fp16 = conv(dilations = logits_15_dilations_0, groups = logits_15_groups_0, pad = logits_15_pad_0, pad_type = logits_15_pad_type_0, strides = logits_15_strides_0, weight = lm_heads_7_weight_to_fp16_palettized, x = input_cast_fp16)[name = string("logits_15_cast_fp16")];
+            tensor<int32, [1]> var_2172_axes_0 = const()[name = string("op_2172_axes_0"), val = tensor<int32, [1]>([3])];
+            tensor<fp16, [1, 2048, 1]> var_2172_cast_fp16 = squeeze(axes = var_2172_axes_0, x = logits_15_cast_fp16)[name = string("op_2172_cast_fp16")];
+            string logits_17_pad_type_0 = const()[name = string("logits_17_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> logits_17_strides_0 = const()[name = string("logits_17_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> logits_17_pad_0 = const()[name = string("logits_17_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> logits_17_dilations_0 = const()[name = string("logits_17_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 logits_17_groups_0 = const()[name = string("logits_17_groups_0"), val = int32(1)];
+            tensor<fp16, [2048, 1024, 1, 1]> lm_heads_8_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [2048, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(95489024))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(97586240))))[name = string("lm_heads_8_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1, 1]> logits_17_cast_fp16 = conv(dilations = logits_17_dilations_0, groups = logits_17_groups_0, pad = logits_17_pad_0, pad_type = logits_17_pad_type_0, strides = logits_17_strides_0, weight = lm_heads_8_weight_to_fp16_palettized, x = input_cast_fp16)[name = string("logits_17_cast_fp16")];
+            tensor<int32, [1]> var_2188_axes_0 = const()[name = string("op_2188_axes_0"), val = tensor<int32, [1]>([3])];
+            tensor<fp16, [1, 2048, 1]> var_2188_cast_fp16 = squeeze(axes = var_2188_axes_0, x = logits_17_cast_fp16)[name = string("op_2188_cast_fp16")];
+            string logits_19_pad_type_0 = const()[name = string("logits_19_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> logits_19_strides_0 = const()[name = string("logits_19_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> logits_19_pad_0 = const()[name = string("logits_19_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> logits_19_dilations_0 = const()[name = string("logits_19_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 logits_19_groups_0 = const()[name = string("logits_19_groups_0"), val = int32(1)];
+            tensor<fp16, [2048, 1024, 1, 1]> lm_heads_9_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [2048, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(97586816))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(99684032))))[name = string("lm_heads_9_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1, 1]> logits_19_cast_fp16 = conv(dilations = logits_19_dilations_0, groups = logits_19_groups_0, pad = logits_19_pad_0, pad_type = logits_19_pad_type_0, strides = logits_19_strides_0, weight = lm_heads_9_weight_to_fp16_palettized, x = input_cast_fp16)[name = string("logits_19_cast_fp16")];
+            tensor<int32, [1]> var_2204_axes_0 = const()[name = string("op_2204_axes_0"), val = tensor<int32, [1]>([3])];
+            tensor<fp16, [1, 2048, 1]> var_2204_cast_fp16 = squeeze(axes = var_2204_axes_0, x = logits_19_cast_fp16)[name = string("op_2204_cast_fp16")];
+            string logits_21_pad_type_0 = const()[name = string("logits_21_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> logits_21_strides_0 = const()[name = string("logits_21_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> logits_21_pad_0 = const()[name = string("logits_21_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> logits_21_dilations_0 = const()[name = string("logits_21_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 logits_21_groups_0 = const()[name = string("logits_21_groups_0"), val = int32(1)];
+            tensor<fp16, [2048, 1024, 1, 1]> lm_heads_10_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [2048, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(99684608))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(101781824))))[name = string("lm_heads_10_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1, 1]> logits_21_cast_fp16 = conv(dilations = logits_21_dilations_0, groups = logits_21_groups_0, pad = logits_21_pad_0, pad_type = logits_21_pad_type_0, strides = logits_21_strides_0, weight = lm_heads_10_weight_to_fp16_palettized, x = input_cast_fp16)[name = string("logits_21_cast_fp16")];
+            tensor<int32, [1]> var_2220_axes_0 = const()[name = string("op_2220_axes_0"), val = tensor<int32, [1]>([3])];
+            tensor<fp16, [1, 2048, 1]> var_2220_cast_fp16 = squeeze(axes = var_2220_axes_0, x = logits_21_cast_fp16)[name = string("op_2220_cast_fp16")];
+            string logits_23_pad_type_0 = const()[name = string("logits_23_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> logits_23_strides_0 = const()[name = string("logits_23_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> logits_23_pad_0 = const()[name = string("logits_23_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> logits_23_dilations_0 = const()[name = string("logits_23_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 logits_23_groups_0 = const()[name = string("logits_23_groups_0"), val = int32(1)];
+            tensor<fp16, [2048, 1024, 1, 1]> lm_heads_11_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [2048, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(101782400))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(103879616))))[name = string("lm_heads_11_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1, 1]> logits_23_cast_fp16 = conv(dilations = logits_23_dilations_0, groups = logits_23_groups_0, pad = logits_23_pad_0, pad_type = logits_23_pad_type_0, strides = logits_23_strides_0, weight = lm_heads_11_weight_to_fp16_palettized, x = input_cast_fp16)[name = string("logits_23_cast_fp16")];
+            tensor<int32, [1]> var_2236_axes_0 = const()[name = string("op_2236_axes_0"), val = tensor<int32, [1]>([3])];
+            tensor<fp16, [1, 2048, 1]> var_2236_cast_fp16 = squeeze(axes = var_2236_axes_0, x = logits_23_cast_fp16)[name = string("op_2236_cast_fp16")];
+            string logits_25_pad_type_0 = const()[name = string("logits_25_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> logits_25_strides_0 = const()[name = string("logits_25_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> logits_25_pad_0 = const()[name = string("logits_25_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> logits_25_dilations_0 = const()[name = string("logits_25_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 logits_25_groups_0 = const()[name = string("logits_25_groups_0"), val = int32(1)];
+            tensor<fp16, [2048, 1024, 1, 1]> lm_heads_12_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [2048, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(103880192))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(105977408))))[name = string("lm_heads_12_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1, 1]> logits_25_cast_fp16 = conv(dilations = logits_25_dilations_0, groups = logits_25_groups_0, pad = logits_25_pad_0, pad_type = logits_25_pad_type_0, strides = logits_25_strides_0, weight = lm_heads_12_weight_to_fp16_palettized, x = input_cast_fp16)[name = string("logits_25_cast_fp16")];
+            tensor<int32, [1]> var_2252_axes_0 = const()[name = string("op_2252_axes_0"), val = tensor<int32, [1]>([3])];
+            tensor<fp16, [1, 2048, 1]> var_2252_cast_fp16 = squeeze(axes = var_2252_axes_0, x = logits_25_cast_fp16)[name = string("op_2252_cast_fp16")];
+            string logits_27_pad_type_0 = const()[name = string("logits_27_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> logits_27_strides_0 = const()[name = string("logits_27_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> logits_27_pad_0 = const()[name = string("logits_27_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> logits_27_dilations_0 = const()[name = string("logits_27_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 logits_27_groups_0 = const()[name = string("logits_27_groups_0"), val = int32(1)];
+            tensor<fp16, [2048, 1024, 1, 1]> lm_heads_13_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [2048, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(105977984))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(108075200))))[name = string("lm_heads_13_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1, 1]> logits_27_cast_fp16 = conv(dilations = logits_27_dilations_0, groups = logits_27_groups_0, pad = logits_27_pad_0, pad_type = logits_27_pad_type_0, strides = logits_27_strides_0, weight = lm_heads_13_weight_to_fp16_palettized, x = input_cast_fp16)[name = string("logits_27_cast_fp16")];
+            tensor<int32, [1]> var_2268_axes_0 = const()[name = string("op_2268_axes_0"), val = tensor<int32, [1]>([3])];
+            tensor<fp16, [1, 2048, 1]> var_2268_cast_fp16 = squeeze(axes = var_2268_axes_0, x = logits_27_cast_fp16)[name = string("op_2268_cast_fp16")];
+            string logits_29_pad_type_0 = const()[name = string("logits_29_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> logits_29_strides_0 = const()[name = string("logits_29_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> logits_29_pad_0 = const()[name = string("logits_29_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> logits_29_dilations_0 = const()[name = string("logits_29_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 logits_29_groups_0 = const()[name = string("logits_29_groups_0"), val = int32(1)];
+            tensor<fp16, [2048, 1024, 1, 1]> lm_heads_14_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [2048, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(108075776))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(110172992))))[name = string("lm_heads_14_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1, 1]> logits_29_cast_fp16 = conv(dilations = logits_29_dilations_0, groups = logits_29_groups_0, pad = logits_29_pad_0, pad_type = logits_29_pad_type_0, strides = logits_29_strides_0, weight = lm_heads_14_weight_to_fp16_palettized, x = input_cast_fp16)[name = string("logits_29_cast_fp16")];
+            tensor<int32, [1]> var_2284_axes_0 = const()[name = string("op_2284_axes_0"), val = tensor<int32, [1]>([3])];
+            tensor<fp16, [1, 2048, 1]> var_2284_cast_fp16 = squeeze(axes = var_2284_axes_0, x = logits_29_cast_fp16)[name = string("op_2284_cast_fp16")];
+            bool var_2290_interleave_0 = const()[name = string("op_2290_interleave_0"), val = bool(false)];
+            int32 const_119 = const()[name = string("const_119"), val = int32(2)];
+            tensor<fp16, [1, 2048, 15]> var_2290_cast_fp16 = concat(axis = const_119, interleave = var_2290_interleave_0, values = (var_2060_cast_fp16, var_2076_cast_fp16, var_2092_cast_fp16, var_2108_cast_fp16, var_2124_cast_fp16, var_2140_cast_fp16, var_2156_cast_fp16, var_2172_cast_fp16, var_2188_cast_fp16, var_2204_cast_fp16, var_2220_cast_fp16, var_2236_cast_fp16, var_2252_cast_fp16, var_2268_cast_fp16, var_2284_cast_fp16))[name = string("op_2290_cast_fp16")];
+            int32 var_2292 = const()[name = string("op_2292"), val = int32(1)];
+            bool var_2293_interleave_0 = const()[name = string("op_2293_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 5120, 1, 1]> key_cache_updates = concat(axis = var_2292, interleave = var_2293_interleave_0, values = (current_key_3_cast_fp16, current_key_7_cast_fp16, current_key_11_cast_fp16, current_key_15_cast_fp16, current_key_cast_fp16))[name = string("op_2293_cast_fp16")];
+            int32 var_2295 = const()[name = string("op_2295"), val = int32(1)];
+            bool var_2296_interleave_0 = const()[name = string("op_2296_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 5120, 1, 1]> value_cache_updates = concat(axis = var_2295, interleave = var_2296_interleave_0, values = (current_value_1_cast_fp16, current_value_3_cast_fp16, current_value_5_cast_fp16, current_value_7_cast_fp16, current_value_cast_fp16))[name = string("op_2296_cast_fp16")];
+            tensor<int32, [3]> transpose_0_perm_0 = const()[name = string("transpose_0_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<fp16, [1, 15, 2048]> all_logits = transpose(perm = transpose_0_perm_0, x = var_2290_cast_fp16)[name = string("transpose_0")];
+        } -> (all_logits, key_cache_updates, value_cache_updates);
+}
\ No newline at end of file