diff --git a/qwen3_tts/code_decoder/12hz-0.6b-customvoice/W8A16-stateful/CodeDecoder.mlmodelc/analytics/coremldata.bin b/qwen3_tts/code_decoder/12hz-0.6b-customvoice/W8A16-stateful/CodeDecoder.mlmodelc/analytics/coremldata.bin
new file mode 100644
index 0000000000000000000000000000000000000000..6728082fc1d11a82caf633945e6025ea54abca1d
--- /dev/null
+++ b/qwen3_tts/code_decoder/12hz-0.6b-customvoice/W8A16-stateful/CodeDecoder.mlmodelc/analytics/coremldata.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:dbc1ea6ac3fa1e7a07a9e19a4407badc3cdf153bc2382b6a6d408a880f3964c3
+size 243
diff --git a/qwen3_tts/code_decoder/12hz-0.6b-customvoice/W8A16-stateful/CodeDecoder.mlmodelc/coremldata.bin b/qwen3_tts/code_decoder/12hz-0.6b-customvoice/W8A16-stateful/CodeDecoder.mlmodelc/coremldata.bin
new file mode 100644
index 0000000000000000000000000000000000000000..ef628a86960e7c47df09d2f5b3b8d4dfe30719e4
--- /dev/null
+++ b/qwen3_tts/code_decoder/12hz-0.6b-customvoice/W8A16-stateful/CodeDecoder.mlmodelc/coremldata.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8c9e439b240cd4bf60f1a142b4c3d34ea5ae64f57c9e2fd407606cbdae76a4d2
+size 671
diff --git a/qwen3_tts/code_decoder/12hz-0.6b-customvoice/W8A16-stateful/CodeDecoder.mlmodelc/metadata.json b/qwen3_tts/code_decoder/12hz-0.6b-customvoice/W8A16-stateful/CodeDecoder.mlmodelc/metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..50ed46e4c852a14e35e87158f471b94404040574
--- /dev/null
+++ b/qwen3_tts/code_decoder/12hz-0.6b-customvoice/W8A16-stateful/CodeDecoder.mlmodelc/metadata.json
@@ -0,0 +1,159 @@
+[
+  {
+    "metadataOutputVersion" : "3.0",
+    "storagePrecision" : "Mixed (Float16, Int32, Palettized (8 bits), UInt8)",
+    "outputSchema" : [
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 1 × 3072)",
+        "shortDescription" : "",
+        "shape" : "[1, 1, 3072]",
+        "name" : "logits",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 1024 × 1 × 1)",
+        "shortDescription" : "",
+        "shape" : "[1, 1024, 1, 1]",
+        "name" : "hidden_states",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 28672 × 1 × 1)",
+        "shortDescription" : "",
+        "shape" : "[1, 28672, 1, 1]",
+        "name" : "key_cache_updates",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 28672 × 1 × 1)",
+        "shortDescription" : "",
+        "shape" : "[1, 28672, 1, 1]",
+        "name" : "value_cache_updates",
+        "type" : "MultiArray"
+      }
+    ],
+    "modelParameters" : [
+
+    ],
+    "specificationVersion" : 9,
+    "mlProgramOperationTypeHistogram" : {
+      "Ios18.expandDims" : 8,
+      "Ios18.softmax" : 28,
+      "Ios18.mul" : 675,
+      "Ios18.matmul" : 56,
+      "Ios18.rsqrt" : 113,
+      "Ios16.reduceMean" : 113,
+      "Split" : 2,
+      "Ios18.greaterEqual" : 2,
+      "Select" : 2,
+      "Ios18.readState" : 2,
+      "Ios18.gather" : 2,
+      "Ios18.add" : 311,
+      "Ios18.reshape" : 224,
+      "Ios18.constexprLutToDense" : 199,
+      "Ios18.conv" : 197,
+      "Ios18.concat" : 114,
+      "Ios18.cast" : 5,
+      "Ios18.sub" : 1,
+      "Ios18.silu" : 28,
+      "Ios18.transpose" : 1,
+      "Ios18.sliceByIndex" : 560,
+      "Ios18.squeeze" : 1
+    },
+    "computePrecision" : "Mixed (Float16, Float32, Int16, Int32, UInt16)",
+    "isUpdatable" : "0",
+    "stateSchema" : [
+      {
+        "dataType" : "Float16",
+        "isOptional" : "0",
+        "formattedType" : "State (Float16 1 × 28672 × 1 × 256)",
+        "shortDescription" : "",
+        "shape" : "[1, 28672, 1, 256]",
+        "name" : "self_attn_key_cache",
+        "type" : "State"
+      },
+      {
+        "dataType" : "Float16",
+        "isOptional" : "0",
+        "formattedType" : "State (Float16 1 × 28672 × 1 × 256)",
+        "shortDescription" : "",
+        "shape" : "[1, 28672, 1, 256]",
+        "name" : "self_attn_value_cache",
+        "type" : "State"
+      }
+    ],
+    "availability" : {
+      "macOS" : "15.0",
+      "tvOS" : "18.0",
+      "visionOS" : "2.0",
+      "watchOS" : "11.0",
+      "iOS" : "18.0",
+      "macCatalyst" : "18.0"
+    },
+    "modelType" : {
+      "name" : "MLModelType_mlProgram"
+    },
+    "userDefinedMetadata" : {
+      "com.github.apple.coremltools.conversion_date" : "2026-02-17",
+      "com.github.apple.coremltools.source" : "torch==2.8.0",
+      "com.github.apple.coremltools.version" : "9.0",
+      "com.github.apple.coremltools.source_dialect" : "TorchScript"
+    },
+    "inputSchema" : [
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 1024 × 1 × 1)",
+        "shortDescription" : "",
+        "shape" : "[1, 1024, 1, 1]",
+        "name" : "input_embeds",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Int32",
+        "formattedType" : "MultiArray (Int32 1)",
+        "shortDescription" : "",
+        "shape" : "[1]",
+        "name" : "cache_length",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 256)",
+        "shortDescription" : "",
+        "shape" : "[1, 256]",
+        "name" : "kv_cache_update_mask",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 256)",
+        "shortDescription" : "",
+        "shape" : "[1, 256]",
+        "name" : "key_padding_mask",
+        "type" : "MultiArray"
+      }
+    ],
+    "generatedClassName" : "CodeDecoderWithStatefulSelfAttention_8_bit",
+    "method" : "predict"
+  }
+]
\ No newline at end of file
diff --git a/qwen3_tts/code_decoder/12hz-0.6b-customvoice/W8A16-stateful/CodeDecoder.mlmodelc/model.mil b/qwen3_tts/code_decoder/12hz-0.6b-customvoice/W8A16-stateful/CodeDecoder.mlmodelc/model.mil
new file mode 100644
index 0000000000000000000000000000000000000000..e05f2fafecad5facb7d3418c5689350bc9144cc3
--- /dev/null
+++ b/qwen3_tts/code_decoder/12hz-0.6b-customvoice/W8A16-stateful/CodeDecoder.mlmodelc/model.mil
@@ -0,0 +1,6532 @@
+program(1.3)
+[buildInfo = dict<string, string>({{"coremlc-component-MIL", "3510.2.1"}, {"coremlc-version", "3500.32.1"}})]
+{
+    func main<ios18>(tensor<int32, [1]> cache_length, tensor<fp16, [1, 1024, 1, 1]> input_embeds, tensor<fp16, [1, 256]> key_padding_mask, tensor<fp16, [1, 256]> kv_cache_update_mask, state<tensor<fp16, [1, 28672, 1, 256]>> self_attn_key_cache, state<tensor<fp16, [1, 28672, 1, 256]>> self_attn_value_cache) {
+            int32 pos_cos_batch_dims_0 = const()[name = string("pos_cos_batch_dims_0"), val = int32(0)];
+            bool pos_cos_validate_indices_0 = const()[name = string("pos_cos_validate_indices_0"), val = bool(false)];
+            tensor<fp16, [256, 128]> position_embeddings_cos_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [256, 128]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(64))), lut = tensor<fp16, [1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(32896))))[name = string("position_embeddings_cos_weight_to_fp16_palettized")];
+            string cache_length_to_int16_dtype_0 = const()[name = string("cache_length_to_int16_dtype_0"), val = string("int16")];
+            string cast_572_dtype_0 = const()[name = string("cast_572_dtype_0"), val = string("int32")];
+            int32 greater_equal_0_y_0 = const()[name = string("greater_equal_0_y_0"), val = int32(0)];
+            tensor<int16, [1]> cache_length_to_int16 = cast(dtype = cache_length_to_int16_dtype_0, x = cache_length)[name = string("cast_5")];
+            tensor<int32, [1]> cast_572 = cast(dtype = cast_572_dtype_0, x = cache_length_to_int16)[name = string("cast_4")];
+            tensor<bool, [1]> greater_equal_0 = greater_equal(x = cast_572, y = greater_equal_0_y_0)[name = string("greater_equal_0")];
+            int32 slice_by_index_0 = const()[name = string("slice_by_index_0"), val = int32(256)];
+            tensor<int32, [1]> add_0 = add(x = cast_572, y = slice_by_index_0)[name = string("add_0")];
+            tensor<int32, [1]> select_0 = select(a = cast_572, b = add_0, cond = greater_equal_0)[name = string("select_0")];
+            string select_0_to_int16_dtype_0 = const()[name = string("select_0_to_int16_dtype_0"), val = string("int16")];
+            string cast_0_dtype_0 = const()[name = string("cast_0_dtype_0"), val = string("int32")];
+            int32 greater_equal_0_y_0_1 = const()[name = string("greater_equal_0_y_0_1"), val = int32(0)];
+            tensor<int16, [1]> select_0_to_int16 = cast(dtype = select_0_to_int16_dtype_0, x = select_0)[name = string("cast_3")];
+            tensor<int32, [1]> cast_0 = cast(dtype = cast_0_dtype_0, x = select_0_to_int16)[name = string("cast_2")];
+            tensor<bool, [1]> greater_equal_0_1 = greater_equal(x = cast_0, y = greater_equal_0_y_0_1)[name = string("greater_equal_0_1")];
+            int32 slice_by_index_0_1 = const()[name = string("slice_by_index_0_1"), val = int32(256)];
+            tensor<int32, [1]> add_0_1 = add(x = cast_0, y = slice_by_index_0_1)[name = string("add_0_1")];
+            tensor<int32, [1]> select_0_1 = select(a = cast_0, b = add_0_1, cond = greater_equal_0_1)[name = string("select_0_1")];
+            int32 pos_cos_cast_fp16_cast_uint16_cast_uint16_axis_0 = const()[name = string("pos_cos_cast_fp16_cast_uint16_cast_uint16_axis_0"), val = int32(0)];
+            tensor<fp16, [1, 128]> pos_cos_cast_fp16_cast_uint16_cast_uint16 = gather(axis = pos_cos_cast_fp16_cast_uint16_cast_uint16_axis_0, batch_dims = pos_cos_batch_dims_0, indices = select_0_1, validate_indices = pos_cos_validate_indices_0, x = position_embeddings_cos_weight_to_fp16_palettized)[name = string("pos_cos_cast_fp16_cast_uint16_cast_uint16")];
+            tensor<int32, [1]> obj_7_axes_0 = const()[name = string("obj_7_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 128, 1]> obj_7_cast_fp16 = expand_dims(axes = obj_7_axes_0, x = pos_cos_cast_fp16_cast_uint16_cast_uint16)[name = string("obj_7_cast_fp16")];
+            int32 pos_sin_axis_0 = const()[name = string("pos_sin_axis_0"), val = int32(0)];
+            int32 pos_sin_batch_dims_0 = const()[name = string("pos_sin_batch_dims_0"), val = int32(0)];
+            bool pos_sin_validate_indices_0 = const()[name = string("pos_sin_validate_indices_0"), val = bool(false)];
+            tensor<fp16, [256, 128]> position_embeddings_sin_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [256, 128]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(33472))), lut = tensor<fp16, [1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(66304))))[name = string("position_embeddings_sin_weight_to_fp16_palettized")];
+            string cache_length_to_uint16_dtype_0 = const()[name = string("cache_length_to_uint16_dtype_0"), val = string("uint16")];
+            tensor<uint16, [1]> cache_length_to_uint16 = cast(dtype = cache_length_to_uint16_dtype_0, x = cache_length)[name = string("cast_1")];
+            tensor<fp16, [1, 128]> pos_sin_cast_fp16_cast_uint16 = gather(axis = pos_sin_axis_0, batch_dims = pos_sin_batch_dims_0, indices = cache_length_to_uint16, validate_indices = pos_sin_validate_indices_0, x = position_embeddings_sin_weight_to_fp16_palettized)[name = string("pos_sin_cast_fp16_cast_uint16")];
+            tensor<int32, [1]> obj_9_axes_0 = const()[name = string("obj_9_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 128, 1]> obj_9_cast_fp16 = expand_dims(axes = obj_9_axes_0, x = pos_sin_cast_fp16_cast_uint16)[name = string("obj_9_cast_fp16")];
+            tensor<fp16, [1, 28672, 1, 256]> read_state_0 = read_state(input = self_attn_key_cache)[name = string("read_state_0")];
+            tensor<int32, [28]> tile_0 = const()[name = string("tile_0"), val = tensor<int32, [28]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(66880)))];
+            int32 var_101_axis_0 = const()[name = string("op_101_axis_0"), val = int32(1)];
+            tensor<fp16, [1, 1024, 1, 256]> var_101_cast_fp16_0, tensor<fp16, [1, 1024, 1, 256]> var_101_cast_fp16_1, tensor<fp16, [1, 1024, 1, 256]> var_101_cast_fp16_2, tensor<fp16, [1, 1024, 1, 256]> var_101_cast_fp16_3, tensor<fp16, [1, 1024, 1, 256]> var_101_cast_fp16_4, tensor<fp16, [1, 1024, 1, 256]> var_101_cast_fp16_5, tensor<fp16, [1, 1024, 1, 256]> var_101_cast_fp16_6, tensor<fp16, [1, 1024, 1, 256]> var_101_cast_fp16_7, tensor<fp16, [1, 1024, 1, 256]> var_101_cast_fp16_8, tensor<fp16, [1, 1024, 1, 256]> var_101_cast_fp16_9, tensor<fp16, [1, 1024, 1, 256]> var_101_cast_fp16_10, tensor<fp16, [1, 1024, 1, 256]> var_101_cast_fp16_11, tensor<fp16, [1, 1024, 1, 256]> var_101_cast_fp16_12, tensor<fp16, [1, 1024, 1, 256]> var_101_cast_fp16_13, tensor<fp16, [1, 1024, 1, 256]> var_101_cast_fp16_14, tensor<fp16, [1, 1024, 1, 256]> var_101_cast_fp16_15, tensor<fp16, [1, 1024, 1, 256]> var_101_cast_fp16_16, tensor<fp16, [1, 1024, 1, 256]> var_101_cast_fp16_17, tensor<fp16, [1, 1024, 1, 256]> var_101_cast_fp16_18, tensor<fp16, [1, 1024, 1, 256]> var_101_cast_fp16_19, tensor<fp16, [1, 1024, 1, 256]> var_101_cast_fp16_20, tensor<fp16, [1, 1024, 1, 256]> var_101_cast_fp16_21, tensor<fp16, [1, 1024, 1, 256]> var_101_cast_fp16_22, tensor<fp16, [1, 1024, 1, 256]> var_101_cast_fp16_23, tensor<fp16, [1, 1024, 1, 256]> var_101_cast_fp16_24, tensor<fp16, [1, 1024, 1, 256]> var_101_cast_fp16_25, tensor<fp16, [1, 1024, 1, 256]> var_101_cast_fp16_26, tensor<fp16, [1, 1024, 1, 256]> var_101_cast_fp16_27 = split(axis = var_101_axis_0, split_sizes = tile_0, x = read_state_0)[name = string("op_101_cast_fp16")];
+            tensor<fp16, [1, 28672, 1, 256]> read_state_1 = read_state(input = self_attn_value_cache)[name = string("read_state_1")];
+            tensor<int32, [28]> tile_1 = const()[name = string("tile_1"), val = tensor<int32, [28]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(67072)))];
+            int32 var_132_axis_0 = const()[name = string("op_132_axis_0"), val = int32(1)];
+            tensor<fp16, [1, 1024, 1, 256]> var_132_cast_fp16_0, tensor<fp16, [1, 1024, 1, 256]> var_132_cast_fp16_1, tensor<fp16, [1, 1024, 1, 256]> var_132_cast_fp16_2, tensor<fp16, [1, 1024, 1, 256]> var_132_cast_fp16_3, tensor<fp16, [1, 1024, 1, 256]> var_132_cast_fp16_4, tensor<fp16, [1, 1024, 1, 256]> var_132_cast_fp16_5, tensor<fp16, [1, 1024, 1, 256]> var_132_cast_fp16_6, tensor<fp16, [1, 1024, 1, 256]> var_132_cast_fp16_7, tensor<fp16, [1, 1024, 1, 256]> var_132_cast_fp16_8, tensor<fp16, [1, 1024, 1, 256]> var_132_cast_fp16_9, tensor<fp16, [1, 1024, 1, 256]> var_132_cast_fp16_10, tensor<fp16, [1, 1024, 1, 256]> var_132_cast_fp16_11, tensor<fp16, [1, 1024, 1, 256]> var_132_cast_fp16_12, tensor<fp16, [1, 1024, 1, 256]> var_132_cast_fp16_13, tensor<fp16, [1, 1024, 1, 256]> var_132_cast_fp16_14, tensor<fp16, [1, 1024, 1, 256]> var_132_cast_fp16_15, tensor<fp16, [1, 1024, 1, 256]> var_132_cast_fp16_16, tensor<fp16, [1, 1024, 1, 256]> var_132_cast_fp16_17, tensor<fp16, [1, 1024, 1, 256]> var_132_cast_fp16_18, tensor<fp16, [1, 1024, 1, 256]> var_132_cast_fp16_19, tensor<fp16, [1, 1024, 1, 256]> var_132_cast_fp16_20, tensor<fp16, [1, 1024, 1, 256]> var_132_cast_fp16_21, tensor<fp16, [1, 1024, 1, 256]> var_132_cast_fp16_22, tensor<fp16, [1, 1024, 1, 256]> var_132_cast_fp16_23, tensor<fp16, [1, 1024, 1, 256]> var_132_cast_fp16_24, tensor<fp16, [1, 1024, 1, 256]> var_132_cast_fp16_25, tensor<fp16, [1, 1024, 1, 256]> var_132_cast_fp16_26, tensor<fp16, [1, 1024, 1, 256]> var_132_cast_fp16_27 = split(axis = var_132_axis_0, split_sizes = tile_1, x = read_state_1)[name = string("op_132_cast_fp16")];
+            int32 var_162 = const()[name = string("op_162"), val = int32(3)];
+            int32 var_172 = const()[name = string("op_172"), val = int32(-2)];
+            int32 var_180 = const()[name = string("op_180"), val = int32(1)];
+            tensor<fp16, [1, 1024, 1, 1]> inputs_sq_1_cast_fp16 = mul(x = input_embeds, y = input_embeds)[name = string("inputs_sq_1_cast_fp16")];
+            tensor<int32, [1]> variance_1_axes_0 = const()[name = string("variance_1_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_1_keep_dims_0 = const()[name = string("variance_1_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 1, 1, 1]> variance_1_cast_fp16 = reduce_mean(axes = variance_1_axes_0, keep_dims = variance_1_keep_dims_0, x = inputs_sq_1_cast_fp16)[name = string("variance_1_cast_fp16")];
+            fp16 var_192_to_fp16 = const()[name = string("op_192_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 1, 1]> var_193_cast_fp16 = add(x = variance_1_cast_fp16, y = var_192_to_fp16)[name = string("op_193_cast_fp16")];
+            fp32 var_194_epsilon_0 = const()[name = string("op_194_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 1, 1, 1]> var_194_cast_fp16 = rsqrt(epsilon = var_194_epsilon_0, x = var_193_cast_fp16)[name = string("op_194_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> hidden_states_1_cast_fp16 = mul(x = input_embeds, y = var_194_cast_fp16)[name = string("hidden_states_1_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> w_1_to_fp16 = const()[name = string("w_1_to_fp16"), val = tensor<fp16, [1, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(67264)))];
+            tensor<fp16, [1, 1024, 1, 1]> obj_1_cast_fp16 = mul(x = w_1_to_fp16, y = hidden_states_1_cast_fp16)[name = string("obj_1_cast_fp16")];
+            string query_1_pad_type_0 = const()[name = string("query_1_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> query_1_strides_0 = const()[name = string("query_1_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> query_1_pad_0 = const()[name = string("query_1_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> query_1_dilations_0 = const()[name = string("query_1_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 query_1_groups_0 = const()[name = string("query_1_groups_0"), val = int32(1)];
+            tensor<fp16, [2048, 1024, 1, 1]> layers_0_self_attn_q_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [2048, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(69376))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(2166592))))[name = string("layers_0_self_attn_q_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [2048]> layers_0_self_attn_q_proj_bias_to_fp16 = const()[name = string("layers_0_self_attn_q_proj_bias_to_fp16"), val = tensor<fp16, [2048]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(2167168)))];
+            tensor<fp16, [1, 2048, 1, 1]> query_1_cast_fp16 = conv(bias = layers_0_self_attn_q_proj_bias_to_fp16, dilations = query_1_dilations_0, groups = query_1_groups_0, pad = query_1_pad_0, pad_type = query_1_pad_type_0, strides = query_1_strides_0, weight = layers_0_self_attn_q_proj_weight_to_fp16_palettized, x = obj_1_cast_fp16)[name = string("query_1_cast_fp16")];
+            string current_key_1_pad_type_0 = const()[name = string("current_key_1_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> current_key_1_strides_0 = const()[name = string("current_key_1_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> current_key_1_pad_0 = const()[name = string("current_key_1_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> current_key_1_dilations_0 = const()[name = string("current_key_1_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 current_key_1_groups_0 = const()[name = string("current_key_1_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_0_self_attn_k_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(2171328))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(3219968))))[name = string("layers_0_self_attn_k_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> current_key_1_cast_fp16 = conv(dilations = current_key_1_dilations_0, groups = current_key_1_groups_0, pad = current_key_1_pad_0, pad_type = current_key_1_pad_type_0, strides = current_key_1_strides_0, weight = layers_0_self_attn_k_proj_weight_to_fp16_palettized, x = obj_1_cast_fp16)[name = string("current_key_1_cast_fp16")];
+            string current_value_1_pad_type_0 = const()[name = string("current_value_1_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> current_value_1_strides_0 = const()[name = string("current_value_1_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> current_value_1_pad_0 = const()[name = string("current_value_1_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> current_value_1_dilations_0 = const()[name = string("current_value_1_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 current_value_1_groups_0 = const()[name = string("current_value_1_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_0_self_attn_v_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(3220544))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(4269184))))[name = string("layers_0_self_attn_v_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1024]> layers_0_self_attn_v_proj_bias_to_fp16 = const()[name = string("layers_0_self_attn_v_proj_bias_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(4269760)))];
+            tensor<fp16, [1, 1024, 1, 1]> current_value_1_cast_fp16 = conv(bias = layers_0_self_attn_v_proj_bias_to_fp16, dilations = current_value_1_dilations_0, groups = current_value_1_groups_0, pad = current_value_1_pad_0, pad_type = current_value_1_pad_type_0, strides = current_value_1_strides_0, weight = layers_0_self_attn_v_proj_weight_to_fp16_palettized, x = obj_1_cast_fp16)[name = string("current_value_1_cast_fp16")];
+            tensor<int32, [4]> var_231 = const()[name = string("op_231"), val = tensor<int32, [4]>([16, 128, 1, 1])];
+            tensor<fp16, [16, 128, 1, 1]> inputs_1_cast_fp16 = reshape(shape = var_231, x = query_1_cast_fp16)[name = string("inputs_1_cast_fp16")];
+            tensor<fp16, [16, 128, 1, 1]> inputs_sq_3_cast_fp16 = mul(x = inputs_1_cast_fp16, y = inputs_1_cast_fp16)[name = string("inputs_sq_3_cast_fp16")];
+            tensor<int32, [1]> variance_3_axes_0 = const()[name = string("variance_3_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_3_keep_dims_0 = const()[name = string("variance_3_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [16, 1, 1, 1]> variance_3_cast_fp16 = reduce_mean(axes = variance_3_axes_0, keep_dims = variance_3_keep_dims_0, x = inputs_sq_3_cast_fp16)[name = string("variance_3_cast_fp16")];
+            fp16 var_237_to_fp16 = const()[name = string("op_237_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [16, 1, 1, 1]> var_238_cast_fp16 = add(x = variance_3_cast_fp16, y = var_237_to_fp16)[name = string("op_238_cast_fp16")];
+            fp32 var_239_epsilon_0 = const()[name = string("op_239_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [16, 1, 1, 1]> var_239_cast_fp16 = rsqrt(epsilon = var_239_epsilon_0, x = var_238_cast_fp16)[name = string("op_239_cast_fp16")];
+            tensor<fp16, [16, 128, 1, 1]> hidden_states_3_cast_fp16 = mul(x = inputs_1_cast_fp16, y = var_239_cast_fp16)[name = string("hidden_states_3_cast_fp16")];
+            tensor<fp16, [1, 128, 1, 1]> w_3_to_fp16 = const()[name = string("w_3_to_fp16"), val = tensor<fp16, [1, 128, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(4271872)))];
+            tensor<fp16, [16, 128, 1, 1]> query_normed_1_cast_fp16 = mul(x = w_3_to_fp16, y = hidden_states_3_cast_fp16)[name = string("query_normed_1_cast_fp16")];
+            tensor<int32, [4]> var_247 = const()[name = string("op_247"), val = tensor<int32, [4]>([8, 128, 1, 1])];
+            tensor<fp16, [8, 128, 1, 1]> inputs_3_cast_fp16 = reshape(shape = var_247, x = current_key_1_cast_fp16)[name = string("inputs_3_cast_fp16")];
+            tensor<fp16, [8, 128, 1, 1]> inputs_sq_5_cast_fp16 = mul(x = inputs_3_cast_fp16, y = inputs_3_cast_fp16)[name = string("inputs_sq_5_cast_fp16")];
+            tensor<int32, [1]> variance_5_axes_0 = const()[name = string("variance_5_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_5_keep_dims_0 = const()[name = string("variance_5_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [8, 1, 1, 1]> variance_5_cast_fp16 = reduce_mean(axes = variance_5_axes_0, keep_dims = variance_5_keep_dims_0, x = inputs_sq_5_cast_fp16)[name = string("variance_5_cast_fp16")];
+            fp16 var_253_to_fp16 = const()[name = string("op_253_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [8, 1, 1, 1]> var_254_cast_fp16 = add(x = variance_5_cast_fp16, y = var_253_to_fp16)[name = string("op_254_cast_fp16")];
+            fp32 var_255_epsilon_0 = const()[name = string("op_255_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [8, 1, 1, 1]> var_255_cast_fp16 = rsqrt(epsilon = var_255_epsilon_0, x = var_254_cast_fp16)[name = string("op_255_cast_fp16")];
+            tensor<fp16, [8, 128, 1, 1]> hidden_states_5_cast_fp16 = mul(x = inputs_3_cast_fp16, y = var_255_cast_fp16)[name = string("hidden_states_5_cast_fp16")];
+            tensor<fp16, [1, 128, 1, 1]> w_5_to_fp16 = const()[name = string("w_5_to_fp16"), val = tensor<fp16, [1, 128, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(4272192)))];
+            tensor<fp16, [8, 128, 1, 1]> current_key_normed_1_cast_fp16 = mul(x = w_5_to_fp16, y = hidden_states_5_cast_fp16)[name = string("current_key_normed_1_cast_fp16")];
+            tensor<int32, [4]> var_273 = const()[name = string("op_273"), val = tensor<int32, [4]>([1, 16, 128, -1])];
+            tensor<fp16, [1, 16, 128, 1]> mh_q_1_cast_fp16 = reshape(shape = var_273, x = query_normed_1_cast_fp16)[name = string("mh_q_1_cast_fp16")];
+            tensor<int32, [4]> var_275 = const()[name = string("op_275"), val = tensor<int32, [4]>([1, 8, 128, -1])];
+            tensor<fp16, [1, 8, 128, 1]> mh_k_1_cast_fp16 = reshape(shape = var_275, x = current_key_normed_1_cast_fp16)[name = string("mh_k_1_cast_fp16")];
+            tensor<int32, [1]> cos_1_axes_0 = const()[name = string("cos_1_axes_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [1, 1, 128, 1]> cos_1_cast_fp16 = expand_dims(axes = cos_1_axes_0, x = obj_7_cast_fp16)[name = string("cos_1_cast_fp16")];
+            tensor<int32, [1]> sin_1_axes_0 = const()[name = string("sin_1_axes_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [1, 1, 128, 1]> sin_1_cast_fp16 = expand_dims(axes = sin_1_axes_0, x = obj_9_cast_fp16)[name = string("sin_1_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> var_279_cast_fp16 = mul(x = mh_q_1_cast_fp16, y = cos_1_cast_fp16)[name = string("op_279_cast_fp16")];
+            tensor<int32, [4]> var_284_begin_0 = const()[name = string("op_284_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_284_end_0 = const()[name = string("op_284_end_0"), val = tensor<int32, [4]>([1, 16, 64, 1])];
+            tensor<bool, [4]> var_284_end_mask_0 = const()[name = string("op_284_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 16, 64, 1]> var_284_cast_fp16 = slice_by_index(begin = var_284_begin_0, end = var_284_end_0, end_mask = var_284_end_mask_0, x = mh_q_1_cast_fp16)[name = string("op_284_cast_fp16")];
+            tensor<int32, [4]> var_290_begin_0 = const()[name = string("op_290_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_290_end_0 = const()[name = string("op_290_end_0"), val = tensor<int32, [4]>([1, 16, 128, 1])];
+            tensor<bool, [4]> var_290_end_mask_0 = const()[name = string("op_290_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 16, 64, 1]> var_290_cast_fp16 = slice_by_index(begin = var_290_begin_0, end = var_290_end_0, end_mask = var_290_end_mask_0, x = mh_q_1_cast_fp16)[name = string("op_290_cast_fp16")];
+            fp16 const_17_promoted_to_fp16 = const()[name = string("const_17_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 16, 64, 1]> var_292_cast_fp16 = mul(x = var_290_cast_fp16, y = const_17_promoted_to_fp16)[name = string("op_292_cast_fp16")];
+            bool var_294_interleave_0 = const()[name = string("op_294_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 1]> var_294_cast_fp16 = concat(axis = var_172, interleave = var_294_interleave_0, values = (var_292_cast_fp16, var_284_cast_fp16))[name = string("op_294_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> var_295_cast_fp16 = mul(x = var_294_cast_fp16, y = sin_1_cast_fp16)[name = string("op_295_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> mh_q_3_cast_fp16 = add(x = var_279_cast_fp16, y = var_295_cast_fp16)[name = string("mh_q_3_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> var_297_cast_fp16 = mul(x = mh_k_1_cast_fp16, y = cos_1_cast_fp16)[name = string("op_297_cast_fp16")];
+            tensor<int32, [4]> var_302_begin_0 = const()[name = string("op_302_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_302_end_0 = const()[name = string("op_302_end_0"), val = tensor<int32, [4]>([1, 8, 64, 1])];
+            tensor<bool, [4]> var_302_end_mask_0 = const()[name = string("op_302_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 8, 64, 1]> var_302_cast_fp16 = slice_by_index(begin = var_302_begin_0, end = var_302_end_0, end_mask = var_302_end_mask_0, x = mh_k_1_cast_fp16)[name = string("op_302_cast_fp16")];
+            tensor<int32, [4]> var_308_begin_0 = const()[name = string("op_308_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_308_end_0 = const()[name = string("op_308_end_0"), val = tensor<int32, [4]>([1, 8, 128, 1])];
+            tensor<bool, [4]> var_308_end_mask_0 = const()[name = string("op_308_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 8, 64, 1]> var_308_cast_fp16 = slice_by_index(begin = var_308_begin_0, end = var_308_end_0, end_mask = var_308_end_mask_0, x = mh_k_1_cast_fp16)[name = string("op_308_cast_fp16")];
+            fp16 const_20_promoted_to_fp16 = const()[name = string("const_20_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 64, 1]> var_310_cast_fp16 = mul(x = var_308_cast_fp16, y = const_20_promoted_to_fp16)[name = string("op_310_cast_fp16")];
+            bool var_312_interleave_0 = const()[name = string("op_312_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 128, 1]> var_312_cast_fp16 = concat(axis = var_172, interleave = var_312_interleave_0, values = (var_310_cast_fp16, var_302_cast_fp16))[name = string("op_312_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> var_313_cast_fp16 = mul(x = var_312_cast_fp16, y = sin_1_cast_fp16)[name = string("op_313_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> mh_k_3_cast_fp16 = add(x = var_297_cast_fp16, y = var_313_cast_fp16)[name = string("mh_k_3_cast_fp16")];
+            tensor<int32, [4]> var_317 = const()[name = string("op_317"), val = tensor<int32, [4]>([1, 1024, 1, 1])];
+            tensor<fp16, [1, 1024, 1, 1]> current_key_3_cast_fp16 = reshape(shape = var_317, x = mh_k_3_cast_fp16)[name = string("current_key_3_cast_fp16")];
+            tensor<int32, [1]> var_320_axes_0 = const()[name = string("op_320_axes_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [1, 1, 256]> var_320_cast_fp16 = expand_dims(axes = var_320_axes_0, x = kv_cache_update_mask)[name = string("op_320_cast_fp16")];
+            tensor<int32, [1]> var_321_axes_0 = const()[name = string("op_321_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 1, 1, 256]> var_321_cast_fp16 = expand_dims(axes = var_321_axes_0, x = var_320_cast_fp16)[name = string("op_321_cast_fp16")];
+            fp16 var_173_to_fp16 = const()[name = string("op_173_to_fp16"), val = fp16(0x1p+0)];
+            tensor<fp16, [1, 1, 1, 256]> var_323_cast_fp16 = sub(x = var_173_to_fp16, y = var_321_cast_fp16)[name = string("op_323_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_324_cast_fp16 = mul(x = var_101_cast_fp16_0, y = var_323_cast_fp16)[name = string("op_324_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_325_cast_fp16 = mul(x = current_key_3_cast_fp16, y = var_321_cast_fp16)[name = string("op_325_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> key_3_cast_fp16 = add(x = var_324_cast_fp16, y = var_325_cast_fp16)[name = string("key_3_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_328_cast_fp16 = mul(x = var_132_cast_fp16_0, y = var_323_cast_fp16)[name = string("op_328_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_329_cast_fp16 = mul(x = current_value_1_cast_fp16, y = var_321_cast_fp16)[name = string("op_329_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> value_1_cast_fp16 = add(x = var_328_cast_fp16, y = var_329_cast_fp16)[name = string("value_1_cast_fp16")];
+            tensor<int32, [4]> var_333 = const()[name = string("op_333"), val = tensor<int32, [4]>([1, 8, 128, 256])];
+            tensor<fp16, [1, 8, 128, 256]> key_heads_1_cast_fp16 = reshape(shape = var_333, x = key_3_cast_fp16)[name = string("key_heads_1_cast_fp16")];
+            tensor<int32, [4]> var_335 = const()[name = string("op_335"), val = tensor<int32, [4]>([1, 8, 128, 256])];
+            tensor<fp16, [1, 8, 128, 256]> value_heads_1_cast_fp16 = reshape(shape = var_335, x = value_1_cast_fp16)[name = string("value_heads_1_cast_fp16")];
+            tensor<int32, [4]> var_338_begin_0 = const()[name = string("op_338_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_338_end_0 = const()[name = string("op_338_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_338_end_mask_0 = const()[name = string("op_338_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_338_cast_fp16 = slice_by_index(begin = var_338_begin_0, end = var_338_end_0, end_mask = var_338_end_mask_0, x = key_heads_1_cast_fp16)[name = string("op_338_cast_fp16")];
+            tensor<int32, [4]> var_342_begin_0 = const()[name = string("op_342_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_342_end_0 = const()[name = string("op_342_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_342_end_mask_0 = const()[name = string("op_342_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_342_cast_fp16 = slice_by_index(begin = var_342_begin_0, end = var_342_end_0, end_mask = var_342_end_mask_0, x = value_heads_1_cast_fp16)[name = string("op_342_cast_fp16")];
+            tensor<int32, [4]> var_354_begin_0 = const()[name = string("op_354_begin_0"), val = tensor<int32, [4]>([0, 1, 0, 0])];
+            tensor<int32, [4]> var_354_end_0 = const()[name = string("op_354_end_0"), val = tensor<int32, [4]>([1, 2, 128, 256])];
+            tensor<bool, [4]> var_354_end_mask_0 = const()[name = string("op_354_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_354_cast_fp16 = slice_by_index(begin = var_354_begin_0, end = var_354_end_0, end_mask = var_354_end_mask_0, x = key_heads_1_cast_fp16)[name = string("op_354_cast_fp16")];
+            tensor<int32, [4]> var_358_begin_0 = const()[name = string("op_358_begin_0"), val = tensor<int32, [4]>([0, 1, 0, 0])];
+            tensor<int32, [4]> var_358_end_0 = const()[name = string("op_358_end_0"), val = tensor<int32, [4]>([1, 2, 128, 256])];
+            tensor<bool, [4]> var_358_end_mask_0 = const()[name = string("op_358_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_358_cast_fp16 = slice_by_index(begin = var_358_begin_0, end = var_358_end_0, end_mask = var_358_end_mask_0, x = value_heads_1_cast_fp16)[name = string("op_358_cast_fp16")];
+            tensor<int32, [4]> var_370_begin_0 = const()[name = string("op_370_begin_0"), val = tensor<int32, [4]>([0, 2, 0, 0])];
+            tensor<int32, [4]> var_370_end_0 = const()[name = string("op_370_end_0"), val = tensor<int32, [4]>([1, 3, 128, 256])];
+            tensor<bool, [4]> var_370_end_mask_0 = const()[name = string("op_370_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_370_cast_fp16 = slice_by_index(begin = var_370_begin_0, end = var_370_end_0, end_mask = var_370_end_mask_0, x = key_heads_1_cast_fp16)[name = string("op_370_cast_fp16")];
+            tensor<int32, [4]> var_374_begin_0 = const()[name = string("op_374_begin_0"), val = tensor<int32, [4]>([0, 2, 0, 0])];
+            tensor<int32, [4]> var_374_end_0 = const()[name = string("op_374_end_0"), val = tensor<int32, [4]>([1, 3, 128, 256])];
+            tensor<bool, [4]> var_374_end_mask_0 = const()[name = string("op_374_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_374_cast_fp16 = slice_by_index(begin = var_374_begin_0, end = var_374_end_0, end_mask = var_374_end_mask_0, x = value_heads_1_cast_fp16)[name = string("op_374_cast_fp16")];
+            tensor<int32, [4]> var_386_begin_0 = const()[name = string("op_386_begin_0"), val = tensor<int32, [4]>([0, 3, 0, 0])];
+            tensor<int32, [4]> var_386_end_0 = const()[name = string("op_386_end_0"), val = tensor<int32, [4]>([1, 4, 128, 256])];
+            tensor<bool, [4]> var_386_end_mask_0 = const()[name = string("op_386_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_386_cast_fp16 = slice_by_index(begin = var_386_begin_0, end = var_386_end_0, end_mask = var_386_end_mask_0, x = key_heads_1_cast_fp16)[name = string("op_386_cast_fp16")];
+            tensor<int32, [4]> var_390_begin_0 = const()[name = string("op_390_begin_0"), val = tensor<int32, [4]>([0, 3, 0, 0])];
+            tensor<int32, [4]> var_390_end_0 = const()[name = string("op_390_end_0"), val = tensor<int32, [4]>([1, 4, 128, 256])];
+            tensor<bool, [4]> var_390_end_mask_0 = const()[name = string("op_390_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_390_cast_fp16 = slice_by_index(begin = var_390_begin_0, end = var_390_end_0, end_mask = var_390_end_mask_0, x = value_heads_1_cast_fp16)[name = string("op_390_cast_fp16")];
+            tensor<int32, [4]> var_402_begin_0 = const()[name = string("op_402_begin_0"), val = tensor<int32, [4]>([0, 4, 0, 0])];
+            tensor<int32, [4]> var_402_end_0 = const()[name = string("op_402_end_0"), val = tensor<int32, [4]>([1, 5, 128, 256])];
+            tensor<bool, [4]> var_402_end_mask_0 = const()[name = string("op_402_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_402_cast_fp16 = slice_by_index(begin = var_402_begin_0, end = var_402_end_0, end_mask = var_402_end_mask_0, x = key_heads_1_cast_fp16)[name = string("op_402_cast_fp16")];
+            tensor<int32, [4]> var_406_begin_0 = const()[name = string("op_406_begin_0"), val = tensor<int32, [4]>([0, 4, 0, 0])];
+            tensor<int32, [4]> var_406_end_0 = const()[name = string("op_406_end_0"), val = tensor<int32, [4]>([1, 5, 128, 256])];
+            tensor<bool, [4]> var_406_end_mask_0 = const()[name = string("op_406_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_406_cast_fp16 = slice_by_index(begin = var_406_begin_0, end = var_406_end_0, end_mask = var_406_end_mask_0, x = value_heads_1_cast_fp16)[name = string("op_406_cast_fp16")];
+            tensor<int32, [4]> var_418_begin_0 = const()[name = string("op_418_begin_0"), val = tensor<int32, [4]>([0, 5, 0, 0])];
+            tensor<int32, [4]> var_418_end_0 = const()[name = string("op_418_end_0"), val = tensor<int32, [4]>([1, 6, 128, 256])];
+            tensor<bool, [4]> var_418_end_mask_0 = const()[name = string("op_418_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_418_cast_fp16 = slice_by_index(begin = var_418_begin_0, end = var_418_end_0, end_mask = var_418_end_mask_0, x = key_heads_1_cast_fp16)[name = string("op_418_cast_fp16")];
+            tensor<int32, [4]> var_422_begin_0 = const()[name = string("op_422_begin_0"), val = tensor<int32, [4]>([0, 5, 0, 0])];
+            tensor<int32, [4]> var_422_end_0 = const()[name = string("op_422_end_0"), val = tensor<int32, [4]>([1, 6, 128, 256])];
+            tensor<bool, [4]> var_422_end_mask_0 = const()[name = string("op_422_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_422_cast_fp16 = slice_by_index(begin = var_422_begin_0, end = var_422_end_0, end_mask = var_422_end_mask_0, x = value_heads_1_cast_fp16)[name = string("op_422_cast_fp16")];
+            tensor<int32, [4]> var_434_begin_0 = const()[name = string("op_434_begin_0"), val = tensor<int32, [4]>([0, 6, 0, 0])];
+            tensor<int32, [4]> var_434_end_0 = const()[name = string("op_434_end_0"), val = tensor<int32, [4]>([1, 7, 128, 256])];
+            tensor<bool, [4]> var_434_end_mask_0 = const()[name = string("op_434_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_434_cast_fp16 = slice_by_index(begin = var_434_begin_0, end = var_434_end_0, end_mask = var_434_end_mask_0, x = key_heads_1_cast_fp16)[name = string("op_434_cast_fp16")];
+            tensor<int32, [4]> var_438_begin_0 = const()[name = string("op_438_begin_0"), val = tensor<int32, [4]>([0, 6, 0, 0])];
+            tensor<int32, [4]> var_438_end_0 = const()[name = string("op_438_end_0"), val = tensor<int32, [4]>([1, 7, 128, 256])];
+            tensor<bool, [4]> var_438_end_mask_0 = const()[name = string("op_438_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_438_cast_fp16 = slice_by_index(begin = var_438_begin_0, end = var_438_end_0, end_mask = var_438_end_mask_0, x = value_heads_1_cast_fp16)[name = string("op_438_cast_fp16")];
+            tensor<int32, [4]> var_450_begin_0 = const()[name = string("op_450_begin_0"), val = tensor<int32, [4]>([0, 7, 0, 0])];
+            tensor<int32, [4]> var_450_end_0 = const()[name = string("op_450_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_450_end_mask_0 = const()[name = string("op_450_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_450_cast_fp16 = slice_by_index(begin = var_450_begin_0, end = var_450_end_0, end_mask = var_450_end_mask_0, x = key_heads_1_cast_fp16)[name = string("op_450_cast_fp16")];
+            tensor<int32, [4]> var_454_begin_0 = const()[name = string("op_454_begin_0"), val = tensor<int32, [4]>([0, 7, 0, 0])];
+            tensor<int32, [4]> var_454_end_0 = const()[name = string("op_454_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_454_end_mask_0 = const()[name = string("op_454_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_454_cast_fp16 = slice_by_index(begin = var_454_begin_0, end = var_454_end_0, end_mask = var_454_end_mask_0, x = value_heads_1_cast_fp16)[name = string("op_454_cast_fp16")];
+            bool key_heads_3_interleave_0 = const()[name = string("key_heads_3_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 256]> key_heads_3_cast_fp16 = concat(axis = var_180, interleave = key_heads_3_interleave_0, values = (var_338_cast_fp16, var_338_cast_fp16, var_354_cast_fp16, var_354_cast_fp16, var_370_cast_fp16, var_370_cast_fp16, var_386_cast_fp16, var_386_cast_fp16, var_402_cast_fp16, var_402_cast_fp16, var_418_cast_fp16, var_418_cast_fp16, var_434_cast_fp16, var_434_cast_fp16, var_450_cast_fp16, var_450_cast_fp16))[name = string("key_heads_3_cast_fp16")];
+            bool value_heads_3_interleave_0 = const()[name = string("value_heads_3_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 256]> value_heads_3_cast_fp16 = concat(axis = var_180, interleave = value_heads_3_interleave_0, values = (var_342_cast_fp16, var_342_cast_fp16, var_358_cast_fp16, var_358_cast_fp16, var_374_cast_fp16, var_374_cast_fp16, var_390_cast_fp16, var_390_cast_fp16, var_406_cast_fp16, var_406_cast_fp16, var_422_cast_fp16, var_422_cast_fp16, var_438_cast_fp16, var_438_cast_fp16, var_454_cast_fp16, var_454_cast_fp16))[name = string("value_heads_3_cast_fp16")];
+            fp16 var_477_to_fp16 = const()[name = string("op_477_to_fp16"), val = fp16(0x1.6ap-4)];
+            tensor<fp16, [1, 16, 128, 1]> var_478_cast_fp16 = mul(x = mh_q_3_cast_fp16, y = var_477_to_fp16)[name = string("op_478_cast_fp16")];
+            bool mh_w_1_transpose_x_0 = const()[name = string("mh_w_1_transpose_x_0"), val = bool(true)];
+            bool mh_w_1_transpose_y_0 = const()[name = string("mh_w_1_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 1, 256]> mh_w_1_cast_fp16 = matmul(transpose_x = mh_w_1_transpose_x_0, transpose_y = mh_w_1_transpose_y_0, x = var_478_cast_fp16, y = key_heads_3_cast_fp16)[name = string("mh_w_1_cast_fp16")];
+            tensor<int32, [1]> var_486_axes_0 = const()[name = string("op_486_axes_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [1, 1, 256]> var_486_cast_fp16 = expand_dims(axes = var_486_axes_0, x = key_padding_mask)[name = string("op_486_cast_fp16")];
+            tensor<int32, [1]> var_487_axes_0 = const()[name = string("op_487_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 1, 1, 256]> var_487_cast_fp16 = expand_dims(axes = var_487_axes_0, x = var_486_cast_fp16)[name = string("op_487_cast_fp16")];
+            tensor<fp16, [1, 16, 1, 256]> mh_w_3_cast_fp16 = add(x = mh_w_1_cast_fp16, y = var_487_cast_fp16)[name = string("mh_w_3_cast_fp16")];
+            tensor<fp16, [1, 16, 1, 256]> var_490_cast_fp16 = softmax(axis = var_162, x = mh_w_3_cast_fp16)[name = string("op_490_cast_fp16")];
+            bool attn_1_transpose_x_0 = const()[name = string("attn_1_transpose_x_0"), val = bool(false)];
+            bool attn_1_transpose_y_0 = const()[name = string("attn_1_transpose_y_0"), val = bool(true)];
+            tensor<fp16, [1, 16, 128, 1]> attn_1_cast_fp16 = matmul(transpose_x = attn_1_transpose_x_0, transpose_y = attn_1_transpose_y_0, x = value_heads_3_cast_fp16, y = var_490_cast_fp16)[name = string("attn_1_cast_fp16")];
+            tensor<int32, [4]> var_495 = const()[name = string("op_495"), val = tensor<int32, [4]>([1, -1, 1, 1])];
+            tensor<fp16, [1, 2048, 1, 1]> input_1_cast_fp16 = reshape(shape = var_495, x = attn_1_cast_fp16)[name = string("input_1_cast_fp16")];
+            string obj_11_pad_type_0 = const()[name = string("obj_11_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> obj_11_strides_0 = const()[name = string("obj_11_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> obj_11_pad_0 = const()[name = string("obj_11_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> obj_11_dilations_0 = const()[name = string("obj_11_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 obj_11_groups_0 = const()[name = string("obj_11_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 2048, 1, 1]> layers_0_self_attn_o_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(4272512))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(6369728))))[name = string("layers_0_self_attn_o_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> obj_11_cast_fp16 = conv(bias = layers_0_self_attn_v_proj_bias_to_fp16, dilations = obj_11_dilations_0, groups = obj_11_groups_0, pad = obj_11_pad_0, pad_type = obj_11_pad_type_0, strides = obj_11_strides_0, weight = layers_0_self_attn_o_proj_weight_to_fp16_palettized, x = input_1_cast_fp16)[name = string("obj_11_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> inputs_5_cast_fp16 = add(x = input_embeds, y = obj_11_cast_fp16)[name = string("inputs_5_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> inputs_sq_7_cast_fp16 = mul(x = inputs_5_cast_fp16, y = inputs_5_cast_fp16)[name = string("inputs_sq_7_cast_fp16")];
+            tensor<int32, [1]> variance_7_axes_0 = const()[name = string("variance_7_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_7_keep_dims_0 = const()[name = string("variance_7_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 1, 1, 1]> variance_7_cast_fp16 = reduce_mean(axes = variance_7_axes_0, keep_dims = variance_7_keep_dims_0, x = inputs_sq_7_cast_fp16)[name = string("variance_7_cast_fp16")];
+            fp16 var_513_to_fp16 = const()[name = string("op_513_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 1, 1]> var_514_cast_fp16 = add(x = variance_7_cast_fp16, y = var_513_to_fp16)[name = string("op_514_cast_fp16")];
+            fp32 var_515_epsilon_0 = const()[name = string("op_515_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 1, 1, 1]> var_515_cast_fp16 = rsqrt(epsilon = var_515_epsilon_0, x = var_514_cast_fp16)[name = string("op_515_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> hidden_states_7_cast_fp16 = mul(x = inputs_5_cast_fp16, y = var_515_cast_fp16)[name = string("hidden_states_7_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> w_7_to_fp16 = const()[name = string("w_7_to_fp16"), val = tensor<fp16, [1, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(6370304)))];
+            tensor<fp16, [1, 1024, 1, 1]> input_3_cast_fp16 = mul(x = w_7_to_fp16, y = hidden_states_7_cast_fp16)[name = string("input_3_cast_fp16")];
+            string input_5_pad_type_0 = const()[name = string("input_5_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> input_5_strides_0 = const()[name = string("input_5_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> input_5_pad_0 = const()[name = string("input_5_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> input_5_dilations_0 = const()[name = string("input_5_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 input_5_groups_0 = const()[name = string("input_5_groups_0"), val = int32(1)];
+            tensor<fp16, [3072, 1024, 1, 1]> layers_0_mlp_gate_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [3072, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(6372416))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(9518208))))[name = string("layers_0_mlp_gate_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 3072, 1, 1]> input_5_cast_fp16 = conv(dilations = input_5_dilations_0, groups = input_5_groups_0, pad = input_5_pad_0, pad_type = input_5_pad_type_0, strides = input_5_strides_0, weight = layers_0_mlp_gate_proj_weight_to_fp16_palettized, x = input_3_cast_fp16)[name = string("input_5_cast_fp16")];
+            tensor<fp16, [1, 3072, 1, 1]> var_529_cast_fp16 = silu(x = input_5_cast_fp16)[name = string("op_529_cast_fp16")];
+            string var_535_pad_type_0 = const()[name = string("op_535_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_535_strides_0 = const()[name = string("op_535_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_535_pad_0 = const()[name = string("op_535_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_535_dilations_0 = const()[name = string("op_535_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_535_groups_0 = const()[name = string("op_535_groups_0"), val = int32(1)];
+            tensor<fp16, [3072, 1024, 1, 1]> layers_0_mlp_up_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [3072, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(9518784))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(12664576))))[name = string("layers_0_mlp_up_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 3072, 1, 1]> var_535_cast_fp16 = conv(dilations = var_535_dilations_0, groups = var_535_groups_0, pad = var_535_pad_0, pad_type = var_535_pad_type_0, strides = var_535_strides_0, weight = layers_0_mlp_up_proj_weight_to_fp16_palettized, x = input_3_cast_fp16)[name = string("op_535_cast_fp16")];
+            tensor<fp16, [1, 3072, 1, 1]> input_7_cast_fp16 = mul(x = var_529_cast_fp16, y = var_535_cast_fp16)[name = string("input_7_cast_fp16")];
+            string hidden_states_9_pad_type_0 = const()[name = string("hidden_states_9_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> hidden_states_9_strides_0 = const()[name = string("hidden_states_9_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> hidden_states_9_pad_0 = const()[name = string("hidden_states_9_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> hidden_states_9_dilations_0 = const()[name = string("hidden_states_9_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 hidden_states_9_groups_0 = const()[name = string("hidden_states_9_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 3072, 1, 1]> layers_0_mlp_down_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 3072, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(12665152))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(15810944))))[name = string("layers_0_mlp_down_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> hidden_states_9_cast_fp16 = conv(dilations = hidden_states_9_dilations_0, groups = hidden_states_9_groups_0, pad = hidden_states_9_pad_0, pad_type = hidden_states_9_pad_type_0, strides = hidden_states_9_strides_0, weight = layers_0_mlp_down_proj_weight_to_fp16_palettized, x = input_7_cast_fp16)[name = string("hidden_states_9_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> inputs_7_cast_fp16 = add(x = inputs_5_cast_fp16, y = hidden_states_9_cast_fp16)[name = string("inputs_7_cast_fp16")];
+            int32 var_549 = const()[name = string("op_549"), val = int32(3)];
+            int32 var_559 = const()[name = string("op_559"), val = int32(-2)];
+            int32 var_567 = const()[name = string("op_567"), val = int32(1)];
+            tensor<fp16, [1, 1024, 1, 1]> inputs_sq_9_cast_fp16 = mul(x = inputs_7_cast_fp16, y = inputs_7_cast_fp16)[name = string("inputs_sq_9_cast_fp16")];
+            tensor<int32, [1]> variance_9_axes_0 = const()[name = string("variance_9_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_9_keep_dims_0 = const()[name = string("variance_9_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 1, 1, 1]> variance_9_cast_fp16 = reduce_mean(axes = variance_9_axes_0, keep_dims = variance_9_keep_dims_0, x = inputs_sq_9_cast_fp16)[name = string("variance_9_cast_fp16")];
+            fp16 var_579_to_fp16 = const()[name = string("op_579_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 1, 1]> var_580_cast_fp16 = add(x = variance_9_cast_fp16, y = var_579_to_fp16)[name = string("op_580_cast_fp16")];
+            fp32 var_581_epsilon_0 = const()[name = string("op_581_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 1, 1, 1]> var_581_cast_fp16 = rsqrt(epsilon = var_581_epsilon_0, x = var_580_cast_fp16)[name = string("op_581_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> hidden_states_11_cast_fp16 = mul(x = inputs_7_cast_fp16, y = var_581_cast_fp16)[name = string("hidden_states_11_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> w_9_to_fp16 = const()[name = string("w_9_to_fp16"), val = tensor<fp16, [1, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(15811520)))];
+            tensor<fp16, [1, 1024, 1, 1]> obj_13_cast_fp16 = mul(x = w_9_to_fp16, y = hidden_states_11_cast_fp16)[name = string("obj_13_cast_fp16")];
+            string query_7_pad_type_0 = const()[name = string("query_7_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> query_7_strides_0 = const()[name = string("query_7_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> query_7_pad_0 = const()[name = string("query_7_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> query_7_dilations_0 = const()[name = string("query_7_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 query_7_groups_0 = const()[name = string("query_7_groups_0"), val = int32(1)];
+            tensor<fp16, [2048, 1024, 1, 1]> layers_1_self_attn_q_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [2048, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(15813632))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(17910848))))[name = string("layers_1_self_attn_q_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1, 1]> query_7_cast_fp16 = conv(bias = layers_0_self_attn_q_proj_bias_to_fp16, dilations = query_7_dilations_0, groups = query_7_groups_0, pad = query_7_pad_0, pad_type = query_7_pad_type_0, strides = query_7_strides_0, weight = layers_1_self_attn_q_proj_weight_to_fp16_palettized, x = obj_13_cast_fp16)[name = string("query_7_cast_fp16")];
+            string current_key_5_pad_type_0 = const()[name = string("current_key_5_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> current_key_5_strides_0 = const()[name = string("current_key_5_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> current_key_5_pad_0 = const()[name = string("current_key_5_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> current_key_5_dilations_0 = const()[name = string("current_key_5_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 current_key_5_groups_0 = const()[name = string("current_key_5_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_1_self_attn_k_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(17911424))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(18960064))))[name = string("layers_1_self_attn_k_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> current_key_5_cast_fp16 = conv(dilations = current_key_5_dilations_0, groups = current_key_5_groups_0, pad = current_key_5_pad_0, pad_type = current_key_5_pad_type_0, strides = current_key_5_strides_0, weight = layers_1_self_attn_k_proj_weight_to_fp16_palettized, x = obj_13_cast_fp16)[name = string("current_key_5_cast_fp16")];
+            string current_value_3_pad_type_0 = const()[name = string("current_value_3_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> current_value_3_strides_0 = const()[name = string("current_value_3_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> current_value_3_pad_0 = const()[name = string("current_value_3_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> current_value_3_dilations_0 = const()[name = string("current_value_3_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 current_value_3_groups_0 = const()[name = string("current_value_3_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_1_self_attn_v_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(18960640))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(20009280))))[name = string("layers_1_self_attn_v_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> current_value_3_cast_fp16 = conv(bias = layers_0_self_attn_v_proj_bias_to_fp16, dilations = current_value_3_dilations_0, groups = current_value_3_groups_0, pad = current_value_3_pad_0, pad_type = current_value_3_pad_type_0, strides = current_value_3_strides_0, weight = layers_1_self_attn_v_proj_weight_to_fp16_palettized, x = obj_13_cast_fp16)[name = string("current_value_3_cast_fp16")];
+            tensor<int32, [4]> var_618 = const()[name = string("op_618"), val = tensor<int32, [4]>([16, 128, 1, 1])];
+            tensor<fp16, [16, 128, 1, 1]> inputs_9_cast_fp16 = reshape(shape = var_618, x = query_7_cast_fp16)[name = string("inputs_9_cast_fp16")];
+            tensor<fp16, [16, 128, 1, 1]> inputs_sq_11_cast_fp16 = mul(x = inputs_9_cast_fp16, y = inputs_9_cast_fp16)[name = string("inputs_sq_11_cast_fp16")];
+            tensor<int32, [1]> variance_11_axes_0 = const()[name = string("variance_11_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_11_keep_dims_0 = const()[name = string("variance_11_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [16, 1, 1, 1]> variance_11_cast_fp16 = reduce_mean(axes = variance_11_axes_0, keep_dims = variance_11_keep_dims_0, x = inputs_sq_11_cast_fp16)[name = string("variance_11_cast_fp16")];
+            fp16 var_624_to_fp16 = const()[name = string("op_624_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [16, 1, 1, 1]> var_625_cast_fp16 = add(x = variance_11_cast_fp16, y = var_624_to_fp16)[name = string("op_625_cast_fp16")];
+            fp32 var_626_epsilon_0 = const()[name = string("op_626_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [16, 1, 1, 1]> var_626_cast_fp16 = rsqrt(epsilon = var_626_epsilon_0, x = var_625_cast_fp16)[name = string("op_626_cast_fp16")];
+            tensor<fp16, [16, 128, 1, 1]> hidden_states_13_cast_fp16 = mul(x = inputs_9_cast_fp16, y = var_626_cast_fp16)[name = string("hidden_states_13_cast_fp16")];
+            tensor<fp16, [1, 128, 1, 1]> w_11_to_fp16 = const()[name = string("w_11_to_fp16"), val = tensor<fp16, [1, 128, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(20009856)))];
+            tensor<fp16, [16, 128, 1, 1]> query_normed_3_cast_fp16 = mul(x = w_11_to_fp16, y = hidden_states_13_cast_fp16)[name = string("query_normed_3_cast_fp16")];
+            tensor<int32, [4]> var_634 = const()[name = string("op_634"), val = tensor<int32, [4]>([8, 128, 1, 1])];
+            tensor<fp16, [8, 128, 1, 1]> inputs_11_cast_fp16 = reshape(shape = var_634, x = current_key_5_cast_fp16)[name = string("inputs_11_cast_fp16")];
+            tensor<fp16, [8, 128, 1, 1]> inputs_sq_13_cast_fp16 = mul(x = inputs_11_cast_fp16, y = inputs_11_cast_fp16)[name = string("inputs_sq_13_cast_fp16")];
+            tensor<int32, [1]> variance_13_axes_0 = const()[name = string("variance_13_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_13_keep_dims_0 = const()[name = string("variance_13_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [8, 1, 1, 1]> variance_13_cast_fp16 = reduce_mean(axes = variance_13_axes_0, keep_dims = variance_13_keep_dims_0, x = inputs_sq_13_cast_fp16)[name = string("variance_13_cast_fp16")];
+            fp16 var_640_to_fp16 = const()[name = string("op_640_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [8, 1, 1, 1]> var_641_cast_fp16 = add(x = variance_13_cast_fp16, y = var_640_to_fp16)[name = string("op_641_cast_fp16")];
+            fp32 var_642_epsilon_0 = const()[name = string("op_642_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [8, 1, 1, 1]> var_642_cast_fp16 = rsqrt(epsilon = var_642_epsilon_0, x = var_641_cast_fp16)[name = string("op_642_cast_fp16")];
+            tensor<fp16, [8, 128, 1, 1]> hidden_states_15_cast_fp16 = mul(x = inputs_11_cast_fp16, y = var_642_cast_fp16)[name = string("hidden_states_15_cast_fp16")];
+            tensor<fp16, [1, 128, 1, 1]> w_13_to_fp16 = const()[name = string("w_13_to_fp16"), val = tensor<fp16, [1, 128, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(20010176)))];
+            tensor<fp16, [8, 128, 1, 1]> current_key_normed_3_cast_fp16 = mul(x = w_13_to_fp16, y = hidden_states_15_cast_fp16)[name = string("current_key_normed_3_cast_fp16")];
+            tensor<int32, [4]> var_660 = const()[name = string("op_660"), val = tensor<int32, [4]>([1, 16, 128, -1])];
+            tensor<fp16, [1, 16, 128, 1]> mh_q_7_cast_fp16 = reshape(shape = var_660, x = query_normed_3_cast_fp16)[name = string("mh_q_7_cast_fp16")];
+            tensor<int32, [4]> var_662 = const()[name = string("op_662"), val = tensor<int32, [4]>([1, 8, 128, -1])];
+            tensor<fp16, [1, 8, 128, 1]> mh_k_5_cast_fp16 = reshape(shape = var_662, x = current_key_normed_3_cast_fp16)[name = string("mh_k_5_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> var_666_cast_fp16 = mul(x = mh_q_7_cast_fp16, y = cos_1_cast_fp16)[name = string("op_666_cast_fp16")];
+            tensor<int32, [4]> var_671_begin_0 = const()[name = string("op_671_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_671_end_0 = const()[name = string("op_671_end_0"), val = tensor<int32, [4]>([1, 16, 64, 1])];
+            tensor<bool, [4]> var_671_end_mask_0 = const()[name = string("op_671_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 16, 64, 1]> var_671_cast_fp16 = slice_by_index(begin = var_671_begin_0, end = var_671_end_0, end_mask = var_671_end_mask_0, x = mh_q_7_cast_fp16)[name = string("op_671_cast_fp16")];
+            tensor<int32, [4]> var_677_begin_0 = const()[name = string("op_677_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_677_end_0 = const()[name = string("op_677_end_0"), val = tensor<int32, [4]>([1, 16, 128, 1])];
+            tensor<bool, [4]> var_677_end_mask_0 = const()[name = string("op_677_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 16, 64, 1]> var_677_cast_fp16 = slice_by_index(begin = var_677_begin_0, end = var_677_end_0, end_mask = var_677_end_mask_0, x = mh_q_7_cast_fp16)[name = string("op_677_cast_fp16")];
+            fp16 const_40_promoted_to_fp16 = const()[name = string("const_40_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 16, 64, 1]> var_679_cast_fp16 = mul(x = var_677_cast_fp16, y = const_40_promoted_to_fp16)[name = string("op_679_cast_fp16")];
+            bool var_681_interleave_0 = const()[name = string("op_681_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 1]> var_681_cast_fp16 = concat(axis = var_559, interleave = var_681_interleave_0, values = (var_679_cast_fp16, var_671_cast_fp16))[name = string("op_681_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> var_682_cast_fp16 = mul(x = var_681_cast_fp16, y = sin_1_cast_fp16)[name = string("op_682_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> mh_q_9_cast_fp16 = add(x = var_666_cast_fp16, y = var_682_cast_fp16)[name = string("mh_q_9_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> var_684_cast_fp16 = mul(x = mh_k_5_cast_fp16, y = cos_1_cast_fp16)[name = string("op_684_cast_fp16")];
+            tensor<int32, [4]> var_689_begin_0 = const()[name = string("op_689_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_689_end_0 = const()[name = string("op_689_end_0"), val = tensor<int32, [4]>([1, 8, 64, 1])];
+            tensor<bool, [4]> var_689_end_mask_0 = const()[name = string("op_689_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 8, 64, 1]> var_689_cast_fp16 = slice_by_index(begin = var_689_begin_0, end = var_689_end_0, end_mask = var_689_end_mask_0, x = mh_k_5_cast_fp16)[name = string("op_689_cast_fp16")];
+            tensor<int32, [4]> var_695_begin_0 = const()[name = string("op_695_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_695_end_0 = const()[name = string("op_695_end_0"), val = tensor<int32, [4]>([1, 8, 128, 1])];
+            tensor<bool, [4]> var_695_end_mask_0 = const()[name = string("op_695_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 8, 64, 1]> var_695_cast_fp16 = slice_by_index(begin = var_695_begin_0, end = var_695_end_0, end_mask = var_695_end_mask_0, x = mh_k_5_cast_fp16)[name = string("op_695_cast_fp16")];
+            fp16 const_43_promoted_to_fp16 = const()[name = string("const_43_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 64, 1]> var_697_cast_fp16 = mul(x = var_695_cast_fp16, y = const_43_promoted_to_fp16)[name = string("op_697_cast_fp16")];
+            bool var_699_interleave_0 = const()[name = string("op_699_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 128, 1]> var_699_cast_fp16 = concat(axis = var_559, interleave = var_699_interleave_0, values = (var_697_cast_fp16, var_689_cast_fp16))[name = string("op_699_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> var_700_cast_fp16 = mul(x = var_699_cast_fp16, y = sin_1_cast_fp16)[name = string("op_700_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> mh_k_7_cast_fp16 = add(x = var_684_cast_fp16, y = var_700_cast_fp16)[name = string("mh_k_7_cast_fp16")];
+            tensor<int32, [4]> var_704 = const()[name = string("op_704"), val = tensor<int32, [4]>([1, 1024, 1, 1])];
+            tensor<fp16, [1, 1024, 1, 1]> current_key_7_cast_fp16 = reshape(shape = var_704, x = mh_k_7_cast_fp16)[name = string("current_key_7_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_711_cast_fp16 = mul(x = var_101_cast_fp16_1, y = var_323_cast_fp16)[name = string("op_711_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_712_cast_fp16 = mul(x = current_key_7_cast_fp16, y = var_321_cast_fp16)[name = string("op_712_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> key_9_cast_fp16 = add(x = var_711_cast_fp16, y = var_712_cast_fp16)[name = string("key_9_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_715_cast_fp16 = mul(x = var_132_cast_fp16_1, y = var_323_cast_fp16)[name = string("op_715_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_716_cast_fp16 = mul(x = current_value_3_cast_fp16, y = var_321_cast_fp16)[name = string("op_716_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> value_5_cast_fp16 = add(x = var_715_cast_fp16, y = var_716_cast_fp16)[name = string("value_5_cast_fp16")];
+            tensor<int32, [4]> var_720 = const()[name = string("op_720"), val = tensor<int32, [4]>([1, 8, 128, 256])];
+            tensor<fp16, [1, 8, 128, 256]> key_heads_5_cast_fp16 = reshape(shape = var_720, x = key_9_cast_fp16)[name = string("key_heads_5_cast_fp16")];
+            tensor<int32, [4]> var_722 = const()[name = string("op_722"), val = tensor<int32, [4]>([1, 8, 128, 256])];
+            tensor<fp16, [1, 8, 128, 256]> value_heads_5_cast_fp16 = reshape(shape = var_722, x = value_5_cast_fp16)[name = string("value_heads_5_cast_fp16")];
+            tensor<int32, [4]> var_725_begin_0 = const()[name = string("op_725_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_725_end_0 = const()[name = string("op_725_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_725_end_mask_0 = const()[name = string("op_725_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_725_cast_fp16 = slice_by_index(begin = var_725_begin_0, end = var_725_end_0, end_mask = var_725_end_mask_0, x = key_heads_5_cast_fp16)[name = string("op_725_cast_fp16")];
+            tensor<int32, [4]> var_729_begin_0 = const()[name = string("op_729_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_729_end_0 = const()[name = string("op_729_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_729_end_mask_0 = const()[name = string("op_729_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_729_cast_fp16 = slice_by_index(begin = var_729_begin_0, end = var_729_end_0, end_mask = var_729_end_mask_0, x = value_heads_5_cast_fp16)[name = string("op_729_cast_fp16")];
+            tensor<int32, [4]> var_741_begin_0 = const()[name = string("op_741_begin_0"), val = tensor<int32, [4]>([0, 1, 0, 0])];
+            tensor<int32, [4]> var_741_end_0 = const()[name = string("op_741_end_0"), val = tensor<int32, [4]>([1, 2, 128, 256])];
+            tensor<bool, [4]> var_741_end_mask_0 = const()[name = string("op_741_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_741_cast_fp16 = slice_by_index(begin = var_741_begin_0, end = var_741_end_0, end_mask = var_741_end_mask_0, x = key_heads_5_cast_fp16)[name = string("op_741_cast_fp16")];
+            tensor<int32, [4]> var_745_begin_0 = const()[name = string("op_745_begin_0"), val = tensor<int32, [4]>([0, 1, 0, 0])];
+            tensor<int32, [4]> var_745_end_0 = const()[name = string("op_745_end_0"), val = tensor<int32, [4]>([1, 2, 128, 256])];
+            tensor<bool, [4]> var_745_end_mask_0 = const()[name = string("op_745_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_745_cast_fp16 = slice_by_index(begin = var_745_begin_0, end = var_745_end_0, end_mask = var_745_end_mask_0, x = value_heads_5_cast_fp16)[name = string("op_745_cast_fp16")];
+            tensor<int32, [4]> var_757_begin_0 = const()[name = string("op_757_begin_0"), val = tensor<int32, [4]>([0, 2, 0, 0])];
+            tensor<int32, [4]> var_757_end_0 = const()[name = string("op_757_end_0"), val = tensor<int32, [4]>([1, 3, 128, 256])];
+            tensor<bool, [4]> var_757_end_mask_0 = const()[name = string("op_757_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_757_cast_fp16 = slice_by_index(begin = var_757_begin_0, end = var_757_end_0, end_mask = var_757_end_mask_0, x = key_heads_5_cast_fp16)[name = string("op_757_cast_fp16")];
+            tensor<int32, [4]> var_761_begin_0 = const()[name = string("op_761_begin_0"), val = tensor<int32, [4]>([0, 2, 0, 0])];
+            tensor<int32, [4]> var_761_end_0 = const()[name = string("op_761_end_0"), val = tensor<int32, [4]>([1, 3, 128, 256])];
+            tensor<bool, [4]> var_761_end_mask_0 = const()[name = string("op_761_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_761_cast_fp16 = slice_by_index(begin = var_761_begin_0, end = var_761_end_0, end_mask = var_761_end_mask_0, x = value_heads_5_cast_fp16)[name = string("op_761_cast_fp16")];
+            tensor<int32, [4]> var_773_begin_0 = const()[name = string("op_773_begin_0"), val = tensor<int32, [4]>([0, 3, 0, 0])];
+            tensor<int32, [4]> var_773_end_0 = const()[name = string("op_773_end_0"), val = tensor<int32, [4]>([1, 4, 128, 256])];
+            tensor<bool, [4]> var_773_end_mask_0 = const()[name = string("op_773_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_773_cast_fp16 = slice_by_index(begin = var_773_begin_0, end = var_773_end_0, end_mask = var_773_end_mask_0, x = key_heads_5_cast_fp16)[name = string("op_773_cast_fp16")];
+            tensor<int32, [4]> var_777_begin_0 = const()[name = string("op_777_begin_0"), val = tensor<int32, [4]>([0, 3, 0, 0])];
+            tensor<int32, [4]> var_777_end_0 = const()[name = string("op_777_end_0"), val = tensor<int32, [4]>([1, 4, 128, 256])];
+            tensor<bool, [4]> var_777_end_mask_0 = const()[name = string("op_777_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_777_cast_fp16 = slice_by_index(begin = var_777_begin_0, end = var_777_end_0, end_mask = var_777_end_mask_0, x = value_heads_5_cast_fp16)[name = string("op_777_cast_fp16")];
+            tensor<int32, [4]> var_789_begin_0 = const()[name = string("op_789_begin_0"), val = tensor<int32, [4]>([0, 4, 0, 0])];
+            tensor<int32, [4]> var_789_end_0 = const()[name = string("op_789_end_0"), val = tensor<int32, [4]>([1, 5, 128, 256])];
+            tensor<bool, [4]> var_789_end_mask_0 = const()[name = string("op_789_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_789_cast_fp16 = slice_by_index(begin = var_789_begin_0, end = var_789_end_0, end_mask = var_789_end_mask_0, x = key_heads_5_cast_fp16)[name = string("op_789_cast_fp16")];
+            tensor<int32, [4]> var_793_begin_0 = const()[name = string("op_793_begin_0"), val = tensor<int32, [4]>([0, 4, 0, 0])];
+            tensor<int32, [4]> var_793_end_0 = const()[name = string("op_793_end_0"), val = tensor<int32, [4]>([1, 5, 128, 256])];
+            tensor<bool, [4]> var_793_end_mask_0 = const()[name = string("op_793_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_793_cast_fp16 = slice_by_index(begin = var_793_begin_0, end = var_793_end_0, end_mask = var_793_end_mask_0, x = value_heads_5_cast_fp16)[name = string("op_793_cast_fp16")];
+            tensor<int32, [4]> var_805_begin_0 = const()[name = string("op_805_begin_0"), val = tensor<int32, [4]>([0, 5, 0, 0])];
+            tensor<int32, [4]> var_805_end_0 = const()[name = string("op_805_end_0"), val = tensor<int32, [4]>([1, 6, 128, 256])];
+            tensor<bool, [4]> var_805_end_mask_0 = const()[name = string("op_805_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_805_cast_fp16 = slice_by_index(begin = var_805_begin_0, end = var_805_end_0, end_mask = var_805_end_mask_0, x = key_heads_5_cast_fp16)[name = string("op_805_cast_fp16")];
+            tensor<int32, [4]> var_809_begin_0 = const()[name = string("op_809_begin_0"), val = tensor<int32, [4]>([0, 5, 0, 0])];
+            tensor<int32, [4]> var_809_end_0 = const()[name = string("op_809_end_0"), val = tensor<int32, [4]>([1, 6, 128, 256])];
+            tensor<bool, [4]> var_809_end_mask_0 = const()[name = string("op_809_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_809_cast_fp16 = slice_by_index(begin = var_809_begin_0, end = var_809_end_0, end_mask = var_809_end_mask_0, x = value_heads_5_cast_fp16)[name = string("op_809_cast_fp16")];
+            tensor<int32, [4]> var_821_begin_0 = const()[name = string("op_821_begin_0"), val = tensor<int32, [4]>([0, 6, 0, 0])];
+            tensor<int32, [4]> var_821_end_0 = const()[name = string("op_821_end_0"), val = tensor<int32, [4]>([1, 7, 128, 256])];
+            tensor<bool, [4]> var_821_end_mask_0 = const()[name = string("op_821_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_821_cast_fp16 = slice_by_index(begin = var_821_begin_0, end = var_821_end_0, end_mask = var_821_end_mask_0, x = key_heads_5_cast_fp16)[name = string("op_821_cast_fp16")];
+            tensor<int32, [4]> var_825_begin_0 = const()[name = string("op_825_begin_0"), val = tensor<int32, [4]>([0, 6, 0, 0])];
+            tensor<int32, [4]> var_825_end_0 = const()[name = string("op_825_end_0"), val = tensor<int32, [4]>([1, 7, 128, 256])];
+            tensor<bool, [4]> var_825_end_mask_0 = const()[name = string("op_825_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_825_cast_fp16 = slice_by_index(begin = var_825_begin_0, end = var_825_end_0, end_mask = var_825_end_mask_0, x = value_heads_5_cast_fp16)[name = string("op_825_cast_fp16")];
+            tensor<int32, [4]> var_837_begin_0 = const()[name = string("op_837_begin_0"), val = tensor<int32, [4]>([0, 7, 0, 0])];
+            tensor<int32, [4]> var_837_end_0 = const()[name = string("op_837_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_837_end_mask_0 = const()[name = string("op_837_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_837_cast_fp16 = slice_by_index(begin = var_837_begin_0, end = var_837_end_0, end_mask = var_837_end_mask_0, x = key_heads_5_cast_fp16)[name = string("op_837_cast_fp16")];
+            tensor<int32, [4]> var_841_begin_0 = const()[name = string("op_841_begin_0"), val = tensor<int32, [4]>([0, 7, 0, 0])];
+            tensor<int32, [4]> var_841_end_0 = const()[name = string("op_841_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_841_end_mask_0 = const()[name = string("op_841_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_841_cast_fp16 = slice_by_index(begin = var_841_begin_0, end = var_841_end_0, end_mask = var_841_end_mask_0, x = value_heads_5_cast_fp16)[name = string("op_841_cast_fp16")];
+            bool key_heads_7_interleave_0 = const()[name = string("key_heads_7_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 256]> key_heads_7_cast_fp16 = concat(axis = var_567, interleave = key_heads_7_interleave_0, values = (var_725_cast_fp16, var_725_cast_fp16, var_741_cast_fp16, var_741_cast_fp16, var_757_cast_fp16, var_757_cast_fp16, var_773_cast_fp16, var_773_cast_fp16, var_789_cast_fp16, var_789_cast_fp16, var_805_cast_fp16, var_805_cast_fp16, var_821_cast_fp16, var_821_cast_fp16, var_837_cast_fp16, var_837_cast_fp16))[name = string("key_heads_7_cast_fp16")];
+            bool value_heads_7_interleave_0 = const()[name = string("value_heads_7_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 256]> value_heads_7_cast_fp16 = concat(axis = var_567, interleave = value_heads_7_interleave_0, values = (var_729_cast_fp16, var_729_cast_fp16, var_745_cast_fp16, var_745_cast_fp16, var_761_cast_fp16, var_761_cast_fp16, var_777_cast_fp16, var_777_cast_fp16, var_793_cast_fp16, var_793_cast_fp16, var_809_cast_fp16, var_809_cast_fp16, var_825_cast_fp16, var_825_cast_fp16, var_841_cast_fp16, var_841_cast_fp16))[name = string("value_heads_7_cast_fp16")];
+            fp16 var_864_to_fp16 = const()[name = string("op_864_to_fp16"), val = fp16(0x1.6ap-4)];
+            tensor<fp16, [1, 16, 128, 1]> var_865_cast_fp16 = mul(x = mh_q_9_cast_fp16, y = var_864_to_fp16)[name = string("op_865_cast_fp16")];
+            bool mh_w_5_transpose_x_0 = const()[name = string("mh_w_5_transpose_x_0"), val = bool(true)];
+            bool mh_w_5_transpose_y_0 = const()[name = string("mh_w_5_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 1, 256]> mh_w_5_cast_fp16 = matmul(transpose_x = mh_w_5_transpose_x_0, transpose_y = mh_w_5_transpose_y_0, x = var_865_cast_fp16, y = key_heads_7_cast_fp16)[name = string("mh_w_5_cast_fp16")];
+            tensor<fp16, [1, 16, 1, 256]> mh_w_7_cast_fp16 = add(x = mh_w_5_cast_fp16, y = var_487_cast_fp16)[name = string("mh_w_7_cast_fp16")];
+            tensor<fp16, [1, 16, 1, 256]> var_877_cast_fp16 = softmax(axis = var_549, x = mh_w_7_cast_fp16)[name = string("op_877_cast_fp16")];
+            bool attn_3_transpose_x_0 = const()[name = string("attn_3_transpose_x_0"), val = bool(false)];
+            bool attn_3_transpose_y_0 = const()[name = string("attn_3_transpose_y_0"), val = bool(true)];
+            tensor<fp16, [1, 16, 128, 1]> attn_3_cast_fp16 = matmul(transpose_x = attn_3_transpose_x_0, transpose_y = attn_3_transpose_y_0, x = value_heads_7_cast_fp16, y = var_877_cast_fp16)[name = string("attn_3_cast_fp16")];
+            tensor<int32, [4]> var_882 = const()[name = string("op_882"), val = tensor<int32, [4]>([1, -1, 1, 1])];
+            tensor<fp16, [1, 2048, 1, 1]> input_9_cast_fp16 = reshape(shape = var_882, x = attn_3_cast_fp16)[name = string("input_9_cast_fp16")];
+            string obj_19_pad_type_0 = const()[name = string("obj_19_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> obj_19_strides_0 = const()[name = string("obj_19_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> obj_19_pad_0 = const()[name = string("obj_19_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> obj_19_dilations_0 = const()[name = string("obj_19_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 obj_19_groups_0 = const()[name = string("obj_19_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 2048, 1, 1]> layers_1_self_attn_o_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(20010496))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(22107712))))[name = string("layers_1_self_attn_o_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> obj_19_cast_fp16 = conv(bias = layers_0_self_attn_v_proj_bias_to_fp16, dilations = obj_19_dilations_0, groups = obj_19_groups_0, pad = obj_19_pad_0, pad_type = obj_19_pad_type_0, strides = obj_19_strides_0, weight = layers_1_self_attn_o_proj_weight_to_fp16_palettized, x = input_9_cast_fp16)[name = string("obj_19_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> inputs_13_cast_fp16 = add(x = inputs_7_cast_fp16, y = obj_19_cast_fp16)[name = string("inputs_13_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> inputs_sq_15_cast_fp16 = mul(x = inputs_13_cast_fp16, y = inputs_13_cast_fp16)[name = string("inputs_sq_15_cast_fp16")];
+            tensor<int32, [1]> variance_15_axes_0 = const()[name = string("variance_15_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_15_keep_dims_0 = const()[name = string("variance_15_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 1, 1, 1]> variance_15_cast_fp16 = reduce_mean(axes = variance_15_axes_0, keep_dims = variance_15_keep_dims_0, x = inputs_sq_15_cast_fp16)[name = string("variance_15_cast_fp16")];
+            fp16 var_900_to_fp16 = const()[name = string("op_900_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 1, 1]> var_901_cast_fp16 = add(x = variance_15_cast_fp16, y = var_900_to_fp16)[name = string("op_901_cast_fp16")];
+            fp32 var_902_epsilon_0 = const()[name = string("op_902_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 1, 1, 1]> var_902_cast_fp16 = rsqrt(epsilon = var_902_epsilon_0, x = var_901_cast_fp16)[name = string("op_902_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> hidden_states_17_cast_fp16 = mul(x = inputs_13_cast_fp16, y = var_902_cast_fp16)[name = string("hidden_states_17_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> w_15_to_fp16 = const()[name = string("w_15_to_fp16"), val = tensor<fp16, [1, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(22108288)))];
+            tensor<fp16, [1, 1024, 1, 1]> input_11_cast_fp16 = mul(x = w_15_to_fp16, y = hidden_states_17_cast_fp16)[name = string("input_11_cast_fp16")];
+            string input_13_pad_type_0 = const()[name = string("input_13_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> input_13_strides_0 = const()[name = string("input_13_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> input_13_pad_0 = const()[name = string("input_13_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> input_13_dilations_0 = const()[name = string("input_13_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 input_13_groups_0 = const()[name = string("input_13_groups_0"), val = int32(1)];
+            tensor<fp16, [3072, 1024, 1, 1]> layers_1_mlp_gate_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [3072, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(22110400))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(25256192))))[name = string("layers_1_mlp_gate_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 3072, 1, 1]> input_13_cast_fp16 = conv(dilations = input_13_dilations_0, groups = input_13_groups_0, pad = input_13_pad_0, pad_type = input_13_pad_type_0, strides = input_13_strides_0, weight = layers_1_mlp_gate_proj_weight_to_fp16_palettized, x = input_11_cast_fp16)[name = string("input_13_cast_fp16")];
+            tensor<fp16, [1, 3072, 1, 1]> var_916_cast_fp16 = silu(x = input_13_cast_fp16)[name = string("op_916_cast_fp16")];
+            string var_922_pad_type_0 = const()[name = string("op_922_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_922_strides_0 = const()[name = string("op_922_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_922_pad_0 = const()[name = string("op_922_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_922_dilations_0 = const()[name = string("op_922_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_922_groups_0 = const()[name = string("op_922_groups_0"), val = int32(1)];
+            tensor<fp16, [3072, 1024, 1, 1]> layers_1_mlp_up_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [3072, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(25256768))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(28402560))))[name = string("layers_1_mlp_up_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 3072, 1, 1]> var_922_cast_fp16 = conv(dilations = var_922_dilations_0, groups = var_922_groups_0, pad = var_922_pad_0, pad_type = var_922_pad_type_0, strides = var_922_strides_0, weight = layers_1_mlp_up_proj_weight_to_fp16_palettized, x = input_11_cast_fp16)[name = string("op_922_cast_fp16")];
+            tensor<fp16, [1, 3072, 1, 1]> input_15_cast_fp16 = mul(x = var_916_cast_fp16, y = var_922_cast_fp16)[name = string("input_15_cast_fp16")];
+            string hidden_states_19_pad_type_0 = const()[name = string("hidden_states_19_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> hidden_states_19_strides_0 = const()[name = string("hidden_states_19_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> hidden_states_19_pad_0 = const()[name = string("hidden_states_19_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> hidden_states_19_dilations_0 = const()[name = string("hidden_states_19_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 hidden_states_19_groups_0 = const()[name = string("hidden_states_19_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 3072, 1, 1]> layers_1_mlp_down_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 3072, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(28403136))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(31548928))))[name = string("layers_1_mlp_down_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> hidden_states_19_cast_fp16 = conv(dilations = hidden_states_19_dilations_0, groups = hidden_states_19_groups_0, pad = hidden_states_19_pad_0, pad_type = hidden_states_19_pad_type_0, strides = hidden_states_19_strides_0, weight = layers_1_mlp_down_proj_weight_to_fp16_palettized, x = input_15_cast_fp16)[name = string("hidden_states_19_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> inputs_15_cast_fp16 = add(x = inputs_13_cast_fp16, y = hidden_states_19_cast_fp16)[name = string("inputs_15_cast_fp16")];
+            int32 var_936 = const()[name = string("op_936"), val = int32(3)];
+            int32 var_946 = const()[name = string("op_946"), val = int32(-2)];
+            int32 var_954 = const()[name = string("op_954"), val = int32(1)];
+            tensor<fp16, [1, 1024, 1, 1]> inputs_sq_17_cast_fp16 = mul(x = inputs_15_cast_fp16, y = inputs_15_cast_fp16)[name = string("inputs_sq_17_cast_fp16")];
+            tensor<int32, [1]> variance_17_axes_0 = const()[name = string("variance_17_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_17_keep_dims_0 = const()[name = string("variance_17_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 1, 1, 1]> variance_17_cast_fp16 = reduce_mean(axes = variance_17_axes_0, keep_dims = variance_17_keep_dims_0, x = inputs_sq_17_cast_fp16)[name = string("variance_17_cast_fp16")];
+            fp16 var_966_to_fp16 = const()[name = string("op_966_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 1, 1]> var_967_cast_fp16 = add(x = variance_17_cast_fp16, y = var_966_to_fp16)[name = string("op_967_cast_fp16")];
+            fp32 var_968_epsilon_0 = const()[name = string("op_968_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 1, 1, 1]> var_968_cast_fp16 = rsqrt(epsilon = var_968_epsilon_0, x = var_967_cast_fp16)[name = string("op_968_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> hidden_states_21_cast_fp16 = mul(x = inputs_15_cast_fp16, y = var_968_cast_fp16)[name = string("hidden_states_21_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> w_17_to_fp16 = const()[name = string("w_17_to_fp16"), val = tensor<fp16, [1, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(31549504)))];
+            tensor<fp16, [1, 1024, 1, 1]> obj_21_cast_fp16 = mul(x = w_17_to_fp16, y = hidden_states_21_cast_fp16)[name = string("obj_21_cast_fp16")];
+            string query_13_pad_type_0 = const()[name = string("query_13_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> query_13_strides_0 = const()[name = string("query_13_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> query_13_pad_0 = const()[name = string("query_13_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> query_13_dilations_0 = const()[name = string("query_13_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 query_13_groups_0 = const()[name = string("query_13_groups_0"), val = int32(1)];
+            tensor<fp16, [2048, 1024, 1, 1]> layers_2_self_attn_q_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [2048, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(31551616))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(33648832))))[name = string("layers_2_self_attn_q_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1, 1]> query_13_cast_fp16 = conv(bias = layers_0_self_attn_q_proj_bias_to_fp16, dilations = query_13_dilations_0, groups = query_13_groups_0, pad = query_13_pad_0, pad_type = query_13_pad_type_0, strides = query_13_strides_0, weight = layers_2_self_attn_q_proj_weight_to_fp16_palettized, x = obj_21_cast_fp16)[name = string("query_13_cast_fp16")];
+            string current_key_9_pad_type_0 = const()[name = string("current_key_9_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> current_key_9_strides_0 = const()[name = string("current_key_9_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> current_key_9_pad_0 = const()[name = string("current_key_9_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> current_key_9_dilations_0 = const()[name = string("current_key_9_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 current_key_9_groups_0 = const()[name = string("current_key_9_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_2_self_attn_k_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(33649408))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(34698048))))[name = string("layers_2_self_attn_k_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> current_key_9_cast_fp16 = conv(dilations = current_key_9_dilations_0, groups = current_key_9_groups_0, pad = current_key_9_pad_0, pad_type = current_key_9_pad_type_0, strides = current_key_9_strides_0, weight = layers_2_self_attn_k_proj_weight_to_fp16_palettized, x = obj_21_cast_fp16)[name = string("current_key_9_cast_fp16")];
+            string current_value_5_pad_type_0 = const()[name = string("current_value_5_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> current_value_5_strides_0 = const()[name = string("current_value_5_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> current_value_5_pad_0 = const()[name = string("current_value_5_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> current_value_5_dilations_0 = const()[name = string("current_value_5_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 current_value_5_groups_0 = const()[name = string("current_value_5_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_2_self_attn_v_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(34698624))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(35747264))))[name = string("layers_2_self_attn_v_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> current_value_5_cast_fp16 = conv(bias = layers_0_self_attn_v_proj_bias_to_fp16, dilations = current_value_5_dilations_0, groups = current_value_5_groups_0, pad = current_value_5_pad_0, pad_type = current_value_5_pad_type_0, strides = current_value_5_strides_0, weight = layers_2_self_attn_v_proj_weight_to_fp16_palettized, x = obj_21_cast_fp16)[name = string("current_value_5_cast_fp16")];
+            tensor<int32, [4]> var_1005 = const()[name = string("op_1005"), val = tensor<int32, [4]>([16, 128, 1, 1])];
+            tensor<fp16, [16, 128, 1, 1]> inputs_17_cast_fp16 = reshape(shape = var_1005, x = query_13_cast_fp16)[name = string("inputs_17_cast_fp16")];
+            tensor<fp16, [16, 128, 1, 1]> inputs_sq_19_cast_fp16 = mul(x = inputs_17_cast_fp16, y = inputs_17_cast_fp16)[name = string("inputs_sq_19_cast_fp16")];
+            tensor<int32, [1]> variance_19_axes_0 = const()[name = string("variance_19_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_19_keep_dims_0 = const()[name = string("variance_19_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [16, 1, 1, 1]> variance_19_cast_fp16 = reduce_mean(axes = variance_19_axes_0, keep_dims = variance_19_keep_dims_0, x = inputs_sq_19_cast_fp16)[name = string("variance_19_cast_fp16")];
+            fp16 var_1011_to_fp16 = const()[name = string("op_1011_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [16, 1, 1, 1]> var_1012_cast_fp16 = add(x = variance_19_cast_fp16, y = var_1011_to_fp16)[name = string("op_1012_cast_fp16")];
+            fp32 var_1013_epsilon_0 = const()[name = string("op_1013_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [16, 1, 1, 1]> var_1013_cast_fp16 = rsqrt(epsilon = var_1013_epsilon_0, x = var_1012_cast_fp16)[name = string("op_1013_cast_fp16")];
+            tensor<fp16, [16, 128, 1, 1]> hidden_states_23_cast_fp16 = mul(x = inputs_17_cast_fp16, y = var_1013_cast_fp16)[name = string("hidden_states_23_cast_fp16")];
+            tensor<fp16, [1, 128, 1, 1]> w_19_to_fp16 = const()[name = string("w_19_to_fp16"), val = tensor<fp16, [1, 128, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(35747840)))];
+            tensor<fp16, [16, 128, 1, 1]> query_normed_5_cast_fp16 = mul(x = w_19_to_fp16, y = hidden_states_23_cast_fp16)[name = string("query_normed_5_cast_fp16")];
+            tensor<int32, [4]> var_1021 = const()[name = string("op_1021"), val = tensor<int32, [4]>([8, 128, 1, 1])];
+            tensor<fp16, [8, 128, 1, 1]> inputs_19_cast_fp16 = reshape(shape = var_1021, x = current_key_9_cast_fp16)[name = string("inputs_19_cast_fp16")];
+            tensor<fp16, [8, 128, 1, 1]> inputs_sq_21_cast_fp16 = mul(x = inputs_19_cast_fp16, y = inputs_19_cast_fp16)[name = string("inputs_sq_21_cast_fp16")];
+            tensor<int32, [1]> variance_21_axes_0 = const()[name = string("variance_21_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_21_keep_dims_0 = const()[name = string("variance_21_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [8, 1, 1, 1]> variance_21_cast_fp16 = reduce_mean(axes = variance_21_axes_0, keep_dims = variance_21_keep_dims_0, x = inputs_sq_21_cast_fp16)[name = string("variance_21_cast_fp16")];
+            fp16 var_1027_to_fp16 = const()[name = string("op_1027_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [8, 1, 1, 1]> var_1028_cast_fp16 = add(x = variance_21_cast_fp16, y = var_1027_to_fp16)[name = string("op_1028_cast_fp16")];
+            fp32 var_1029_epsilon_0 = const()[name = string("op_1029_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [8, 1, 1, 1]> var_1029_cast_fp16 = rsqrt(epsilon = var_1029_epsilon_0, x = var_1028_cast_fp16)[name = string("op_1029_cast_fp16")];
+            tensor<fp16, [8, 128, 1, 1]> hidden_states_25_cast_fp16 = mul(x = inputs_19_cast_fp16, y = var_1029_cast_fp16)[name = string("hidden_states_25_cast_fp16")];
+            tensor<fp16, [1, 128, 1, 1]> w_21_to_fp16 = const()[name = string("w_21_to_fp16"), val = tensor<fp16, [1, 128, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(35748160)))];
+            tensor<fp16, [8, 128, 1, 1]> current_key_normed_5_cast_fp16 = mul(x = w_21_to_fp16, y = hidden_states_25_cast_fp16)[name = string("current_key_normed_5_cast_fp16")];
+            tensor<int32, [4]> var_1047 = const()[name = string("op_1047"), val = tensor<int32, [4]>([1, 16, 128, -1])];
+            tensor<fp16, [1, 16, 128, 1]> mh_q_13_cast_fp16 = reshape(shape = var_1047, x = query_normed_5_cast_fp16)[name = string("mh_q_13_cast_fp16")];
+            tensor<int32, [4]> var_1049 = const()[name = string("op_1049"), val = tensor<int32, [4]>([1, 8, 128, -1])];
+            tensor<fp16, [1, 8, 128, 1]> mh_k_9_cast_fp16 = reshape(shape = var_1049, x = current_key_normed_5_cast_fp16)[name = string("mh_k_9_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> var_1053_cast_fp16 = mul(x = mh_q_13_cast_fp16, y = cos_1_cast_fp16)[name = string("op_1053_cast_fp16")];
+            tensor<int32, [4]> var_1058_begin_0 = const()[name = string("op_1058_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_1058_end_0 = const()[name = string("op_1058_end_0"), val = tensor<int32, [4]>([1, 16, 64, 1])];
+            tensor<bool, [4]> var_1058_end_mask_0 = const()[name = string("op_1058_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 16, 64, 1]> var_1058_cast_fp16 = slice_by_index(begin = var_1058_begin_0, end = var_1058_end_0, end_mask = var_1058_end_mask_0, x = mh_q_13_cast_fp16)[name = string("op_1058_cast_fp16")];
+            tensor<int32, [4]> var_1064_begin_0 = const()[name = string("op_1064_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_1064_end_0 = const()[name = string("op_1064_end_0"), val = tensor<int32, [4]>([1, 16, 128, 1])];
+            tensor<bool, [4]> var_1064_end_mask_0 = const()[name = string("op_1064_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 16, 64, 1]> var_1064_cast_fp16 = slice_by_index(begin = var_1064_begin_0, end = var_1064_end_0, end_mask = var_1064_end_mask_0, x = mh_q_13_cast_fp16)[name = string("op_1064_cast_fp16")];
+            fp16 const_63_promoted_to_fp16 = const()[name = string("const_63_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 16, 64, 1]> var_1066_cast_fp16 = mul(x = var_1064_cast_fp16, y = const_63_promoted_to_fp16)[name = string("op_1066_cast_fp16")];
+            bool var_1068_interleave_0 = const()[name = string("op_1068_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 1]> var_1068_cast_fp16 = concat(axis = var_946, interleave = var_1068_interleave_0, values = (var_1066_cast_fp16, var_1058_cast_fp16))[name = string("op_1068_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> var_1069_cast_fp16 = mul(x = var_1068_cast_fp16, y = sin_1_cast_fp16)[name = string("op_1069_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> mh_q_15_cast_fp16 = add(x = var_1053_cast_fp16, y = var_1069_cast_fp16)[name = string("mh_q_15_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> var_1071_cast_fp16 = mul(x = mh_k_9_cast_fp16, y = cos_1_cast_fp16)[name = string("op_1071_cast_fp16")];
+            tensor<int32, [4]> var_1076_begin_0 = const()[name = string("op_1076_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_1076_end_0 = const()[name = string("op_1076_end_0"), val = tensor<int32, [4]>([1, 8, 64, 1])];
+            tensor<bool, [4]> var_1076_end_mask_0 = const()[name = string("op_1076_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 8, 64, 1]> var_1076_cast_fp16 = slice_by_index(begin = var_1076_begin_0, end = var_1076_end_0, end_mask = var_1076_end_mask_0, x = mh_k_9_cast_fp16)[name = string("op_1076_cast_fp16")];
+            tensor<int32, [4]> var_1082_begin_0 = const()[name = string("op_1082_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_1082_end_0 = const()[name = string("op_1082_end_0"), val = tensor<int32, [4]>([1, 8, 128, 1])];
+            tensor<bool, [4]> var_1082_end_mask_0 = const()[name = string("op_1082_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 8, 64, 1]> var_1082_cast_fp16 = slice_by_index(begin = var_1082_begin_0, end = var_1082_end_0, end_mask = var_1082_end_mask_0, x = mh_k_9_cast_fp16)[name = string("op_1082_cast_fp16")];
+            fp16 const_66_promoted_to_fp16 = const()[name = string("const_66_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 64, 1]> var_1084_cast_fp16 = mul(x = var_1082_cast_fp16, y = const_66_promoted_to_fp16)[name = string("op_1084_cast_fp16")];
+            bool var_1086_interleave_0 = const()[name = string("op_1086_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 128, 1]> var_1086_cast_fp16 = concat(axis = var_946, interleave = var_1086_interleave_0, values = (var_1084_cast_fp16, var_1076_cast_fp16))[name = string("op_1086_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> var_1087_cast_fp16 = mul(x = var_1086_cast_fp16, y = sin_1_cast_fp16)[name = string("op_1087_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> mh_k_11_cast_fp16 = add(x = var_1071_cast_fp16, y = var_1087_cast_fp16)[name = string("mh_k_11_cast_fp16")];
+            tensor<int32, [4]> var_1091 = const()[name = string("op_1091"), val = tensor<int32, [4]>([1, 1024, 1, 1])];
+            tensor<fp16, [1, 1024, 1, 1]> current_key_11_cast_fp16 = reshape(shape = var_1091, x = mh_k_11_cast_fp16)[name = string("current_key_11_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_1098_cast_fp16 = mul(x = var_101_cast_fp16_2, y = var_323_cast_fp16)[name = string("op_1098_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_1099_cast_fp16 = mul(x = current_key_11_cast_fp16, y = var_321_cast_fp16)[name = string("op_1099_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> key_15_cast_fp16 = add(x = var_1098_cast_fp16, y = var_1099_cast_fp16)[name = string("key_15_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_1102_cast_fp16 = mul(x = var_132_cast_fp16_2, y = var_323_cast_fp16)[name = string("op_1102_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_1103_cast_fp16 = mul(x = current_value_5_cast_fp16, y = var_321_cast_fp16)[name = string("op_1103_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> value_9_cast_fp16 = add(x = var_1102_cast_fp16, y = var_1103_cast_fp16)[name = string("value_9_cast_fp16")];
+            tensor<int32, [4]> var_1107 = const()[name = string("op_1107"), val = tensor<int32, [4]>([1, 8, 128, 256])];
+            tensor<fp16, [1, 8, 128, 256]> key_heads_9_cast_fp16 = reshape(shape = var_1107, x = key_15_cast_fp16)[name = string("key_heads_9_cast_fp16")];
+            tensor<int32, [4]> var_1109 = const()[name = string("op_1109"), val = tensor<int32, [4]>([1, 8, 128, 256])];
+            tensor<fp16, [1, 8, 128, 256]> value_heads_9_cast_fp16 = reshape(shape = var_1109, x = value_9_cast_fp16)[name = string("value_heads_9_cast_fp16")];
+            tensor<int32, [4]> var_1112_begin_0 = const()[name = string("op_1112_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_1112_end_0 = const()[name = string("op_1112_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_1112_end_mask_0 = const()[name = string("op_1112_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_1112_cast_fp16 = slice_by_index(begin = var_1112_begin_0, end = var_1112_end_0, end_mask = var_1112_end_mask_0, x = key_heads_9_cast_fp16)[name = string("op_1112_cast_fp16")];
+            tensor<int32, [4]> var_1116_begin_0 = const()[name = string("op_1116_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_1116_end_0 = const()[name = string("op_1116_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_1116_end_mask_0 = const()[name = string("op_1116_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_1116_cast_fp16 = slice_by_index(begin = var_1116_begin_0, end = var_1116_end_0, end_mask = var_1116_end_mask_0, x = value_heads_9_cast_fp16)[name = string("op_1116_cast_fp16")];
+            tensor<int32, [4]> var_1128_begin_0 = const()[name = string("op_1128_begin_0"), val = tensor<int32, [4]>([0, 1, 0, 0])];
+            tensor<int32, [4]> var_1128_end_0 = const()[name = string("op_1128_end_0"), val = tensor<int32, [4]>([1, 2, 128, 256])];
+            tensor<bool, [4]> var_1128_end_mask_0 = const()[name = string("op_1128_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_1128_cast_fp16 = slice_by_index(begin = var_1128_begin_0, end = var_1128_end_0, end_mask = var_1128_end_mask_0, x = key_heads_9_cast_fp16)[name = string("op_1128_cast_fp16")];
+            tensor<int32, [4]> var_1132_begin_0 = const()[name = string("op_1132_begin_0"), val = tensor<int32, [4]>([0, 1, 0, 0])];
+            tensor<int32, [4]> var_1132_end_0 = const()[name = string("op_1132_end_0"), val = tensor<int32, [4]>([1, 2, 128, 256])];
+            tensor<bool, [4]> var_1132_end_mask_0 = const()[name = string("op_1132_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_1132_cast_fp16 = slice_by_index(begin = var_1132_begin_0, end = var_1132_end_0, end_mask = var_1132_end_mask_0, x = value_heads_9_cast_fp16)[name = string("op_1132_cast_fp16")];
+            tensor<int32, [4]> var_1144_begin_0 = const()[name = string("op_1144_begin_0"), val = tensor<int32, [4]>([0, 2, 0, 0])];
+            tensor<int32, [4]> var_1144_end_0 = const()[name = string("op_1144_end_0"), val = tensor<int32, [4]>([1, 3, 128, 256])];
+            tensor<bool, [4]> var_1144_end_mask_0 = const()[name = string("op_1144_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_1144_cast_fp16 = slice_by_index(begin = var_1144_begin_0, end = var_1144_end_0, end_mask = var_1144_end_mask_0, x = key_heads_9_cast_fp16)[name = string("op_1144_cast_fp16")];
+            tensor<int32, [4]> var_1148_begin_0 = const()[name = string("op_1148_begin_0"), val = tensor<int32, [4]>([0, 2, 0, 0])];
+            tensor<int32, [4]> var_1148_end_0 = const()[name = string("op_1148_end_0"), val = tensor<int32, [4]>([1, 3, 128, 256])];
+            tensor<bool, [4]> var_1148_end_mask_0 = const()[name = string("op_1148_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_1148_cast_fp16 = slice_by_index(begin = var_1148_begin_0, end = var_1148_end_0, end_mask = var_1148_end_mask_0, x = value_heads_9_cast_fp16)[name = string("op_1148_cast_fp16")];
+            tensor<int32, [4]> var_1160_begin_0 = const()[name = string("op_1160_begin_0"), val = tensor<int32, [4]>([0, 3, 0, 0])];
+            tensor<int32, [4]> var_1160_end_0 = const()[name = string("op_1160_end_0"), val = tensor<int32, [4]>([1, 4, 128, 256])];
+            tensor<bool, [4]> var_1160_end_mask_0 = const()[name = string("op_1160_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_1160_cast_fp16 = slice_by_index(begin = var_1160_begin_0, end = var_1160_end_0, end_mask = var_1160_end_mask_0, x = key_heads_9_cast_fp16)[name = string("op_1160_cast_fp16")];
+            tensor<int32, [4]> var_1164_begin_0 = const()[name = string("op_1164_begin_0"), val = tensor<int32, [4]>([0, 3, 0, 0])];
+            tensor<int32, [4]> var_1164_end_0 = const()[name = string("op_1164_end_0"), val = tensor<int32, [4]>([1, 4, 128, 256])];
+            tensor<bool, [4]> var_1164_end_mask_0 = const()[name = string("op_1164_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_1164_cast_fp16 = slice_by_index(begin = var_1164_begin_0, end = var_1164_end_0, end_mask = var_1164_end_mask_0, x = value_heads_9_cast_fp16)[name = string("op_1164_cast_fp16")];
+            tensor<int32, [4]> var_1176_begin_0 = const()[name = string("op_1176_begin_0"), val = tensor<int32, [4]>([0, 4, 0, 0])];
+            tensor<int32, [4]> var_1176_end_0 = const()[name = string("op_1176_end_0"), val = tensor<int32, [4]>([1, 5, 128, 256])];
+            tensor<bool, [4]> var_1176_end_mask_0 = const()[name = string("op_1176_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_1176_cast_fp16 = slice_by_index(begin = var_1176_begin_0, end = var_1176_end_0, end_mask = var_1176_end_mask_0, x = key_heads_9_cast_fp16)[name = string("op_1176_cast_fp16")];
+            tensor<int32, [4]> var_1180_begin_0 = const()[name = string("op_1180_begin_0"), val = tensor<int32, [4]>([0, 4, 0, 0])];
+            tensor<int32, [4]> var_1180_end_0 = const()[name = string("op_1180_end_0"), val = tensor<int32, [4]>([1, 5, 128, 256])];
+            tensor<bool, [4]> var_1180_end_mask_0 = const()[name = string("op_1180_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_1180_cast_fp16 = slice_by_index(begin = var_1180_begin_0, end = var_1180_end_0, end_mask = var_1180_end_mask_0, x = value_heads_9_cast_fp16)[name = string("op_1180_cast_fp16")];
+            tensor<int32, [4]> var_1192_begin_0 = const()[name = string("op_1192_begin_0"), val = tensor<int32, [4]>([0, 5, 0, 0])];
+            tensor<int32, [4]> var_1192_end_0 = const()[name = string("op_1192_end_0"), val = tensor<int32, [4]>([1, 6, 128, 256])];
+            tensor<bool, [4]> var_1192_end_mask_0 = const()[name = string("op_1192_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_1192_cast_fp16 = slice_by_index(begin = var_1192_begin_0, end = var_1192_end_0, end_mask = var_1192_end_mask_0, x = key_heads_9_cast_fp16)[name = string("op_1192_cast_fp16")];
+            tensor<int32, [4]> var_1196_begin_0 = const()[name = string("op_1196_begin_0"), val = tensor<int32, [4]>([0, 5, 0, 0])];
+            tensor<int32, [4]> var_1196_end_0 = const()[name = string("op_1196_end_0"), val = tensor<int32, [4]>([1, 6, 128, 256])];
+            tensor<bool, [4]> var_1196_end_mask_0 = const()[name = string("op_1196_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_1196_cast_fp16 = slice_by_index(begin = var_1196_begin_0, end = var_1196_end_0, end_mask = var_1196_end_mask_0, x = value_heads_9_cast_fp16)[name = string("op_1196_cast_fp16")];
+            tensor<int32, [4]> var_1208_begin_0 = const()[name = string("op_1208_begin_0"), val = tensor<int32, [4]>([0, 6, 0, 0])];
+            tensor<int32, [4]> var_1208_end_0 = const()[name = string("op_1208_end_0"), val = tensor<int32, [4]>([1, 7, 128, 256])];
+            tensor<bool, [4]> var_1208_end_mask_0 = const()[name = string("op_1208_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_1208_cast_fp16 = slice_by_index(begin = var_1208_begin_0, end = var_1208_end_0, end_mask = var_1208_end_mask_0, x = key_heads_9_cast_fp16)[name = string("op_1208_cast_fp16")];
+            tensor<int32, [4]> var_1212_begin_0 = const()[name = string("op_1212_begin_0"), val = tensor<int32, [4]>([0, 6, 0, 0])];
+            tensor<int32, [4]> var_1212_end_0 = const()[name = string("op_1212_end_0"), val = tensor<int32, [4]>([1, 7, 128, 256])];
+            tensor<bool, [4]> var_1212_end_mask_0 = const()[name = string("op_1212_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_1212_cast_fp16 = slice_by_index(begin = var_1212_begin_0, end = var_1212_end_0, end_mask = var_1212_end_mask_0, x = value_heads_9_cast_fp16)[name = string("op_1212_cast_fp16")];
+            tensor<int32, [4]> var_1224_begin_0 = const()[name = string("op_1224_begin_0"), val = tensor<int32, [4]>([0, 7, 0, 0])];
+            tensor<int32, [4]> var_1224_end_0 = const()[name = string("op_1224_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_1224_end_mask_0 = const()[name = string("op_1224_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_1224_cast_fp16 = slice_by_index(begin = var_1224_begin_0, end = var_1224_end_0, end_mask = var_1224_end_mask_0, x = key_heads_9_cast_fp16)[name = string("op_1224_cast_fp16")];
+            tensor<int32, [4]> var_1228_begin_0 = const()[name = string("op_1228_begin_0"), val = tensor<int32, [4]>([0, 7, 0, 0])];
+            tensor<int32, [4]> var_1228_end_0 = const()[name = string("op_1228_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_1228_end_mask_0 = const()[name = string("op_1228_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_1228_cast_fp16 = slice_by_index(begin = var_1228_begin_0, end = var_1228_end_0, end_mask = var_1228_end_mask_0, x = value_heads_9_cast_fp16)[name = string("op_1228_cast_fp16")];
+            bool key_heads_11_interleave_0 = const()[name = string("key_heads_11_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 256]> key_heads_11_cast_fp16 = concat(axis = var_954, interleave = key_heads_11_interleave_0, values = (var_1112_cast_fp16, var_1112_cast_fp16, var_1128_cast_fp16, var_1128_cast_fp16, var_1144_cast_fp16, var_1144_cast_fp16, var_1160_cast_fp16, var_1160_cast_fp16, var_1176_cast_fp16, var_1176_cast_fp16, var_1192_cast_fp16, var_1192_cast_fp16, var_1208_cast_fp16, var_1208_cast_fp16, var_1224_cast_fp16, var_1224_cast_fp16))[name = string("key_heads_11_cast_fp16")];
+            bool value_heads_11_interleave_0 = const()[name = string("value_heads_11_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 256]> value_heads_11_cast_fp16 = concat(axis = var_954, interleave = value_heads_11_interleave_0, values = (var_1116_cast_fp16, var_1116_cast_fp16, var_1132_cast_fp16, var_1132_cast_fp16, var_1148_cast_fp16, var_1148_cast_fp16, var_1164_cast_fp16, var_1164_cast_fp16, var_1180_cast_fp16, var_1180_cast_fp16, var_1196_cast_fp16, var_1196_cast_fp16, var_1212_cast_fp16, var_1212_cast_fp16, var_1228_cast_fp16, var_1228_cast_fp16))[name = string("value_heads_11_cast_fp16")];
+            fp16 var_1251_to_fp16 = const()[name = string("op_1251_to_fp16"), val = fp16(0x1.6ap-4)];
+            tensor<fp16, [1, 16, 128, 1]> var_1252_cast_fp16 = mul(x = mh_q_15_cast_fp16, y = var_1251_to_fp16)[name = string("op_1252_cast_fp16")];
+            bool mh_w_9_transpose_x_0 = const()[name = string("mh_w_9_transpose_x_0"), val = bool(true)];
+            bool mh_w_9_transpose_y_0 = const()[name = string("mh_w_9_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 1, 256]> mh_w_9_cast_fp16 = matmul(transpose_x = mh_w_9_transpose_x_0, transpose_y = mh_w_9_transpose_y_0, x = var_1252_cast_fp16, y = key_heads_11_cast_fp16)[name = string("mh_w_9_cast_fp16")];
+            tensor<fp16, [1, 16, 1, 256]> mh_w_11_cast_fp16 = add(x = mh_w_9_cast_fp16, y = var_487_cast_fp16)[name = string("mh_w_11_cast_fp16")];
+            tensor<fp16, [1, 16, 1, 256]> var_1264_cast_fp16 = softmax(axis = var_936, x = mh_w_11_cast_fp16)[name = string("op_1264_cast_fp16")];
+            bool attn_5_transpose_x_0 = const()[name = string("attn_5_transpose_x_0"), val = bool(false)];
+            bool attn_5_transpose_y_0 = const()[name = string("attn_5_transpose_y_0"), val = bool(true)];
+            tensor<fp16, [1, 16, 128, 1]> attn_5_cast_fp16 = matmul(transpose_x = attn_5_transpose_x_0, transpose_y = attn_5_transpose_y_0, x = value_heads_11_cast_fp16, y = var_1264_cast_fp16)[name = string("attn_5_cast_fp16")];
+            tensor<int32, [4]> var_1269 = const()[name = string("op_1269"), val = tensor<int32, [4]>([1, -1, 1, 1])];
+            tensor<fp16, [1, 2048, 1, 1]> input_17_cast_fp16 = reshape(shape = var_1269, x = attn_5_cast_fp16)[name = string("input_17_cast_fp16")];
+            string obj_27_pad_type_0 = const()[name = string("obj_27_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> obj_27_strides_0 = const()[name = string("obj_27_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> obj_27_pad_0 = const()[name = string("obj_27_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> obj_27_dilations_0 = const()[name = string("obj_27_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 obj_27_groups_0 = const()[name = string("obj_27_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 2048, 1, 1]> layers_2_self_attn_o_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(35748480))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(37845696))))[name = string("layers_2_self_attn_o_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> obj_27_cast_fp16 = conv(bias = layers_0_self_attn_v_proj_bias_to_fp16, dilations = obj_27_dilations_0, groups = obj_27_groups_0, pad = obj_27_pad_0, pad_type = obj_27_pad_type_0, strides = obj_27_strides_0, weight = layers_2_self_attn_o_proj_weight_to_fp16_palettized, x = input_17_cast_fp16)[name = string("obj_27_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> inputs_21_cast_fp16 = add(x = inputs_15_cast_fp16, y = obj_27_cast_fp16)[name = string("inputs_21_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> inputs_sq_23_cast_fp16 = mul(x = inputs_21_cast_fp16, y = inputs_21_cast_fp16)[name = string("inputs_sq_23_cast_fp16")];
+            tensor<int32, [1]> variance_23_axes_0 = const()[name = string("variance_23_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_23_keep_dims_0 = const()[name = string("variance_23_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 1, 1, 1]> variance_23_cast_fp16 = reduce_mean(axes = variance_23_axes_0, keep_dims = variance_23_keep_dims_0, x = inputs_sq_23_cast_fp16)[name = string("variance_23_cast_fp16")];
+            fp16 var_1287_to_fp16 = const()[name = string("op_1287_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 1, 1]> var_1288_cast_fp16 = add(x = variance_23_cast_fp16, y = var_1287_to_fp16)[name = string("op_1288_cast_fp16")];
+            fp32 var_1289_epsilon_0 = const()[name = string("op_1289_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 1, 1, 1]> var_1289_cast_fp16 = rsqrt(epsilon = var_1289_epsilon_0, x = var_1288_cast_fp16)[name = string("op_1289_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> hidden_states_27_cast_fp16 = mul(x = inputs_21_cast_fp16, y = var_1289_cast_fp16)[name = string("hidden_states_27_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> w_23_to_fp16 = const()[name = string("w_23_to_fp16"), val = tensor<fp16, [1, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(37846272)))];
+            tensor<fp16, [1, 1024, 1, 1]> input_19_cast_fp16 = mul(x = w_23_to_fp16, y = hidden_states_27_cast_fp16)[name = string("input_19_cast_fp16")];
+            string input_21_pad_type_0 = const()[name = string("input_21_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> input_21_strides_0 = const()[name = string("input_21_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> input_21_pad_0 = const()[name = string("input_21_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> input_21_dilations_0 = const()[name = string("input_21_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 input_21_groups_0 = const()[name = string("input_21_groups_0"), val = int32(1)];
+            tensor<fp16, [3072, 1024, 1, 1]> layers_2_mlp_gate_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [3072, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(37848384))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(40994176))))[name = string("layers_2_mlp_gate_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 3072, 1, 1]> input_21_cast_fp16 = conv(dilations = input_21_dilations_0, groups = input_21_groups_0, pad = input_21_pad_0, pad_type = input_21_pad_type_0, strides = input_21_strides_0, weight = layers_2_mlp_gate_proj_weight_to_fp16_palettized, x = input_19_cast_fp16)[name = string("input_21_cast_fp16")];
+            tensor<fp16, [1, 3072, 1, 1]> var_1303_cast_fp16 = silu(x = input_21_cast_fp16)[name = string("op_1303_cast_fp16")];
+            string var_1309_pad_type_0 = const()[name = string("op_1309_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_1309_strides_0 = const()[name = string("op_1309_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_1309_pad_0 = const()[name = string("op_1309_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_1309_dilations_0 = const()[name = string("op_1309_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_1309_groups_0 = const()[name = string("op_1309_groups_0"), val = int32(1)];
+            tensor<fp16, [3072, 1024, 1, 1]> layers_2_mlp_up_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [3072, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(40994752))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(44140544))))[name = string("layers_2_mlp_up_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 3072, 1, 1]> var_1309_cast_fp16 = conv(dilations = var_1309_dilations_0, groups = var_1309_groups_0, pad = var_1309_pad_0, pad_type = var_1309_pad_type_0, strides = var_1309_strides_0, weight = layers_2_mlp_up_proj_weight_to_fp16_palettized, x = input_19_cast_fp16)[name = string("op_1309_cast_fp16")];
+            tensor<fp16, [1, 3072, 1, 1]> input_23_cast_fp16 = mul(x = var_1303_cast_fp16, y = var_1309_cast_fp16)[name = string("input_23_cast_fp16")];
+            string hidden_states_29_pad_type_0 = const()[name = string("hidden_states_29_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> hidden_states_29_strides_0 = const()[name = string("hidden_states_29_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> hidden_states_29_pad_0 = const()[name = string("hidden_states_29_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> hidden_states_29_dilations_0 = const()[name = string("hidden_states_29_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 hidden_states_29_groups_0 = const()[name = string("hidden_states_29_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 3072, 1, 1]> layers_2_mlp_down_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 3072, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(44141120))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(47286912))))[name = string("layers_2_mlp_down_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> hidden_states_29_cast_fp16 = conv(dilations = hidden_states_29_dilations_0, groups = hidden_states_29_groups_0, pad = hidden_states_29_pad_0, pad_type = hidden_states_29_pad_type_0, strides = hidden_states_29_strides_0, weight = layers_2_mlp_down_proj_weight_to_fp16_palettized, x = input_23_cast_fp16)[name = string("hidden_states_29_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> inputs_23_cast_fp16 = add(x = inputs_21_cast_fp16, y = hidden_states_29_cast_fp16)[name = string("inputs_23_cast_fp16")];
+            int32 var_1323 = const()[name = string("op_1323"), val = int32(3)];
+            int32 var_1333 = const()[name = string("op_1333"), val = int32(-2)];
+            int32 var_1341 = const()[name = string("op_1341"), val = int32(1)];
+            tensor<fp16, [1, 1024, 1, 1]> inputs_sq_25_cast_fp16 = mul(x = inputs_23_cast_fp16, y = inputs_23_cast_fp16)[name = string("inputs_sq_25_cast_fp16")];
+            tensor<int32, [1]> variance_25_axes_0 = const()[name = string("variance_25_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_25_keep_dims_0 = const()[name = string("variance_25_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 1, 1, 1]> variance_25_cast_fp16 = reduce_mean(axes = variance_25_axes_0, keep_dims = variance_25_keep_dims_0, x = inputs_sq_25_cast_fp16)[name = string("variance_25_cast_fp16")];
+            fp16 var_1353_to_fp16 = const()[name = string("op_1353_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 1, 1]> var_1354_cast_fp16 = add(x = variance_25_cast_fp16, y = var_1353_to_fp16)[name = string("op_1354_cast_fp16")];
+            fp32 var_1355_epsilon_0 = const()[name = string("op_1355_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 1, 1, 1]> var_1355_cast_fp16 = rsqrt(epsilon = var_1355_epsilon_0, x = var_1354_cast_fp16)[name = string("op_1355_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> hidden_states_31_cast_fp16 = mul(x = inputs_23_cast_fp16, y = var_1355_cast_fp16)[name = string("hidden_states_31_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> w_25_to_fp16 = const()[name = string("w_25_to_fp16"), val = tensor<fp16, [1, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(47287488)))];
+            tensor<fp16, [1, 1024, 1, 1]> obj_29_cast_fp16 = mul(x = w_25_to_fp16, y = hidden_states_31_cast_fp16)[name = string("obj_29_cast_fp16")];
+            string query_19_pad_type_0 = const()[name = string("query_19_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> query_19_strides_0 = const()[name = string("query_19_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> query_19_pad_0 = const()[name = string("query_19_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> query_19_dilations_0 = const()[name = string("query_19_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 query_19_groups_0 = const()[name = string("query_19_groups_0"), val = int32(1)];
+            tensor<fp16, [2048, 1024, 1, 1]> layers_3_self_attn_q_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [2048, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(47289600))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(49386816))))[name = string("layers_3_self_attn_q_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1, 1]> query_19_cast_fp16 = conv(bias = layers_0_self_attn_q_proj_bias_to_fp16, dilations = query_19_dilations_0, groups = query_19_groups_0, pad = query_19_pad_0, pad_type = query_19_pad_type_0, strides = query_19_strides_0, weight = layers_3_self_attn_q_proj_weight_to_fp16_palettized, x = obj_29_cast_fp16)[name = string("query_19_cast_fp16")];
+            string current_key_13_pad_type_0 = const()[name = string("current_key_13_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> current_key_13_strides_0 = const()[name = string("current_key_13_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> current_key_13_pad_0 = const()[name = string("current_key_13_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> current_key_13_dilations_0 = const()[name = string("current_key_13_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 current_key_13_groups_0 = const()[name = string("current_key_13_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_3_self_attn_k_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(49387392))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(50436032))))[name = string("layers_3_self_attn_k_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> current_key_13_cast_fp16 = conv(dilations = current_key_13_dilations_0, groups = current_key_13_groups_0, pad = current_key_13_pad_0, pad_type = current_key_13_pad_type_0, strides = current_key_13_strides_0, weight = layers_3_self_attn_k_proj_weight_to_fp16_palettized, x = obj_29_cast_fp16)[name = string("current_key_13_cast_fp16")];
+            string current_value_7_pad_type_0 = const()[name = string("current_value_7_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> current_value_7_strides_0 = const()[name = string("current_value_7_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> current_value_7_pad_0 = const()[name = string("current_value_7_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> current_value_7_dilations_0 = const()[name = string("current_value_7_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 current_value_7_groups_0 = const()[name = string("current_value_7_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_3_self_attn_v_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(50436608))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(51485248))))[name = string("layers_3_self_attn_v_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> current_value_7_cast_fp16 = conv(bias = layers_0_self_attn_v_proj_bias_to_fp16, dilations = current_value_7_dilations_0, groups = current_value_7_groups_0, pad = current_value_7_pad_0, pad_type = current_value_7_pad_type_0, strides = current_value_7_strides_0, weight = layers_3_self_attn_v_proj_weight_to_fp16_palettized, x = obj_29_cast_fp16)[name = string("current_value_7_cast_fp16")];
+            tensor<int32, [4]> var_1392 = const()[name = string("op_1392"), val = tensor<int32, [4]>([16, 128, 1, 1])];
+            tensor<fp16, [16, 128, 1, 1]> inputs_25_cast_fp16 = reshape(shape = var_1392, x = query_19_cast_fp16)[name = string("inputs_25_cast_fp16")];
+            tensor<fp16, [16, 128, 1, 1]> inputs_sq_27_cast_fp16 = mul(x = inputs_25_cast_fp16, y = inputs_25_cast_fp16)[name = string("inputs_sq_27_cast_fp16")];
+            tensor<int32, [1]> variance_27_axes_0 = const()[name = string("variance_27_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_27_keep_dims_0 = const()[name = string("variance_27_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [16, 1, 1, 1]> variance_27_cast_fp16 = reduce_mean(axes = variance_27_axes_0, keep_dims = variance_27_keep_dims_0, x = inputs_sq_27_cast_fp16)[name = string("variance_27_cast_fp16")];
+            fp16 var_1398_to_fp16 = const()[name = string("op_1398_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [16, 1, 1, 1]> var_1399_cast_fp16 = add(x = variance_27_cast_fp16, y = var_1398_to_fp16)[name = string("op_1399_cast_fp16")];
+            fp32 var_1400_epsilon_0 = const()[name = string("op_1400_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [16, 1, 1, 1]> var_1400_cast_fp16 = rsqrt(epsilon = var_1400_epsilon_0, x = var_1399_cast_fp16)[name = string("op_1400_cast_fp16")];
+            tensor<fp16, [16, 128, 1, 1]> hidden_states_33_cast_fp16 = mul(x = inputs_25_cast_fp16, y = var_1400_cast_fp16)[name = string("hidden_states_33_cast_fp16")];
+            tensor<fp16, [1, 128, 1, 1]> w_27_to_fp16 = const()[name = string("w_27_to_fp16"), val = tensor<fp16, [1, 128, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(51485824)))];
+            tensor<fp16, [16, 128, 1, 1]> query_normed_7_cast_fp16 = mul(x = w_27_to_fp16, y = hidden_states_33_cast_fp16)[name = string("query_normed_7_cast_fp16")];
+            tensor<int32, [4]> var_1408 = const()[name = string("op_1408"), val = tensor<int32, [4]>([8, 128, 1, 1])];
+            tensor<fp16, [8, 128, 1, 1]> inputs_27_cast_fp16 = reshape(shape = var_1408, x = current_key_13_cast_fp16)[name = string("inputs_27_cast_fp16")];
+            tensor<fp16, [8, 128, 1, 1]> inputs_sq_29_cast_fp16 = mul(x = inputs_27_cast_fp16, y = inputs_27_cast_fp16)[name = string("inputs_sq_29_cast_fp16")];
+            tensor<int32, [1]> variance_29_axes_0 = const()[name = string("variance_29_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_29_keep_dims_0 = const()[name = string("variance_29_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [8, 1, 1, 1]> variance_29_cast_fp16 = reduce_mean(axes = variance_29_axes_0, keep_dims = variance_29_keep_dims_0, x = inputs_sq_29_cast_fp16)[name = string("variance_29_cast_fp16")];
+            fp16 var_1414_to_fp16 = const()[name = string("op_1414_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [8, 1, 1, 1]> var_1415_cast_fp16 = add(x = variance_29_cast_fp16, y = var_1414_to_fp16)[name = string("op_1415_cast_fp16")];
+            fp32 var_1416_epsilon_0 = const()[name = string("op_1416_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [8, 1, 1, 1]> var_1416_cast_fp16 = rsqrt(epsilon = var_1416_epsilon_0, x = var_1415_cast_fp16)[name = string("op_1416_cast_fp16")];
+            tensor<fp16, [8, 128, 1, 1]> hidden_states_35_cast_fp16 = mul(x = inputs_27_cast_fp16, y = var_1416_cast_fp16)[name = string("hidden_states_35_cast_fp16")];
+            tensor<fp16, [1, 128, 1, 1]> w_29_to_fp16 = const()[name = string("w_29_to_fp16"), val = tensor<fp16, [1, 128, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(51486144)))];
+            tensor<fp16, [8, 128, 1, 1]> current_key_normed_7_cast_fp16 = mul(x = w_29_to_fp16, y = hidden_states_35_cast_fp16)[name = string("current_key_normed_7_cast_fp16")];
+            tensor<int32, [4]> var_1434 = const()[name = string("op_1434"), val = tensor<int32, [4]>([1, 16, 128, -1])];
+            tensor<fp16, [1, 16, 128, 1]> mh_q_19_cast_fp16 = reshape(shape = var_1434, x = query_normed_7_cast_fp16)[name = string("mh_q_19_cast_fp16")];
+            tensor<int32, [4]> var_1436 = const()[name = string("op_1436"), val = tensor<int32, [4]>([1, 8, 128, -1])];
+            tensor<fp16, [1, 8, 128, 1]> mh_k_13_cast_fp16 = reshape(shape = var_1436, x = current_key_normed_7_cast_fp16)[name = string("mh_k_13_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> var_1440_cast_fp16 = mul(x = mh_q_19_cast_fp16, y = cos_1_cast_fp16)[name = string("op_1440_cast_fp16")];
+            tensor<int32, [4]> var_1445_begin_0 = const()[name = string("op_1445_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_1445_end_0 = const()[name = string("op_1445_end_0"), val = tensor<int32, [4]>([1, 16, 64, 1])];
+            tensor<bool, [4]> var_1445_end_mask_0 = const()[name = string("op_1445_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 16, 64, 1]> var_1445_cast_fp16 = slice_by_index(begin = var_1445_begin_0, end = var_1445_end_0, end_mask = var_1445_end_mask_0, x = mh_q_19_cast_fp16)[name = string("op_1445_cast_fp16")];
+            tensor<int32, [4]> var_1451_begin_0 = const()[name = string("op_1451_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_1451_end_0 = const()[name = string("op_1451_end_0"), val = tensor<int32, [4]>([1, 16, 128, 1])];
+            tensor<bool, [4]> var_1451_end_mask_0 = const()[name = string("op_1451_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 16, 64, 1]> var_1451_cast_fp16 = slice_by_index(begin = var_1451_begin_0, end = var_1451_end_0, end_mask = var_1451_end_mask_0, x = mh_q_19_cast_fp16)[name = string("op_1451_cast_fp16")];
+            fp16 const_86_promoted_to_fp16 = const()[name = string("const_86_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 16, 64, 1]> var_1453_cast_fp16 = mul(x = var_1451_cast_fp16, y = const_86_promoted_to_fp16)[name = string("op_1453_cast_fp16")];
+            bool var_1455_interleave_0 = const()[name = string("op_1455_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 1]> var_1455_cast_fp16 = concat(axis = var_1333, interleave = var_1455_interleave_0, values = (var_1453_cast_fp16, var_1445_cast_fp16))[name = string("op_1455_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> var_1456_cast_fp16 = mul(x = var_1455_cast_fp16, y = sin_1_cast_fp16)[name = string("op_1456_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> mh_q_21_cast_fp16 = add(x = var_1440_cast_fp16, y = var_1456_cast_fp16)[name = string("mh_q_21_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> var_1458_cast_fp16 = mul(x = mh_k_13_cast_fp16, y = cos_1_cast_fp16)[name = string("op_1458_cast_fp16")];
+            tensor<int32, [4]> var_1463_begin_0 = const()[name = string("op_1463_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_1463_end_0 = const()[name = string("op_1463_end_0"), val = tensor<int32, [4]>([1, 8, 64, 1])];
+            tensor<bool, [4]> var_1463_end_mask_0 = const()[name = string("op_1463_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 8, 64, 1]> var_1463_cast_fp16 = slice_by_index(begin = var_1463_begin_0, end = var_1463_end_0, end_mask = var_1463_end_mask_0, x = mh_k_13_cast_fp16)[name = string("op_1463_cast_fp16")];
+            tensor<int32, [4]> var_1469_begin_0 = const()[name = string("op_1469_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_1469_end_0 = const()[name = string("op_1469_end_0"), val = tensor<int32, [4]>([1, 8, 128, 1])];
+            tensor<bool, [4]> var_1469_end_mask_0 = const()[name = string("op_1469_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 8, 64, 1]> var_1469_cast_fp16 = slice_by_index(begin = var_1469_begin_0, end = var_1469_end_0, end_mask = var_1469_end_mask_0, x = mh_k_13_cast_fp16)[name = string("op_1469_cast_fp16")];
+            fp16 const_89_promoted_to_fp16 = const()[name = string("const_89_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 64, 1]> var_1471_cast_fp16 = mul(x = var_1469_cast_fp16, y = const_89_promoted_to_fp16)[name = string("op_1471_cast_fp16")];
+            bool var_1473_interleave_0 = const()[name = string("op_1473_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 128, 1]> var_1473_cast_fp16 = concat(axis = var_1333, interleave = var_1473_interleave_0, values = (var_1471_cast_fp16, var_1463_cast_fp16))[name = string("op_1473_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> var_1474_cast_fp16 = mul(x = var_1473_cast_fp16, y = sin_1_cast_fp16)[name = string("op_1474_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> mh_k_15_cast_fp16 = add(x = var_1458_cast_fp16, y = var_1474_cast_fp16)[name = string("mh_k_15_cast_fp16")];
+            tensor<int32, [4]> var_1478 = const()[name = string("op_1478"), val = tensor<int32, [4]>([1, 1024, 1, 1])];
+            tensor<fp16, [1, 1024, 1, 1]> current_key_15_cast_fp16 = reshape(shape = var_1478, x = mh_k_15_cast_fp16)[name = string("current_key_15_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_1485_cast_fp16 = mul(x = var_101_cast_fp16_3, y = var_323_cast_fp16)[name = string("op_1485_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_1486_cast_fp16 = mul(x = current_key_15_cast_fp16, y = var_321_cast_fp16)[name = string("op_1486_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> key_21_cast_fp16 = add(x = var_1485_cast_fp16, y = var_1486_cast_fp16)[name = string("key_21_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_1489_cast_fp16 = mul(x = var_132_cast_fp16_3, y = var_323_cast_fp16)[name = string("op_1489_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_1490_cast_fp16 = mul(x = current_value_7_cast_fp16, y = var_321_cast_fp16)[name = string("op_1490_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> value_13_cast_fp16 = add(x = var_1489_cast_fp16, y = var_1490_cast_fp16)[name = string("value_13_cast_fp16")];
+            tensor<int32, [4]> var_1494 = const()[name = string("op_1494"), val = tensor<int32, [4]>([1, 8, 128, 256])];
+            tensor<fp16, [1, 8, 128, 256]> key_heads_13_cast_fp16 = reshape(shape = var_1494, x = key_21_cast_fp16)[name = string("key_heads_13_cast_fp16")];
+            tensor<int32, [4]> var_1496 = const()[name = string("op_1496"), val = tensor<int32, [4]>([1, 8, 128, 256])];
+            tensor<fp16, [1, 8, 128, 256]> value_heads_13_cast_fp16 = reshape(shape = var_1496, x = value_13_cast_fp16)[name = string("value_heads_13_cast_fp16")];
+            tensor<int32, [4]> var_1499_begin_0 = const()[name = string("op_1499_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_1499_end_0 = const()[name = string("op_1499_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_1499_end_mask_0 = const()[name = string("op_1499_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_1499_cast_fp16 = slice_by_index(begin = var_1499_begin_0, end = var_1499_end_0, end_mask = var_1499_end_mask_0, x = key_heads_13_cast_fp16)[name = string("op_1499_cast_fp16")];
+            tensor<int32, [4]> var_1503_begin_0 = const()[name = string("op_1503_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_1503_end_0 = const()[name = string("op_1503_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_1503_end_mask_0 = const()[name = string("op_1503_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_1503_cast_fp16 = slice_by_index(begin = var_1503_begin_0, end = var_1503_end_0, end_mask = var_1503_end_mask_0, x = value_heads_13_cast_fp16)[name = string("op_1503_cast_fp16")];
+            tensor<int32, [4]> var_1515_begin_0 = const()[name = string("op_1515_begin_0"), val = tensor<int32, [4]>([0, 1, 0, 0])];
+            tensor<int32, [4]> var_1515_end_0 = const()[name = string("op_1515_end_0"), val = tensor<int32, [4]>([1, 2, 128, 256])];
+            tensor<bool, [4]> var_1515_end_mask_0 = const()[name = string("op_1515_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_1515_cast_fp16 = slice_by_index(begin = var_1515_begin_0, end = var_1515_end_0, end_mask = var_1515_end_mask_0, x = key_heads_13_cast_fp16)[name = string("op_1515_cast_fp16")];
+            tensor<int32, [4]> var_1519_begin_0 = const()[name = string("op_1519_begin_0"), val = tensor<int32, [4]>([0, 1, 0, 0])];
+            tensor<int32, [4]> var_1519_end_0 = const()[name = string("op_1519_end_0"), val = tensor<int32, [4]>([1, 2, 128, 256])];
+            tensor<bool, [4]> var_1519_end_mask_0 = const()[name = string("op_1519_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_1519_cast_fp16 = slice_by_index(begin = var_1519_begin_0, end = var_1519_end_0, end_mask = var_1519_end_mask_0, x = value_heads_13_cast_fp16)[name = string("op_1519_cast_fp16")];
+            tensor<int32, [4]> var_1531_begin_0 = const()[name = string("op_1531_begin_0"), val = tensor<int32, [4]>([0, 2, 0, 0])];
+            tensor<int32, [4]> var_1531_end_0 = const()[name = string("op_1531_end_0"), val = tensor<int32, [4]>([1, 3, 128, 256])];
+            tensor<bool, [4]> var_1531_end_mask_0 = const()[name = string("op_1531_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_1531_cast_fp16 = slice_by_index(begin = var_1531_begin_0, end = var_1531_end_0, end_mask = var_1531_end_mask_0, x = key_heads_13_cast_fp16)[name = string("op_1531_cast_fp16")];
+            tensor<int32, [4]> var_1535_begin_0 = const()[name = string("op_1535_begin_0"), val = tensor<int32, [4]>([0, 2, 0, 0])];
+            tensor<int32, [4]> var_1535_end_0 = const()[name = string("op_1535_end_0"), val = tensor<int32, [4]>([1, 3, 128, 256])];
+            tensor<bool, [4]> var_1535_end_mask_0 = const()[name = string("op_1535_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_1535_cast_fp16 = slice_by_index(begin = var_1535_begin_0, end = var_1535_end_0, end_mask = var_1535_end_mask_0, x = value_heads_13_cast_fp16)[name = string("op_1535_cast_fp16")];
+            tensor<int32, [4]> var_1547_begin_0 = const()[name = string("op_1547_begin_0"), val = tensor<int32, [4]>([0, 3, 0, 0])];
+            tensor<int32, [4]> var_1547_end_0 = const()[name = string("op_1547_end_0"), val = tensor<int32, [4]>([1, 4, 128, 256])];
+            tensor<bool, [4]> var_1547_end_mask_0 = const()[name = string("op_1547_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_1547_cast_fp16 = slice_by_index(begin = var_1547_begin_0, end = var_1547_end_0, end_mask = var_1547_end_mask_0, x = key_heads_13_cast_fp16)[name = string("op_1547_cast_fp16")];
+            tensor<int32, [4]> var_1551_begin_0 = const()[name = string("op_1551_begin_0"), val = tensor<int32, [4]>([0, 3, 0, 0])];
+            tensor<int32, [4]> var_1551_end_0 = const()[name = string("op_1551_end_0"), val = tensor<int32, [4]>([1, 4, 128, 256])];
+            tensor<bool, [4]> var_1551_end_mask_0 = const()[name = string("op_1551_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_1551_cast_fp16 = slice_by_index(begin = var_1551_begin_0, end = var_1551_end_0, end_mask = var_1551_end_mask_0, x = value_heads_13_cast_fp16)[name = string("op_1551_cast_fp16")];
+            tensor<int32, [4]> var_1563_begin_0 = const()[name = string("op_1563_begin_0"), val = tensor<int32, [4]>([0, 4, 0, 0])];
+            tensor<int32, [4]> var_1563_end_0 = const()[name = string("op_1563_end_0"), val = tensor<int32, [4]>([1, 5, 128, 256])];
+            tensor<bool, [4]> var_1563_end_mask_0 = const()[name = string("op_1563_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_1563_cast_fp16 = slice_by_index(begin = var_1563_begin_0, end = var_1563_end_0, end_mask = var_1563_end_mask_0, x = key_heads_13_cast_fp16)[name = string("op_1563_cast_fp16")];
+            tensor<int32, [4]> var_1567_begin_0 = const()[name = string("op_1567_begin_0"), val = tensor<int32, [4]>([0, 4, 0, 0])];
+            tensor<int32, [4]> var_1567_end_0 = const()[name = string("op_1567_end_0"), val = tensor<int32, [4]>([1, 5, 128, 256])];
+            tensor<bool, [4]> var_1567_end_mask_0 = const()[name = string("op_1567_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_1567_cast_fp16 = slice_by_index(begin = var_1567_begin_0, end = var_1567_end_0, end_mask = var_1567_end_mask_0, x = value_heads_13_cast_fp16)[name = string("op_1567_cast_fp16")];
+            tensor<int32, [4]> var_1579_begin_0 = const()[name = string("op_1579_begin_0"), val = tensor<int32, [4]>([0, 5, 0, 0])];
+            tensor<int32, [4]> var_1579_end_0 = const()[name = string("op_1579_end_0"), val = tensor<int32, [4]>([1, 6, 128, 256])];
+            tensor<bool, [4]> var_1579_end_mask_0 = const()[name = string("op_1579_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_1579_cast_fp16 = slice_by_index(begin = var_1579_begin_0, end = var_1579_end_0, end_mask = var_1579_end_mask_0, x = key_heads_13_cast_fp16)[name = string("op_1579_cast_fp16")];
+            tensor<int32, [4]> var_1583_begin_0 = const()[name = string("op_1583_begin_0"), val = tensor<int32, [4]>([0, 5, 0, 0])];
+            tensor<int32, [4]> var_1583_end_0 = const()[name = string("op_1583_end_0"), val = tensor<int32, [4]>([1, 6, 128, 256])];
+            tensor<bool, [4]> var_1583_end_mask_0 = const()[name = string("op_1583_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_1583_cast_fp16 = slice_by_index(begin = var_1583_begin_0, end = var_1583_end_0, end_mask = var_1583_end_mask_0, x = value_heads_13_cast_fp16)[name = string("op_1583_cast_fp16")];
+            tensor<int32, [4]> var_1595_begin_0 = const()[name = string("op_1595_begin_0"), val = tensor<int32, [4]>([0, 6, 0, 0])];
+            tensor<int32, [4]> var_1595_end_0 = const()[name = string("op_1595_end_0"), val = tensor<int32, [4]>([1, 7, 128, 256])];
+            tensor<bool, [4]> var_1595_end_mask_0 = const()[name = string("op_1595_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_1595_cast_fp16 = slice_by_index(begin = var_1595_begin_0, end = var_1595_end_0, end_mask = var_1595_end_mask_0, x = key_heads_13_cast_fp16)[name = string("op_1595_cast_fp16")];
+            tensor<int32, [4]> var_1599_begin_0 = const()[name = string("op_1599_begin_0"), val = tensor<int32, [4]>([0, 6, 0, 0])];
+            tensor<int32, [4]> var_1599_end_0 = const()[name = string("op_1599_end_0"), val = tensor<int32, [4]>([1, 7, 128, 256])];
+            tensor<bool, [4]> var_1599_end_mask_0 = const()[name = string("op_1599_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_1599_cast_fp16 = slice_by_index(begin = var_1599_begin_0, end = var_1599_end_0, end_mask = var_1599_end_mask_0, x = value_heads_13_cast_fp16)[name = string("op_1599_cast_fp16")];
+            tensor<int32, [4]> var_1611_begin_0 = const()[name = string("op_1611_begin_0"), val = tensor<int32, [4]>([0, 7, 0, 0])];
+            tensor<int32, [4]> var_1611_end_0 = const()[name = string("op_1611_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_1611_end_mask_0 = const()[name = string("op_1611_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_1611_cast_fp16 = slice_by_index(begin = var_1611_begin_0, end = var_1611_end_0, end_mask = var_1611_end_mask_0, x = key_heads_13_cast_fp16)[name = string("op_1611_cast_fp16")];
+            tensor<int32, [4]> var_1615_begin_0 = const()[name = string("op_1615_begin_0"), val = tensor<int32, [4]>([0, 7, 0, 0])];
+            tensor<int32, [4]> var_1615_end_0 = const()[name = string("op_1615_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_1615_end_mask_0 = const()[name = string("op_1615_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_1615_cast_fp16 = slice_by_index(begin = var_1615_begin_0, end = var_1615_end_0, end_mask = var_1615_end_mask_0, x = value_heads_13_cast_fp16)[name = string("op_1615_cast_fp16")];
+            bool key_heads_15_interleave_0 = const()[name = string("key_heads_15_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 256]> key_heads_15_cast_fp16 = concat(axis = var_1341, interleave = key_heads_15_interleave_0, values = (var_1499_cast_fp16, var_1499_cast_fp16, var_1515_cast_fp16, var_1515_cast_fp16, var_1531_cast_fp16, var_1531_cast_fp16, var_1547_cast_fp16, var_1547_cast_fp16, var_1563_cast_fp16, var_1563_cast_fp16, var_1579_cast_fp16, var_1579_cast_fp16, var_1595_cast_fp16, var_1595_cast_fp16, var_1611_cast_fp16, var_1611_cast_fp16))[name = string("key_heads_15_cast_fp16")];
+            bool value_heads_15_interleave_0 = const()[name = string("value_heads_15_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 256]> value_heads_15_cast_fp16 = concat(axis = var_1341, interleave = value_heads_15_interleave_0, values = (var_1503_cast_fp16, var_1503_cast_fp16, var_1519_cast_fp16, var_1519_cast_fp16, var_1535_cast_fp16, var_1535_cast_fp16, var_1551_cast_fp16, var_1551_cast_fp16, var_1567_cast_fp16, var_1567_cast_fp16, var_1583_cast_fp16, var_1583_cast_fp16, var_1599_cast_fp16, var_1599_cast_fp16, var_1615_cast_fp16, var_1615_cast_fp16))[name = string("value_heads_15_cast_fp16")];
+            fp16 var_1638_to_fp16 = const()[name = string("op_1638_to_fp16"), val = fp16(0x1.6ap-4)];
+            tensor<fp16, [1, 16, 128, 1]> var_1639_cast_fp16 = mul(x = mh_q_21_cast_fp16, y = var_1638_to_fp16)[name = string("op_1639_cast_fp16")];
+            bool mh_w_13_transpose_x_0 = const()[name = string("mh_w_13_transpose_x_0"), val = bool(true)];
+            bool mh_w_13_transpose_y_0 = const()[name = string("mh_w_13_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 1, 256]> mh_w_13_cast_fp16 = matmul(transpose_x = mh_w_13_transpose_x_0, transpose_y = mh_w_13_transpose_y_0, x = var_1639_cast_fp16, y = key_heads_15_cast_fp16)[name = string("mh_w_13_cast_fp16")];
+            tensor<fp16, [1, 16, 1, 256]> mh_w_15_cast_fp16 = add(x = mh_w_13_cast_fp16, y = var_487_cast_fp16)[name = string("mh_w_15_cast_fp16")];
+            tensor<fp16, [1, 16, 1, 256]> var_1651_cast_fp16 = softmax(axis = var_1323, x = mh_w_15_cast_fp16)[name = string("op_1651_cast_fp16")];
+            bool attn_7_transpose_x_0 = const()[name = string("attn_7_transpose_x_0"), val = bool(false)];
+            bool attn_7_transpose_y_0 = const()[name = string("attn_7_transpose_y_0"), val = bool(true)];
+            tensor<fp16, [1, 16, 128, 1]> attn_7_cast_fp16 = matmul(transpose_x = attn_7_transpose_x_0, transpose_y = attn_7_transpose_y_0, x = value_heads_15_cast_fp16, y = var_1651_cast_fp16)[name = string("attn_7_cast_fp16")];
+            tensor<int32, [4]> var_1656 = const()[name = string("op_1656"), val = tensor<int32, [4]>([1, -1, 1, 1])];
+            tensor<fp16, [1, 2048, 1, 1]> input_25_cast_fp16 = reshape(shape = var_1656, x = attn_7_cast_fp16)[name = string("input_25_cast_fp16")];
+            string obj_35_pad_type_0 = const()[name = string("obj_35_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> obj_35_strides_0 = const()[name = string("obj_35_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> obj_35_pad_0 = const()[name = string("obj_35_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> obj_35_dilations_0 = const()[name = string("obj_35_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 obj_35_groups_0 = const()[name = string("obj_35_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 2048, 1, 1]> layers_3_self_attn_o_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(51486464))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(53583680))))[name = string("layers_3_self_attn_o_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> obj_35_cast_fp16 = conv(bias = layers_0_self_attn_v_proj_bias_to_fp16, dilations = obj_35_dilations_0, groups = obj_35_groups_0, pad = obj_35_pad_0, pad_type = obj_35_pad_type_0, strides = obj_35_strides_0, weight = layers_3_self_attn_o_proj_weight_to_fp16_palettized, x = input_25_cast_fp16)[name = string("obj_35_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> inputs_29_cast_fp16 = add(x = inputs_23_cast_fp16, y = obj_35_cast_fp16)[name = string("inputs_29_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> inputs_sq_31_cast_fp16 = mul(x = inputs_29_cast_fp16, y = inputs_29_cast_fp16)[name = string("inputs_sq_31_cast_fp16")];
+            tensor<int32, [1]> variance_31_axes_0 = const()[name = string("variance_31_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_31_keep_dims_0 = const()[name = string("variance_31_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 1, 1, 1]> variance_31_cast_fp16 = reduce_mean(axes = variance_31_axes_0, keep_dims = variance_31_keep_dims_0, x = inputs_sq_31_cast_fp16)[name = string("variance_31_cast_fp16")];
+            fp16 var_1674_to_fp16 = const()[name = string("op_1674_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 1, 1]> var_1675_cast_fp16 = add(x = variance_31_cast_fp16, y = var_1674_to_fp16)[name = string("op_1675_cast_fp16")];
+            fp32 var_1676_epsilon_0 = const()[name = string("op_1676_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 1, 1, 1]> var_1676_cast_fp16 = rsqrt(epsilon = var_1676_epsilon_0, x = var_1675_cast_fp16)[name = string("op_1676_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> hidden_states_37_cast_fp16 = mul(x = inputs_29_cast_fp16, y = var_1676_cast_fp16)[name = string("hidden_states_37_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> w_31_to_fp16 = const()[name = string("w_31_to_fp16"), val = tensor<fp16, [1, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(53584256)))];
+            tensor<fp16, [1, 1024, 1, 1]> input_27_cast_fp16 = mul(x = w_31_to_fp16, y = hidden_states_37_cast_fp16)[name = string("input_27_cast_fp16")];
+            string input_29_pad_type_0 = const()[name = string("input_29_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> input_29_strides_0 = const()[name = string("input_29_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> input_29_pad_0 = const()[name = string("input_29_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> input_29_dilations_0 = const()[name = string("input_29_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 input_29_groups_0 = const()[name = string("input_29_groups_0"), val = int32(1)];
+            tensor<fp16, [3072, 1024, 1, 1]> layers_3_mlp_gate_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [3072, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(53586368))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(56732160))))[name = string("layers_3_mlp_gate_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 3072, 1, 1]> input_29_cast_fp16 = conv(dilations = input_29_dilations_0, groups = input_29_groups_0, pad = input_29_pad_0, pad_type = input_29_pad_type_0, strides = input_29_strides_0, weight = layers_3_mlp_gate_proj_weight_to_fp16_palettized, x = input_27_cast_fp16)[name = string("input_29_cast_fp16")];
+            tensor<fp16, [1, 3072, 1, 1]> var_1690_cast_fp16 = silu(x = input_29_cast_fp16)[name = string("op_1690_cast_fp16")];
+            string var_1696_pad_type_0 = const()[name = string("op_1696_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_1696_strides_0 = const()[name = string("op_1696_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_1696_pad_0 = const()[name = string("op_1696_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_1696_dilations_0 = const()[name = string("op_1696_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_1696_groups_0 = const()[name = string("op_1696_groups_0"), val = int32(1)];
+            tensor<fp16, [3072, 1024, 1, 1]> layers_3_mlp_up_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [3072, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(56732736))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(59878528))))[name = string("layers_3_mlp_up_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 3072, 1, 1]> var_1696_cast_fp16 = conv(dilations = var_1696_dilations_0, groups = var_1696_groups_0, pad = var_1696_pad_0, pad_type = var_1696_pad_type_0, strides = var_1696_strides_0, weight = layers_3_mlp_up_proj_weight_to_fp16_palettized, x = input_27_cast_fp16)[name = string("op_1696_cast_fp16")];
+            tensor<fp16, [1, 3072, 1, 1]> input_31_cast_fp16 = mul(x = var_1690_cast_fp16, y = var_1696_cast_fp16)[name = string("input_31_cast_fp16")];
+            string hidden_states_39_pad_type_0 = const()[name = string("hidden_states_39_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> hidden_states_39_strides_0 = const()[name = string("hidden_states_39_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> hidden_states_39_pad_0 = const()[name = string("hidden_states_39_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> hidden_states_39_dilations_0 = const()[name = string("hidden_states_39_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 hidden_states_39_groups_0 = const()[name = string("hidden_states_39_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 3072, 1, 1]> layers_3_mlp_down_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 3072, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(59879104))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(63024896))))[name = string("layers_3_mlp_down_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> hidden_states_39_cast_fp16 = conv(dilations = hidden_states_39_dilations_0, groups = hidden_states_39_groups_0, pad = hidden_states_39_pad_0, pad_type = hidden_states_39_pad_type_0, strides = hidden_states_39_strides_0, weight = layers_3_mlp_down_proj_weight_to_fp16_palettized, x = input_31_cast_fp16)[name = string("hidden_states_39_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> inputs_31_cast_fp16 = add(x = inputs_29_cast_fp16, y = hidden_states_39_cast_fp16)[name = string("inputs_31_cast_fp16")];
+            int32 var_1710 = const()[name = string("op_1710"), val = int32(3)];
+            int32 var_1720 = const()[name = string("op_1720"), val = int32(-2)];
+            int32 var_1728 = const()[name = string("op_1728"), val = int32(1)];
+            tensor<fp16, [1, 1024, 1, 1]> inputs_sq_33_cast_fp16 = mul(x = inputs_31_cast_fp16, y = inputs_31_cast_fp16)[name = string("inputs_sq_33_cast_fp16")];
+            tensor<int32, [1]> variance_33_axes_0 = const()[name = string("variance_33_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_33_keep_dims_0 = const()[name = string("variance_33_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 1, 1, 1]> variance_33_cast_fp16 = reduce_mean(axes = variance_33_axes_0, keep_dims = variance_33_keep_dims_0, x = inputs_sq_33_cast_fp16)[name = string("variance_33_cast_fp16")];
+            fp16 var_1740_to_fp16 = const()[name = string("op_1740_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 1, 1]> var_1741_cast_fp16 = add(x = variance_33_cast_fp16, y = var_1740_to_fp16)[name = string("op_1741_cast_fp16")];
+            fp32 var_1742_epsilon_0 = const()[name = string("op_1742_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 1, 1, 1]> var_1742_cast_fp16 = rsqrt(epsilon = var_1742_epsilon_0, x = var_1741_cast_fp16)[name = string("op_1742_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> hidden_states_41_cast_fp16 = mul(x = inputs_31_cast_fp16, y = var_1742_cast_fp16)[name = string("hidden_states_41_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> w_33_to_fp16 = const()[name = string("w_33_to_fp16"), val = tensor<fp16, [1, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(63025472)))];
+            tensor<fp16, [1, 1024, 1, 1]> obj_37_cast_fp16 = mul(x = w_33_to_fp16, y = hidden_states_41_cast_fp16)[name = string("obj_37_cast_fp16")];
+            string query_25_pad_type_0 = const()[name = string("query_25_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> query_25_strides_0 = const()[name = string("query_25_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> query_25_pad_0 = const()[name = string("query_25_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> query_25_dilations_0 = const()[name = string("query_25_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 query_25_groups_0 = const()[name = string("query_25_groups_0"), val = int32(1)];
+            tensor<fp16, [2048, 1024, 1, 1]> layers_4_self_attn_q_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [2048, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(63027584))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(65124800))))[name = string("layers_4_self_attn_q_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1, 1]> query_25_cast_fp16 = conv(bias = layers_0_self_attn_q_proj_bias_to_fp16, dilations = query_25_dilations_0, groups = query_25_groups_0, pad = query_25_pad_0, pad_type = query_25_pad_type_0, strides = query_25_strides_0, weight = layers_4_self_attn_q_proj_weight_to_fp16_palettized, x = obj_37_cast_fp16)[name = string("query_25_cast_fp16")];
+            string current_key_17_pad_type_0 = const()[name = string("current_key_17_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> current_key_17_strides_0 = const()[name = string("current_key_17_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> current_key_17_pad_0 = const()[name = string("current_key_17_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> current_key_17_dilations_0 = const()[name = string("current_key_17_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 current_key_17_groups_0 = const()[name = string("current_key_17_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_4_self_attn_k_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(65125376))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(66174016))))[name = string("layers_4_self_attn_k_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> current_key_17_cast_fp16 = conv(dilations = current_key_17_dilations_0, groups = current_key_17_groups_0, pad = current_key_17_pad_0, pad_type = current_key_17_pad_type_0, strides = current_key_17_strides_0, weight = layers_4_self_attn_k_proj_weight_to_fp16_palettized, x = obj_37_cast_fp16)[name = string("current_key_17_cast_fp16")];
+            string current_value_9_pad_type_0 = const()[name = string("current_value_9_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> current_value_9_strides_0 = const()[name = string("current_value_9_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> current_value_9_pad_0 = const()[name = string("current_value_9_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> current_value_9_dilations_0 = const()[name = string("current_value_9_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 current_value_9_groups_0 = const()[name = string("current_value_9_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_4_self_attn_v_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(66174592))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(67223232))))[name = string("layers_4_self_attn_v_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> current_value_9_cast_fp16 = conv(bias = layers_0_self_attn_v_proj_bias_to_fp16, dilations = current_value_9_dilations_0, groups = current_value_9_groups_0, pad = current_value_9_pad_0, pad_type = current_value_9_pad_type_0, strides = current_value_9_strides_0, weight = layers_4_self_attn_v_proj_weight_to_fp16_palettized, x = obj_37_cast_fp16)[name = string("current_value_9_cast_fp16")];
+            tensor<int32, [4]> var_1779 = const()[name = string("op_1779"), val = tensor<int32, [4]>([16, 128, 1, 1])];
+            tensor<fp16, [16, 128, 1, 1]> inputs_33_cast_fp16 = reshape(shape = var_1779, x = query_25_cast_fp16)[name = string("inputs_33_cast_fp16")];
+            tensor<fp16, [16, 128, 1, 1]> inputs_sq_35_cast_fp16 = mul(x = inputs_33_cast_fp16, y = inputs_33_cast_fp16)[name = string("inputs_sq_35_cast_fp16")];
+            tensor<int32, [1]> variance_35_axes_0 = const()[name = string("variance_35_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_35_keep_dims_0 = const()[name = string("variance_35_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [16, 1, 1, 1]> variance_35_cast_fp16 = reduce_mean(axes = variance_35_axes_0, keep_dims = variance_35_keep_dims_0, x = inputs_sq_35_cast_fp16)[name = string("variance_35_cast_fp16")];
+            fp16 var_1785_to_fp16 = const()[name = string("op_1785_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [16, 1, 1, 1]> var_1786_cast_fp16 = add(x = variance_35_cast_fp16, y = var_1785_to_fp16)[name = string("op_1786_cast_fp16")];
+            fp32 var_1787_epsilon_0 = const()[name = string("op_1787_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [16, 1, 1, 1]> var_1787_cast_fp16 = rsqrt(epsilon = var_1787_epsilon_0, x = var_1786_cast_fp16)[name = string("op_1787_cast_fp16")];
+            tensor<fp16, [16, 128, 1, 1]> hidden_states_43_cast_fp16 = mul(x = inputs_33_cast_fp16, y = var_1787_cast_fp16)[name = string("hidden_states_43_cast_fp16")];
+            tensor<fp16, [1, 128, 1, 1]> w_35_to_fp16 = const()[name = string("w_35_to_fp16"), val = tensor<fp16, [1, 128, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(67223808)))];
+            tensor<fp16, [16, 128, 1, 1]> query_normed_9_cast_fp16 = mul(x = w_35_to_fp16, y = hidden_states_43_cast_fp16)[name = string("query_normed_9_cast_fp16")];
+            tensor<int32, [4]> var_1795 = const()[name = string("op_1795"), val = tensor<int32, [4]>([8, 128, 1, 1])];
+            tensor<fp16, [8, 128, 1, 1]> inputs_35_cast_fp16 = reshape(shape = var_1795, x = current_key_17_cast_fp16)[name = string("inputs_35_cast_fp16")];
+            tensor<fp16, [8, 128, 1, 1]> inputs_sq_37_cast_fp16 = mul(x = inputs_35_cast_fp16, y = inputs_35_cast_fp16)[name = string("inputs_sq_37_cast_fp16")];
+            tensor<int32, [1]> variance_37_axes_0 = const()[name = string("variance_37_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_37_keep_dims_0 = const()[name = string("variance_37_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [8, 1, 1, 1]> variance_37_cast_fp16 = reduce_mean(axes = variance_37_axes_0, keep_dims = variance_37_keep_dims_0, x = inputs_sq_37_cast_fp16)[name = string("variance_37_cast_fp16")];
+            fp16 var_1801_to_fp16 = const()[name = string("op_1801_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [8, 1, 1, 1]> var_1802_cast_fp16 = add(x = variance_37_cast_fp16, y = var_1801_to_fp16)[name = string("op_1802_cast_fp16")];
+            fp32 var_1803_epsilon_0 = const()[name = string("op_1803_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [8, 1, 1, 1]> var_1803_cast_fp16 = rsqrt(epsilon = var_1803_epsilon_0, x = var_1802_cast_fp16)[name = string("op_1803_cast_fp16")];
+            tensor<fp16, [8, 128, 1, 1]> hidden_states_45_cast_fp16 = mul(x = inputs_35_cast_fp16, y = var_1803_cast_fp16)[name = string("hidden_states_45_cast_fp16")];
+            tensor<fp16, [1, 128, 1, 1]> w_37_to_fp16 = const()[name = string("w_37_to_fp16"), val = tensor<fp16, [1, 128, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(67224128)))];
+            tensor<fp16, [8, 128, 1, 1]> current_key_normed_9_cast_fp16 = mul(x = w_37_to_fp16, y = hidden_states_45_cast_fp16)[name = string("current_key_normed_9_cast_fp16")];
+            tensor<int32, [4]> var_1821 = const()[name = string("op_1821"), val = tensor<int32, [4]>([1, 16, 128, -1])];
+            tensor<fp16, [1, 16, 128, 1]> mh_q_25_cast_fp16 = reshape(shape = var_1821, x = query_normed_9_cast_fp16)[name = string("mh_q_25_cast_fp16")];
+            tensor<int32, [4]> var_1823 = const()[name = string("op_1823"), val = tensor<int32, [4]>([1, 8, 128, -1])];
+            tensor<fp16, [1, 8, 128, 1]> mh_k_17_cast_fp16 = reshape(shape = var_1823, x = current_key_normed_9_cast_fp16)[name = string("mh_k_17_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> var_1827_cast_fp16 = mul(x = mh_q_25_cast_fp16, y = cos_1_cast_fp16)[name = string("op_1827_cast_fp16")];
+            tensor<int32, [4]> var_1832_begin_0 = const()[name = string("op_1832_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_1832_end_0 = const()[name = string("op_1832_end_0"), val = tensor<int32, [4]>([1, 16, 64, 1])];
+            tensor<bool, [4]> var_1832_end_mask_0 = const()[name = string("op_1832_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 16, 64, 1]> var_1832_cast_fp16 = slice_by_index(begin = var_1832_begin_0, end = var_1832_end_0, end_mask = var_1832_end_mask_0, x = mh_q_25_cast_fp16)[name = string("op_1832_cast_fp16")];
+            tensor<int32, [4]> var_1838_begin_0 = const()[name = string("op_1838_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_1838_end_0 = const()[name = string("op_1838_end_0"), val = tensor<int32, [4]>([1, 16, 128, 1])];
+            tensor<bool, [4]> var_1838_end_mask_0 = const()[name = string("op_1838_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 16, 64, 1]> var_1838_cast_fp16 = slice_by_index(begin = var_1838_begin_0, end = var_1838_end_0, end_mask = var_1838_end_mask_0, x = mh_q_25_cast_fp16)[name = string("op_1838_cast_fp16")];
+            fp16 const_109_promoted_to_fp16 = const()[name = string("const_109_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 16, 64, 1]> var_1840_cast_fp16 = mul(x = var_1838_cast_fp16, y = const_109_promoted_to_fp16)[name = string("op_1840_cast_fp16")];
+            bool var_1842_interleave_0 = const()[name = string("op_1842_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 1]> var_1842_cast_fp16 = concat(axis = var_1720, interleave = var_1842_interleave_0, values = (var_1840_cast_fp16, var_1832_cast_fp16))[name = string("op_1842_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> var_1843_cast_fp16 = mul(x = var_1842_cast_fp16, y = sin_1_cast_fp16)[name = string("op_1843_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> mh_q_27_cast_fp16 = add(x = var_1827_cast_fp16, y = var_1843_cast_fp16)[name = string("mh_q_27_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> var_1845_cast_fp16 = mul(x = mh_k_17_cast_fp16, y = cos_1_cast_fp16)[name = string("op_1845_cast_fp16")];
+            tensor<int32, [4]> var_1850_begin_0 = const()[name = string("op_1850_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_1850_end_0 = const()[name = string("op_1850_end_0"), val = tensor<int32, [4]>([1, 8, 64, 1])];
+            tensor<bool, [4]> var_1850_end_mask_0 = const()[name = string("op_1850_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 8, 64, 1]> var_1850_cast_fp16 = slice_by_index(begin = var_1850_begin_0, end = var_1850_end_0, end_mask = var_1850_end_mask_0, x = mh_k_17_cast_fp16)[name = string("op_1850_cast_fp16")];
+            tensor<int32, [4]> var_1856_begin_0 = const()[name = string("op_1856_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_1856_end_0 = const()[name = string("op_1856_end_0"), val = tensor<int32, [4]>([1, 8, 128, 1])];
+            tensor<bool, [4]> var_1856_end_mask_0 = const()[name = string("op_1856_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 8, 64, 1]> var_1856_cast_fp16 = slice_by_index(begin = var_1856_begin_0, end = var_1856_end_0, end_mask = var_1856_end_mask_0, x = mh_k_17_cast_fp16)[name = string("op_1856_cast_fp16")];
+            fp16 const_112_promoted_to_fp16 = const()[name = string("const_112_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 64, 1]> var_1858_cast_fp16 = mul(x = var_1856_cast_fp16, y = const_112_promoted_to_fp16)[name = string("op_1858_cast_fp16")];
+            bool var_1860_interleave_0 = const()[name = string("op_1860_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 128, 1]> var_1860_cast_fp16 = concat(axis = var_1720, interleave = var_1860_interleave_0, values = (var_1858_cast_fp16, var_1850_cast_fp16))[name = string("op_1860_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> var_1861_cast_fp16 = mul(x = var_1860_cast_fp16, y = sin_1_cast_fp16)[name = string("op_1861_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> mh_k_19_cast_fp16 = add(x = var_1845_cast_fp16, y = var_1861_cast_fp16)[name = string("mh_k_19_cast_fp16")];
+            tensor<int32, [4]> var_1865 = const()[name = string("op_1865"), val = tensor<int32, [4]>([1, 1024, 1, 1])];
+            tensor<fp16, [1, 1024, 1, 1]> current_key_19_cast_fp16 = reshape(shape = var_1865, x = mh_k_19_cast_fp16)[name = string("current_key_19_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_1872_cast_fp16 = mul(x = var_101_cast_fp16_4, y = var_323_cast_fp16)[name = string("op_1872_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_1873_cast_fp16 = mul(x = current_key_19_cast_fp16, y = var_321_cast_fp16)[name = string("op_1873_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> key_27_cast_fp16 = add(x = var_1872_cast_fp16, y = var_1873_cast_fp16)[name = string("key_27_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_1876_cast_fp16 = mul(x = var_132_cast_fp16_4, y = var_323_cast_fp16)[name = string("op_1876_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_1877_cast_fp16 = mul(x = current_value_9_cast_fp16, y = var_321_cast_fp16)[name = string("op_1877_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> value_17_cast_fp16 = add(x = var_1876_cast_fp16, y = var_1877_cast_fp16)[name = string("value_17_cast_fp16")];
+            tensor<int32, [4]> var_1881 = const()[name = string("op_1881"), val = tensor<int32, [4]>([1, 8, 128, 256])];
+            tensor<fp16, [1, 8, 128, 256]> key_heads_17_cast_fp16 = reshape(shape = var_1881, x = key_27_cast_fp16)[name = string("key_heads_17_cast_fp16")];
+            tensor<int32, [4]> var_1883 = const()[name = string("op_1883"), val = tensor<int32, [4]>([1, 8, 128, 256])];
+            tensor<fp16, [1, 8, 128, 256]> value_heads_17_cast_fp16 = reshape(shape = var_1883, x = value_17_cast_fp16)[name = string("value_heads_17_cast_fp16")];
+            tensor<int32, [4]> var_1886_begin_0 = const()[name = string("op_1886_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_1886_end_0 = const()[name = string("op_1886_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_1886_end_mask_0 = const()[name = string("op_1886_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_1886_cast_fp16 = slice_by_index(begin = var_1886_begin_0, end = var_1886_end_0, end_mask = var_1886_end_mask_0, x = key_heads_17_cast_fp16)[name = string("op_1886_cast_fp16")];
+            tensor<int32, [4]> var_1890_begin_0 = const()[name = string("op_1890_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_1890_end_0 = const()[name = string("op_1890_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_1890_end_mask_0 = const()[name = string("op_1890_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_1890_cast_fp16 = slice_by_index(begin = var_1890_begin_0, end = var_1890_end_0, end_mask = var_1890_end_mask_0, x = value_heads_17_cast_fp16)[name = string("op_1890_cast_fp16")];
+            tensor<int32, [4]> var_1902_begin_0 = const()[name = string("op_1902_begin_0"), val = tensor<int32, [4]>([0, 1, 0, 0])];
+            tensor<int32, [4]> var_1902_end_0 = const()[name = string("op_1902_end_0"), val = tensor<int32, [4]>([1, 2, 128, 256])];
+            tensor<bool, [4]> var_1902_end_mask_0 = const()[name = string("op_1902_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_1902_cast_fp16 = slice_by_index(begin = var_1902_begin_0, end = var_1902_end_0, end_mask = var_1902_end_mask_0, x = key_heads_17_cast_fp16)[name = string("op_1902_cast_fp16")];
+            tensor<int32, [4]> var_1906_begin_0 = const()[name = string("op_1906_begin_0"), val = tensor<int32, [4]>([0, 1, 0, 0])];
+            tensor<int32, [4]> var_1906_end_0 = const()[name = string("op_1906_end_0"), val = tensor<int32, [4]>([1, 2, 128, 256])];
+            tensor<bool, [4]> var_1906_end_mask_0 = const()[name = string("op_1906_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_1906_cast_fp16 = slice_by_index(begin = var_1906_begin_0, end = var_1906_end_0, end_mask = var_1906_end_mask_0, x = value_heads_17_cast_fp16)[name = string("op_1906_cast_fp16")];
+            tensor<int32, [4]> var_1918_begin_0 = const()[name = string("op_1918_begin_0"), val = tensor<int32, [4]>([0, 2, 0, 0])];
+            tensor<int32, [4]> var_1918_end_0 = const()[name = string("op_1918_end_0"), val = tensor<int32, [4]>([1, 3, 128, 256])];
+            tensor<bool, [4]> var_1918_end_mask_0 = const()[name = string("op_1918_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_1918_cast_fp16 = slice_by_index(begin = var_1918_begin_0, end = var_1918_end_0, end_mask = var_1918_end_mask_0, x = key_heads_17_cast_fp16)[name = string("op_1918_cast_fp16")];
+            tensor<int32, [4]> var_1922_begin_0 = const()[name = string("op_1922_begin_0"), val = tensor<int32, [4]>([0, 2, 0, 0])];
+            tensor<int32, [4]> var_1922_end_0 = const()[name = string("op_1922_end_0"), val = tensor<int32, [4]>([1, 3, 128, 256])];
+            tensor<bool, [4]> var_1922_end_mask_0 = const()[name = string("op_1922_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_1922_cast_fp16 = slice_by_index(begin = var_1922_begin_0, end = var_1922_end_0, end_mask = var_1922_end_mask_0, x = value_heads_17_cast_fp16)[name = string("op_1922_cast_fp16")];
+            tensor<int32, [4]> var_1934_begin_0 = const()[name = string("op_1934_begin_0"), val = tensor<int32, [4]>([0, 3, 0, 0])];
+            tensor<int32, [4]> var_1934_end_0 = const()[name = string("op_1934_end_0"), val = tensor<int32, [4]>([1, 4, 128, 256])];
+            tensor<bool, [4]> var_1934_end_mask_0 = const()[name = string("op_1934_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_1934_cast_fp16 = slice_by_index(begin = var_1934_begin_0, end = var_1934_end_0, end_mask = var_1934_end_mask_0, x = key_heads_17_cast_fp16)[name = string("op_1934_cast_fp16")];
+            tensor<int32, [4]> var_1938_begin_0 = const()[name = string("op_1938_begin_0"), val = tensor<int32, [4]>([0, 3, 0, 0])];
+            tensor<int32, [4]> var_1938_end_0 = const()[name = string("op_1938_end_0"), val = tensor<int32, [4]>([1, 4, 128, 256])];
+            tensor<bool, [4]> var_1938_end_mask_0 = const()[name = string("op_1938_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_1938_cast_fp16 = slice_by_index(begin = var_1938_begin_0, end = var_1938_end_0, end_mask = var_1938_end_mask_0, x = value_heads_17_cast_fp16)[name = string("op_1938_cast_fp16")];
+            tensor<int32, [4]> var_1950_begin_0 = const()[name = string("op_1950_begin_0"), val = tensor<int32, [4]>([0, 4, 0, 0])];
+            tensor<int32, [4]> var_1950_end_0 = const()[name = string("op_1950_end_0"), val = tensor<int32, [4]>([1, 5, 128, 256])];
+            tensor<bool, [4]> var_1950_end_mask_0 = const()[name = string("op_1950_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_1950_cast_fp16 = slice_by_index(begin = var_1950_begin_0, end = var_1950_end_0, end_mask = var_1950_end_mask_0, x = key_heads_17_cast_fp16)[name = string("op_1950_cast_fp16")];
+            tensor<int32, [4]> var_1954_begin_0 = const()[name = string("op_1954_begin_0"), val = tensor<int32, [4]>([0, 4, 0, 0])];
+            tensor<int32, [4]> var_1954_end_0 = const()[name = string("op_1954_end_0"), val = tensor<int32, [4]>([1, 5, 128, 256])];
+            tensor<bool, [4]> var_1954_end_mask_0 = const()[name = string("op_1954_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_1954_cast_fp16 = slice_by_index(begin = var_1954_begin_0, end = var_1954_end_0, end_mask = var_1954_end_mask_0, x = value_heads_17_cast_fp16)[name = string("op_1954_cast_fp16")];
+            tensor<int32, [4]> var_1966_begin_0 = const()[name = string("op_1966_begin_0"), val = tensor<int32, [4]>([0, 5, 0, 0])];
+            tensor<int32, [4]> var_1966_end_0 = const()[name = string("op_1966_end_0"), val = tensor<int32, [4]>([1, 6, 128, 256])];
+            tensor<bool, [4]> var_1966_end_mask_0 = const()[name = string("op_1966_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_1966_cast_fp16 = slice_by_index(begin = var_1966_begin_0, end = var_1966_end_0, end_mask = var_1966_end_mask_0, x = key_heads_17_cast_fp16)[name = string("op_1966_cast_fp16")];
+            tensor<int32, [4]> var_1970_begin_0 = const()[name = string("op_1970_begin_0"), val = tensor<int32, [4]>([0, 5, 0, 0])];
+            tensor<int32, [4]> var_1970_end_0 = const()[name = string("op_1970_end_0"), val = tensor<int32, [4]>([1, 6, 128, 256])];
+            tensor<bool, [4]> var_1970_end_mask_0 = const()[name = string("op_1970_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_1970_cast_fp16 = slice_by_index(begin = var_1970_begin_0, end = var_1970_end_0, end_mask = var_1970_end_mask_0, x = value_heads_17_cast_fp16)[name = string("op_1970_cast_fp16")];
+            tensor<int32, [4]> var_1982_begin_0 = const()[name = string("op_1982_begin_0"), val = tensor<int32, [4]>([0, 6, 0, 0])];
+            tensor<int32, [4]> var_1982_end_0 = const()[name = string("op_1982_end_0"), val = tensor<int32, [4]>([1, 7, 128, 256])];
+            tensor<bool, [4]> var_1982_end_mask_0 = const()[name = string("op_1982_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_1982_cast_fp16 = slice_by_index(begin = var_1982_begin_0, end = var_1982_end_0, end_mask = var_1982_end_mask_0, x = key_heads_17_cast_fp16)[name = string("op_1982_cast_fp16")];
+            tensor<int32, [4]> var_1986_begin_0 = const()[name = string("op_1986_begin_0"), val = tensor<int32, [4]>([0, 6, 0, 0])];
+            tensor<int32, [4]> var_1986_end_0 = const()[name = string("op_1986_end_0"), val = tensor<int32, [4]>([1, 7, 128, 256])];
+            tensor<bool, [4]> var_1986_end_mask_0 = const()[name = string("op_1986_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_1986_cast_fp16 = slice_by_index(begin = var_1986_begin_0, end = var_1986_end_0, end_mask = var_1986_end_mask_0, x = value_heads_17_cast_fp16)[name = string("op_1986_cast_fp16")];
+            tensor<int32, [4]> var_1998_begin_0 = const()[name = string("op_1998_begin_0"), val = tensor<int32, [4]>([0, 7, 0, 0])];
+            tensor<int32, [4]> var_1998_end_0 = const()[name = string("op_1998_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_1998_end_mask_0 = const()[name = string("op_1998_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_1998_cast_fp16 = slice_by_index(begin = var_1998_begin_0, end = var_1998_end_0, end_mask = var_1998_end_mask_0, x = key_heads_17_cast_fp16)[name = string("op_1998_cast_fp16")];
+            tensor<int32, [4]> var_2002_begin_0 = const()[name = string("op_2002_begin_0"), val = tensor<int32, [4]>([0, 7, 0, 0])];
+            tensor<int32, [4]> var_2002_end_0 = const()[name = string("op_2002_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_2002_end_mask_0 = const()[name = string("op_2002_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_2002_cast_fp16 = slice_by_index(begin = var_2002_begin_0, end = var_2002_end_0, end_mask = var_2002_end_mask_0, x = value_heads_17_cast_fp16)[name = string("op_2002_cast_fp16")];
+            bool key_heads_19_interleave_0 = const()[name = string("key_heads_19_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 256]> key_heads_19_cast_fp16 = concat(axis = var_1728, interleave = key_heads_19_interleave_0, values = (var_1886_cast_fp16, var_1886_cast_fp16, var_1902_cast_fp16, var_1902_cast_fp16, var_1918_cast_fp16, var_1918_cast_fp16, var_1934_cast_fp16, var_1934_cast_fp16, var_1950_cast_fp16, var_1950_cast_fp16, var_1966_cast_fp16, var_1966_cast_fp16, var_1982_cast_fp16, var_1982_cast_fp16, var_1998_cast_fp16, var_1998_cast_fp16))[name = string("key_heads_19_cast_fp16")];
+            bool value_heads_19_interleave_0 = const()[name = string("value_heads_19_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 256]> value_heads_19_cast_fp16 = concat(axis = var_1728, interleave = value_heads_19_interleave_0, values = (var_1890_cast_fp16, var_1890_cast_fp16, var_1906_cast_fp16, var_1906_cast_fp16, var_1922_cast_fp16, var_1922_cast_fp16, var_1938_cast_fp16, var_1938_cast_fp16, var_1954_cast_fp16, var_1954_cast_fp16, var_1970_cast_fp16, var_1970_cast_fp16, var_1986_cast_fp16, var_1986_cast_fp16, var_2002_cast_fp16, var_2002_cast_fp16))[name = string("value_heads_19_cast_fp16")];
+            fp16 var_2025_to_fp16 = const()[name = string("op_2025_to_fp16"), val = fp16(0x1.6ap-4)];
+            tensor<fp16, [1, 16, 128, 1]> var_2026_cast_fp16 = mul(x = mh_q_27_cast_fp16, y = var_2025_to_fp16)[name = string("op_2026_cast_fp16")];
+            bool mh_w_17_transpose_x_0 = const()[name = string("mh_w_17_transpose_x_0"), val = bool(true)];
+            bool mh_w_17_transpose_y_0 = const()[name = string("mh_w_17_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 1, 256]> mh_w_17_cast_fp16 = matmul(transpose_x = mh_w_17_transpose_x_0, transpose_y = mh_w_17_transpose_y_0, x = var_2026_cast_fp16, y = key_heads_19_cast_fp16)[name = string("mh_w_17_cast_fp16")];
+            tensor<fp16, [1, 16, 1, 256]> mh_w_19_cast_fp16 = add(x = mh_w_17_cast_fp16, y = var_487_cast_fp16)[name = string("mh_w_19_cast_fp16")];
+            tensor<fp16, [1, 16, 1, 256]> var_2038_cast_fp16 = softmax(axis = var_1710, x = mh_w_19_cast_fp16)[name = string("op_2038_cast_fp16")];
+            bool attn_9_transpose_x_0 = const()[name = string("attn_9_transpose_x_0"), val = bool(false)];
+            bool attn_9_transpose_y_0 = const()[name = string("attn_9_transpose_y_0"), val = bool(true)];
+            tensor<fp16, [1, 16, 128, 1]> attn_9_cast_fp16 = matmul(transpose_x = attn_9_transpose_x_0, transpose_y = attn_9_transpose_y_0, x = value_heads_19_cast_fp16, y = var_2038_cast_fp16)[name = string("attn_9_cast_fp16")];
+            tensor<int32, [4]> var_2043 = const()[name = string("op_2043"), val = tensor<int32, [4]>([1, -1, 1, 1])];
+            tensor<fp16, [1, 2048, 1, 1]> input_33_cast_fp16 = reshape(shape = var_2043, x = attn_9_cast_fp16)[name = string("input_33_cast_fp16")];
+            string obj_43_pad_type_0 = const()[name = string("obj_43_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> obj_43_strides_0 = const()[name = string("obj_43_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> obj_43_pad_0 = const()[name = string("obj_43_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> obj_43_dilations_0 = const()[name = string("obj_43_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 obj_43_groups_0 = const()[name = string("obj_43_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 2048, 1, 1]> layers_4_self_attn_o_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(67224448))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(69321664))))[name = string("layers_4_self_attn_o_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> obj_43_cast_fp16 = conv(bias = layers_0_self_attn_v_proj_bias_to_fp16, dilations = obj_43_dilations_0, groups = obj_43_groups_0, pad = obj_43_pad_0, pad_type = obj_43_pad_type_0, strides = obj_43_strides_0, weight = layers_4_self_attn_o_proj_weight_to_fp16_palettized, x = input_33_cast_fp16)[name = string("obj_43_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> inputs_37_cast_fp16 = add(x = inputs_31_cast_fp16, y = obj_43_cast_fp16)[name = string("inputs_37_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> inputs_sq_39_cast_fp16 = mul(x = inputs_37_cast_fp16, y = inputs_37_cast_fp16)[name = string("inputs_sq_39_cast_fp16")];
+            tensor<int32, [1]> variance_39_axes_0 = const()[name = string("variance_39_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_39_keep_dims_0 = const()[name = string("variance_39_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 1, 1, 1]> variance_39_cast_fp16 = reduce_mean(axes = variance_39_axes_0, keep_dims = variance_39_keep_dims_0, x = inputs_sq_39_cast_fp16)[name = string("variance_39_cast_fp16")];
+            fp16 var_2061_to_fp16 = const()[name = string("op_2061_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 1, 1]> var_2062_cast_fp16 = add(x = variance_39_cast_fp16, y = var_2061_to_fp16)[name = string("op_2062_cast_fp16")];
+            fp32 var_2063_epsilon_0 = const()[name = string("op_2063_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 1, 1, 1]> var_2063_cast_fp16 = rsqrt(epsilon = var_2063_epsilon_0, x = var_2062_cast_fp16)[name = string("op_2063_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> hidden_states_47_cast_fp16 = mul(x = inputs_37_cast_fp16, y = var_2063_cast_fp16)[name = string("hidden_states_47_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> w_39_to_fp16 = const()[name = string("w_39_to_fp16"), val = tensor<fp16, [1, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(69322240)))];
+            tensor<fp16, [1, 1024, 1, 1]> input_35_cast_fp16 = mul(x = w_39_to_fp16, y = hidden_states_47_cast_fp16)[name = string("input_35_cast_fp16")];
+            string input_37_pad_type_0 = const()[name = string("input_37_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> input_37_strides_0 = const()[name = string("input_37_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> input_37_pad_0 = const()[name = string("input_37_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> input_37_dilations_0 = const()[name = string("input_37_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 input_37_groups_0 = const()[name = string("input_37_groups_0"), val = int32(1)];
+            tensor<fp16, [3072, 1024, 1, 1]> layers_4_mlp_gate_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [3072, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(69324352))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(72470144))))[name = string("layers_4_mlp_gate_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 3072, 1, 1]> input_37_cast_fp16 = conv(dilations = input_37_dilations_0, groups = input_37_groups_0, pad = input_37_pad_0, pad_type = input_37_pad_type_0, strides = input_37_strides_0, weight = layers_4_mlp_gate_proj_weight_to_fp16_palettized, x = input_35_cast_fp16)[name = string("input_37_cast_fp16")];
+            tensor<fp16, [1, 3072, 1, 1]> var_2077_cast_fp16 = silu(x = input_37_cast_fp16)[name = string("op_2077_cast_fp16")];
+            string var_2083_pad_type_0 = const()[name = string("op_2083_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_2083_strides_0 = const()[name = string("op_2083_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_2083_pad_0 = const()[name = string("op_2083_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_2083_dilations_0 = const()[name = string("op_2083_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_2083_groups_0 = const()[name = string("op_2083_groups_0"), val = int32(1)];
+            tensor<fp16, [3072, 1024, 1, 1]> layers_4_mlp_up_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [3072, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(72470720))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(75616512))))[name = string("layers_4_mlp_up_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 3072, 1, 1]> var_2083_cast_fp16 = conv(dilations = var_2083_dilations_0, groups = var_2083_groups_0, pad = var_2083_pad_0, pad_type = var_2083_pad_type_0, strides = var_2083_strides_0, weight = layers_4_mlp_up_proj_weight_to_fp16_palettized, x = input_35_cast_fp16)[name = string("op_2083_cast_fp16")];
+            tensor<fp16, [1, 3072, 1, 1]> input_39_cast_fp16 = mul(x = var_2077_cast_fp16, y = var_2083_cast_fp16)[name = string("input_39_cast_fp16")];
+            string hidden_states_49_pad_type_0 = const()[name = string("hidden_states_49_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> hidden_states_49_strides_0 = const()[name = string("hidden_states_49_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> hidden_states_49_pad_0 = const()[name = string("hidden_states_49_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> hidden_states_49_dilations_0 = const()[name = string("hidden_states_49_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 hidden_states_49_groups_0 = const()[name = string("hidden_states_49_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 3072, 1, 1]> layers_4_mlp_down_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 3072, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(75617088))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(78762880))))[name = string("layers_4_mlp_down_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> hidden_states_49_cast_fp16 = conv(dilations = hidden_states_49_dilations_0, groups = hidden_states_49_groups_0, pad = hidden_states_49_pad_0, pad_type = hidden_states_49_pad_type_0, strides = hidden_states_49_strides_0, weight = layers_4_mlp_down_proj_weight_to_fp16_palettized, x = input_39_cast_fp16)[name = string("hidden_states_49_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> inputs_39_cast_fp16 = add(x = inputs_37_cast_fp16, y = hidden_states_49_cast_fp16)[name = string("inputs_39_cast_fp16")];
+            int32 var_2097 = const()[name = string("op_2097"), val = int32(3)];
+            int32 var_2107 = const()[name = string("op_2107"), val = int32(-2)];
+            int32 var_2115 = const()[name = string("op_2115"), val = int32(1)];
+            tensor<fp16, [1, 1024, 1, 1]> inputs_sq_41_cast_fp16 = mul(x = inputs_39_cast_fp16, y = inputs_39_cast_fp16)[name = string("inputs_sq_41_cast_fp16")];
+            tensor<int32, [1]> variance_41_axes_0 = const()[name = string("variance_41_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_41_keep_dims_0 = const()[name = string("variance_41_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 1, 1, 1]> variance_41_cast_fp16 = reduce_mean(axes = variance_41_axes_0, keep_dims = variance_41_keep_dims_0, x = inputs_sq_41_cast_fp16)[name = string("variance_41_cast_fp16")];
+            fp16 var_2127_to_fp16 = const()[name = string("op_2127_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 1, 1]> var_2128_cast_fp16 = add(x = variance_41_cast_fp16, y = var_2127_to_fp16)[name = string("op_2128_cast_fp16")];
+            fp32 var_2129_epsilon_0 = const()[name = string("op_2129_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 1, 1, 1]> var_2129_cast_fp16 = rsqrt(epsilon = var_2129_epsilon_0, x = var_2128_cast_fp16)[name = string("op_2129_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> hidden_states_51_cast_fp16 = mul(x = inputs_39_cast_fp16, y = var_2129_cast_fp16)[name = string("hidden_states_51_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> w_41_to_fp16 = const()[name = string("w_41_to_fp16"), val = tensor<fp16, [1, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(78763456)))];
+            tensor<fp16, [1, 1024, 1, 1]> obj_45_cast_fp16 = mul(x = w_41_to_fp16, y = hidden_states_51_cast_fp16)[name = string("obj_45_cast_fp16")];
+            string query_31_pad_type_0 = const()[name = string("query_31_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> query_31_strides_0 = const()[name = string("query_31_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> query_31_pad_0 = const()[name = string("query_31_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> query_31_dilations_0 = const()[name = string("query_31_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 query_31_groups_0 = const()[name = string("query_31_groups_0"), val = int32(1)];
+            tensor<fp16, [2048, 1024, 1, 1]> layers_5_self_attn_q_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [2048, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(78765568))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(80862784))))[name = string("layers_5_self_attn_q_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1, 1]> query_31_cast_fp16 = conv(bias = layers_0_self_attn_q_proj_bias_to_fp16, dilations = query_31_dilations_0, groups = query_31_groups_0, pad = query_31_pad_0, pad_type = query_31_pad_type_0, strides = query_31_strides_0, weight = layers_5_self_attn_q_proj_weight_to_fp16_palettized, x = obj_45_cast_fp16)[name = string("query_31_cast_fp16")];
+            string current_key_21_pad_type_0 = const()[name = string("current_key_21_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> current_key_21_strides_0 = const()[name = string("current_key_21_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> current_key_21_pad_0 = const()[name = string("current_key_21_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> current_key_21_dilations_0 = const()[name = string("current_key_21_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 current_key_21_groups_0 = const()[name = string("current_key_21_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_5_self_attn_k_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(80863360))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(81912000))))[name = string("layers_5_self_attn_k_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> current_key_21_cast_fp16 = conv(dilations = current_key_21_dilations_0, groups = current_key_21_groups_0, pad = current_key_21_pad_0, pad_type = current_key_21_pad_type_0, strides = current_key_21_strides_0, weight = layers_5_self_attn_k_proj_weight_to_fp16_palettized, x = obj_45_cast_fp16)[name = string("current_key_21_cast_fp16")];
+            string current_value_11_pad_type_0 = const()[name = string("current_value_11_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> current_value_11_strides_0 = const()[name = string("current_value_11_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> current_value_11_pad_0 = const()[name = string("current_value_11_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> current_value_11_dilations_0 = const()[name = string("current_value_11_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 current_value_11_groups_0 = const()[name = string("current_value_11_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_5_self_attn_v_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(81912576))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(82961216))))[name = string("layers_5_self_attn_v_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> current_value_11_cast_fp16 = conv(bias = layers_0_self_attn_v_proj_bias_to_fp16, dilations = current_value_11_dilations_0, groups = current_value_11_groups_0, pad = current_value_11_pad_0, pad_type = current_value_11_pad_type_0, strides = current_value_11_strides_0, weight = layers_5_self_attn_v_proj_weight_to_fp16_palettized, x = obj_45_cast_fp16)[name = string("current_value_11_cast_fp16")];
+            tensor<int32, [4]> var_2166 = const()[name = string("op_2166"), val = tensor<int32, [4]>([16, 128, 1, 1])];
+            tensor<fp16, [16, 128, 1, 1]> inputs_41_cast_fp16 = reshape(shape = var_2166, x = query_31_cast_fp16)[name = string("inputs_41_cast_fp16")];
+            tensor<fp16, [16, 128, 1, 1]> inputs_sq_43_cast_fp16 = mul(x = inputs_41_cast_fp16, y = inputs_41_cast_fp16)[name = string("inputs_sq_43_cast_fp16")];
+            tensor<int32, [1]> variance_43_axes_0 = const()[name = string("variance_43_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_43_keep_dims_0 = const()[name = string("variance_43_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [16, 1, 1, 1]> variance_43_cast_fp16 = reduce_mean(axes = variance_43_axes_0, keep_dims = variance_43_keep_dims_0, x = inputs_sq_43_cast_fp16)[name = string("variance_43_cast_fp16")];
+            fp16 var_2172_to_fp16 = const()[name = string("op_2172_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [16, 1, 1, 1]> var_2173_cast_fp16 = add(x = variance_43_cast_fp16, y = var_2172_to_fp16)[name = string("op_2173_cast_fp16")];
+            fp32 var_2174_epsilon_0 = const()[name = string("op_2174_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [16, 1, 1, 1]> var_2174_cast_fp16 = rsqrt(epsilon = var_2174_epsilon_0, x = var_2173_cast_fp16)[name = string("op_2174_cast_fp16")];
+            tensor<fp16, [16, 128, 1, 1]> hidden_states_53_cast_fp16 = mul(x = inputs_41_cast_fp16, y = var_2174_cast_fp16)[name = string("hidden_states_53_cast_fp16")];
+            tensor<fp16, [1, 128, 1, 1]> w_43_to_fp16 = const()[name = string("w_43_to_fp16"), val = tensor<fp16, [1, 128, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(82961792)))];
+            tensor<fp16, [16, 128, 1, 1]> query_normed_11_cast_fp16 = mul(x = w_43_to_fp16, y = hidden_states_53_cast_fp16)[name = string("query_normed_11_cast_fp16")];
+            tensor<int32, [4]> var_2182 = const()[name = string("op_2182"), val = tensor<int32, [4]>([8, 128, 1, 1])];
+            tensor<fp16, [8, 128, 1, 1]> inputs_43_cast_fp16 = reshape(shape = var_2182, x = current_key_21_cast_fp16)[name = string("inputs_43_cast_fp16")];
+            tensor<fp16, [8, 128, 1, 1]> inputs_sq_45_cast_fp16 = mul(x = inputs_43_cast_fp16, y = inputs_43_cast_fp16)[name = string("inputs_sq_45_cast_fp16")];
+            tensor<int32, [1]> variance_45_axes_0 = const()[name = string("variance_45_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_45_keep_dims_0 = const()[name = string("variance_45_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [8, 1, 1, 1]> variance_45_cast_fp16 = reduce_mean(axes = variance_45_axes_0, keep_dims = variance_45_keep_dims_0, x = inputs_sq_45_cast_fp16)[name = string("variance_45_cast_fp16")];
+            fp16 var_2188_to_fp16 = const()[name = string("op_2188_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [8, 1, 1, 1]> var_2189_cast_fp16 = add(x = variance_45_cast_fp16, y = var_2188_to_fp16)[name = string("op_2189_cast_fp16")];
+            fp32 var_2190_epsilon_0 = const()[name = string("op_2190_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [8, 1, 1, 1]> var_2190_cast_fp16 = rsqrt(epsilon = var_2190_epsilon_0, x = var_2189_cast_fp16)[name = string("op_2190_cast_fp16")];
+            tensor<fp16, [8, 128, 1, 1]> hidden_states_55_cast_fp16 = mul(x = inputs_43_cast_fp16, y = var_2190_cast_fp16)[name = string("hidden_states_55_cast_fp16")];
+            tensor<fp16, [1, 128, 1, 1]> w_45_to_fp16 = const()[name = string("w_45_to_fp16"), val = tensor<fp16, [1, 128, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(82962112)))];
+            tensor<fp16, [8, 128, 1, 1]> current_key_normed_11_cast_fp16 = mul(x = w_45_to_fp16, y = hidden_states_55_cast_fp16)[name = string("current_key_normed_11_cast_fp16")];
+            tensor<int32, [4]> var_2208 = const()[name = string("op_2208"), val = tensor<int32, [4]>([1, 16, 128, -1])];
+            tensor<fp16, [1, 16, 128, 1]> mh_q_31_cast_fp16 = reshape(shape = var_2208, x = query_normed_11_cast_fp16)[name = string("mh_q_31_cast_fp16")];
+            tensor<int32, [4]> var_2210 = const()[name = string("op_2210"), val = tensor<int32, [4]>([1, 8, 128, -1])];
+            tensor<fp16, [1, 8, 128, 1]> mh_k_21_cast_fp16 = reshape(shape = var_2210, x = current_key_normed_11_cast_fp16)[name = string("mh_k_21_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> var_2214_cast_fp16 = mul(x = mh_q_31_cast_fp16, y = cos_1_cast_fp16)[name = string("op_2214_cast_fp16")];
+            tensor<int32, [4]> var_2219_begin_0 = const()[name = string("op_2219_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_2219_end_0 = const()[name = string("op_2219_end_0"), val = tensor<int32, [4]>([1, 16, 64, 1])];
+            tensor<bool, [4]> var_2219_end_mask_0 = const()[name = string("op_2219_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 16, 64, 1]> var_2219_cast_fp16 = slice_by_index(begin = var_2219_begin_0, end = var_2219_end_0, end_mask = var_2219_end_mask_0, x = mh_q_31_cast_fp16)[name = string("op_2219_cast_fp16")];
+            tensor<int32, [4]> var_2225_begin_0 = const()[name = string("op_2225_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_2225_end_0 = const()[name = string("op_2225_end_0"), val = tensor<int32, [4]>([1, 16, 128, 1])];
+            tensor<bool, [4]> var_2225_end_mask_0 = const()[name = string("op_2225_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 16, 64, 1]> var_2225_cast_fp16 = slice_by_index(begin = var_2225_begin_0, end = var_2225_end_0, end_mask = var_2225_end_mask_0, x = mh_q_31_cast_fp16)[name = string("op_2225_cast_fp16")];
+            fp16 const_132_promoted_to_fp16 = const()[name = string("const_132_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 16, 64, 1]> var_2227_cast_fp16 = mul(x = var_2225_cast_fp16, y = const_132_promoted_to_fp16)[name = string("op_2227_cast_fp16")];
+            bool var_2229_interleave_0 = const()[name = string("op_2229_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 1]> var_2229_cast_fp16 = concat(axis = var_2107, interleave = var_2229_interleave_0, values = (var_2227_cast_fp16, var_2219_cast_fp16))[name = string("op_2229_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> var_2230_cast_fp16 = mul(x = var_2229_cast_fp16, y = sin_1_cast_fp16)[name = string("op_2230_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> mh_q_33_cast_fp16 = add(x = var_2214_cast_fp16, y = var_2230_cast_fp16)[name = string("mh_q_33_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> var_2232_cast_fp16 = mul(x = mh_k_21_cast_fp16, y = cos_1_cast_fp16)[name = string("op_2232_cast_fp16")];
+            tensor<int32, [4]> var_2237_begin_0 = const()[name = string("op_2237_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_2237_end_0 = const()[name = string("op_2237_end_0"), val = tensor<int32, [4]>([1, 8, 64, 1])];
+            tensor<bool, [4]> var_2237_end_mask_0 = const()[name = string("op_2237_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 8, 64, 1]> var_2237_cast_fp16 = slice_by_index(begin = var_2237_begin_0, end = var_2237_end_0, end_mask = var_2237_end_mask_0, x = mh_k_21_cast_fp16)[name = string("op_2237_cast_fp16")];
+            tensor<int32, [4]> var_2243_begin_0 = const()[name = string("op_2243_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_2243_end_0 = const()[name = string("op_2243_end_0"), val = tensor<int32, [4]>([1, 8, 128, 1])];
+            tensor<bool, [4]> var_2243_end_mask_0 = const()[name = string("op_2243_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 8, 64, 1]> var_2243_cast_fp16 = slice_by_index(begin = var_2243_begin_0, end = var_2243_end_0, end_mask = var_2243_end_mask_0, x = mh_k_21_cast_fp16)[name = string("op_2243_cast_fp16")];
+            fp16 const_135_promoted_to_fp16 = const()[name = string("const_135_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 64, 1]> var_2245_cast_fp16 = mul(x = var_2243_cast_fp16, y = const_135_promoted_to_fp16)[name = string("op_2245_cast_fp16")];
+            bool var_2247_interleave_0 = const()[name = string("op_2247_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 128, 1]> var_2247_cast_fp16 = concat(axis = var_2107, interleave = var_2247_interleave_0, values = (var_2245_cast_fp16, var_2237_cast_fp16))[name = string("op_2247_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> var_2248_cast_fp16 = mul(x = var_2247_cast_fp16, y = sin_1_cast_fp16)[name = string("op_2248_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> mh_k_23_cast_fp16 = add(x = var_2232_cast_fp16, y = var_2248_cast_fp16)[name = string("mh_k_23_cast_fp16")];
+            tensor<int32, [4]> var_2252 = const()[name = string("op_2252"), val = tensor<int32, [4]>([1, 1024, 1, 1])];
+            tensor<fp16, [1, 1024, 1, 1]> current_key_23_cast_fp16 = reshape(shape = var_2252, x = mh_k_23_cast_fp16)[name = string("current_key_23_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_2259_cast_fp16 = mul(x = var_101_cast_fp16_5, y = var_323_cast_fp16)[name = string("op_2259_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_2260_cast_fp16 = mul(x = current_key_23_cast_fp16, y = var_321_cast_fp16)[name = string("op_2260_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> key_33_cast_fp16 = add(x = var_2259_cast_fp16, y = var_2260_cast_fp16)[name = string("key_33_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_2263_cast_fp16 = mul(x = var_132_cast_fp16_5, y = var_323_cast_fp16)[name = string("op_2263_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_2264_cast_fp16 = mul(x = current_value_11_cast_fp16, y = var_321_cast_fp16)[name = string("op_2264_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> value_21_cast_fp16 = add(x = var_2263_cast_fp16, y = var_2264_cast_fp16)[name = string("value_21_cast_fp16")];
+            tensor<int32, [4]> var_2268 = const()[name = string("op_2268"), val = tensor<int32, [4]>([1, 8, 128, 256])];
+            tensor<fp16, [1, 8, 128, 256]> key_heads_21_cast_fp16 = reshape(shape = var_2268, x = key_33_cast_fp16)[name = string("key_heads_21_cast_fp16")];
+            tensor<int32, [4]> var_2270 = const()[name = string("op_2270"), val = tensor<int32, [4]>([1, 8, 128, 256])];
+            tensor<fp16, [1, 8, 128, 256]> value_heads_21_cast_fp16 = reshape(shape = var_2270, x = value_21_cast_fp16)[name = string("value_heads_21_cast_fp16")];
+            tensor<int32, [4]> var_2273_begin_0 = const()[name = string("op_2273_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_2273_end_0 = const()[name = string("op_2273_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_2273_end_mask_0 = const()[name = string("op_2273_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_2273_cast_fp16 = slice_by_index(begin = var_2273_begin_0, end = var_2273_end_0, end_mask = var_2273_end_mask_0, x = key_heads_21_cast_fp16)[name = string("op_2273_cast_fp16")];
+            tensor<int32, [4]> var_2277_begin_0 = const()[name = string("op_2277_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_2277_end_0 = const()[name = string("op_2277_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_2277_end_mask_0 = const()[name = string("op_2277_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_2277_cast_fp16 = slice_by_index(begin = var_2277_begin_0, end = var_2277_end_0, end_mask = var_2277_end_mask_0, x = value_heads_21_cast_fp16)[name = string("op_2277_cast_fp16")];
+            tensor<int32, [4]> var_2289_begin_0 = const()[name = string("op_2289_begin_0"), val = tensor<int32, [4]>([0, 1, 0, 0])];
+            tensor<int32, [4]> var_2289_end_0 = const()[name = string("op_2289_end_0"), val = tensor<int32, [4]>([1, 2, 128, 256])];
+            tensor<bool, [4]> var_2289_end_mask_0 = const()[name = string("op_2289_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_2289_cast_fp16 = slice_by_index(begin = var_2289_begin_0, end = var_2289_end_0, end_mask = var_2289_end_mask_0, x = key_heads_21_cast_fp16)[name = string("op_2289_cast_fp16")];
+            tensor<int32, [4]> var_2293_begin_0 = const()[name = string("op_2293_begin_0"), val = tensor<int32, [4]>([0, 1, 0, 0])];
+            tensor<int32, [4]> var_2293_end_0 = const()[name = string("op_2293_end_0"), val = tensor<int32, [4]>([1, 2, 128, 256])];
+            tensor<bool, [4]> var_2293_end_mask_0 = const()[name = string("op_2293_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_2293_cast_fp16 = slice_by_index(begin = var_2293_begin_0, end = var_2293_end_0, end_mask = var_2293_end_mask_0, x = value_heads_21_cast_fp16)[name = string("op_2293_cast_fp16")];
+            tensor<int32, [4]> var_2305_begin_0 = const()[name = string("op_2305_begin_0"), val = tensor<int32, [4]>([0, 2, 0, 0])];
+            tensor<int32, [4]> var_2305_end_0 = const()[name = string("op_2305_end_0"), val = tensor<int32, [4]>([1, 3, 128, 256])];
+            tensor<bool, [4]> var_2305_end_mask_0 = const()[name = string("op_2305_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_2305_cast_fp16 = slice_by_index(begin = var_2305_begin_0, end = var_2305_end_0, end_mask = var_2305_end_mask_0, x = key_heads_21_cast_fp16)[name = string("op_2305_cast_fp16")];
+            tensor<int32, [4]> var_2309_begin_0 = const()[name = string("op_2309_begin_0"), val = tensor<int32, [4]>([0, 2, 0, 0])];
+            tensor<int32, [4]> var_2309_end_0 = const()[name = string("op_2309_end_0"), val = tensor<int32, [4]>([1, 3, 128, 256])];
+            tensor<bool, [4]> var_2309_end_mask_0 = const()[name = string("op_2309_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_2309_cast_fp16 = slice_by_index(begin = var_2309_begin_0, end = var_2309_end_0, end_mask = var_2309_end_mask_0, x = value_heads_21_cast_fp16)[name = string("op_2309_cast_fp16")];
+            tensor<int32, [4]> var_2321_begin_0 = const()[name = string("op_2321_begin_0"), val = tensor<int32, [4]>([0, 3, 0, 0])];
+            tensor<int32, [4]> var_2321_end_0 = const()[name = string("op_2321_end_0"), val = tensor<int32, [4]>([1, 4, 128, 256])];
+            tensor<bool, [4]> var_2321_end_mask_0 = const()[name = string("op_2321_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_2321_cast_fp16 = slice_by_index(begin = var_2321_begin_0, end = var_2321_end_0, end_mask = var_2321_end_mask_0, x = key_heads_21_cast_fp16)[name = string("op_2321_cast_fp16")];
+            tensor<int32, [4]> var_2325_begin_0 = const()[name = string("op_2325_begin_0"), val = tensor<int32, [4]>([0, 3, 0, 0])];
+            tensor<int32, [4]> var_2325_end_0 = const()[name = string("op_2325_end_0"), val = tensor<int32, [4]>([1, 4, 128, 256])];
+            tensor<bool, [4]> var_2325_end_mask_0 = const()[name = string("op_2325_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_2325_cast_fp16 = slice_by_index(begin = var_2325_begin_0, end = var_2325_end_0, end_mask = var_2325_end_mask_0, x = value_heads_21_cast_fp16)[name = string("op_2325_cast_fp16")];
+            tensor<int32, [4]> var_2337_begin_0 = const()[name = string("op_2337_begin_0"), val = tensor<int32, [4]>([0, 4, 0, 0])];
+            tensor<int32, [4]> var_2337_end_0 = const()[name = string("op_2337_end_0"), val = tensor<int32, [4]>([1, 5, 128, 256])];
+            tensor<bool, [4]> var_2337_end_mask_0 = const()[name = string("op_2337_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_2337_cast_fp16 = slice_by_index(begin = var_2337_begin_0, end = var_2337_end_0, end_mask = var_2337_end_mask_0, x = key_heads_21_cast_fp16)[name = string("op_2337_cast_fp16")];
+            tensor<int32, [4]> var_2341_begin_0 = const()[name = string("op_2341_begin_0"), val = tensor<int32, [4]>([0, 4, 0, 0])];
+            tensor<int32, [4]> var_2341_end_0 = const()[name = string("op_2341_end_0"), val = tensor<int32, [4]>([1, 5, 128, 256])];
+            tensor<bool, [4]> var_2341_end_mask_0 = const()[name = string("op_2341_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_2341_cast_fp16 = slice_by_index(begin = var_2341_begin_0, end = var_2341_end_0, end_mask = var_2341_end_mask_0, x = value_heads_21_cast_fp16)[name = string("op_2341_cast_fp16")];
+            tensor<int32, [4]> var_2353_begin_0 = const()[name = string("op_2353_begin_0"), val = tensor<int32, [4]>([0, 5, 0, 0])];
+            tensor<int32, [4]> var_2353_end_0 = const()[name = string("op_2353_end_0"), val = tensor<int32, [4]>([1, 6, 128, 256])];
+            tensor<bool, [4]> var_2353_end_mask_0 = const()[name = string("op_2353_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_2353_cast_fp16 = slice_by_index(begin = var_2353_begin_0, end = var_2353_end_0, end_mask = var_2353_end_mask_0, x = key_heads_21_cast_fp16)[name = string("op_2353_cast_fp16")];
+            tensor<int32, [4]> var_2357_begin_0 = const()[name = string("op_2357_begin_0"), val = tensor<int32, [4]>([0, 5, 0, 0])];
+            tensor<int32, [4]> var_2357_end_0 = const()[name = string("op_2357_end_0"), val = tensor<int32, [4]>([1, 6, 128, 256])];
+            tensor<bool, [4]> var_2357_end_mask_0 = const()[name = string("op_2357_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_2357_cast_fp16 = slice_by_index(begin = var_2357_begin_0, end = var_2357_end_0, end_mask = var_2357_end_mask_0, x = value_heads_21_cast_fp16)[name = string("op_2357_cast_fp16")];
+            tensor<int32, [4]> var_2369_begin_0 = const()[name = string("op_2369_begin_0"), val = tensor<int32, [4]>([0, 6, 0, 0])];
+            tensor<int32, [4]> var_2369_end_0 = const()[name = string("op_2369_end_0"), val = tensor<int32, [4]>([1, 7, 128, 256])];
+            tensor<bool, [4]> var_2369_end_mask_0 = const()[name = string("op_2369_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_2369_cast_fp16 = slice_by_index(begin = var_2369_begin_0, end = var_2369_end_0, end_mask = var_2369_end_mask_0, x = key_heads_21_cast_fp16)[name = string("op_2369_cast_fp16")];
+            tensor<int32, [4]> var_2373_begin_0 = const()[name = string("op_2373_begin_0"), val = tensor<int32, [4]>([0, 6, 0, 0])];
+            tensor<int32, [4]> var_2373_end_0 = const()[name = string("op_2373_end_0"), val = tensor<int32, [4]>([1, 7, 128, 256])];
+            tensor<bool, [4]> var_2373_end_mask_0 = const()[name = string("op_2373_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_2373_cast_fp16 = slice_by_index(begin = var_2373_begin_0, end = var_2373_end_0, end_mask = var_2373_end_mask_0, x = value_heads_21_cast_fp16)[name = string("op_2373_cast_fp16")];
+            tensor<int32, [4]> var_2385_begin_0 = const()[name = string("op_2385_begin_0"), val = tensor<int32, [4]>([0, 7, 0, 0])];
+            tensor<int32, [4]> var_2385_end_0 = const()[name = string("op_2385_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_2385_end_mask_0 = const()[name = string("op_2385_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_2385_cast_fp16 = slice_by_index(begin = var_2385_begin_0, end = var_2385_end_0, end_mask = var_2385_end_mask_0, x = key_heads_21_cast_fp16)[name = string("op_2385_cast_fp16")];
+            tensor<int32, [4]> var_2389_begin_0 = const()[name = string("op_2389_begin_0"), val = tensor<int32, [4]>([0, 7, 0, 0])];
+            tensor<int32, [4]> var_2389_end_0 = const()[name = string("op_2389_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_2389_end_mask_0 = const()[name = string("op_2389_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_2389_cast_fp16 = slice_by_index(begin = var_2389_begin_0, end = var_2389_end_0, end_mask = var_2389_end_mask_0, x = value_heads_21_cast_fp16)[name = string("op_2389_cast_fp16")];
+            bool key_heads_23_interleave_0 = const()[name = string("key_heads_23_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 256]> key_heads_23_cast_fp16 = concat(axis = var_2115, interleave = key_heads_23_interleave_0, values = (var_2273_cast_fp16, var_2273_cast_fp16, var_2289_cast_fp16, var_2289_cast_fp16, var_2305_cast_fp16, var_2305_cast_fp16, var_2321_cast_fp16, var_2321_cast_fp16, var_2337_cast_fp16, var_2337_cast_fp16, var_2353_cast_fp16, var_2353_cast_fp16, var_2369_cast_fp16, var_2369_cast_fp16, var_2385_cast_fp16, var_2385_cast_fp16))[name = string("key_heads_23_cast_fp16")];
+            bool value_heads_23_interleave_0 = const()[name = string("value_heads_23_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 256]> value_heads_23_cast_fp16 = concat(axis = var_2115, interleave = value_heads_23_interleave_0, values = (var_2277_cast_fp16, var_2277_cast_fp16, var_2293_cast_fp16, var_2293_cast_fp16, var_2309_cast_fp16, var_2309_cast_fp16, var_2325_cast_fp16, var_2325_cast_fp16, var_2341_cast_fp16, var_2341_cast_fp16, var_2357_cast_fp16, var_2357_cast_fp16, var_2373_cast_fp16, var_2373_cast_fp16, var_2389_cast_fp16, var_2389_cast_fp16))[name = string("value_heads_23_cast_fp16")];
+            fp16 var_2412_to_fp16 = const()[name = string("op_2412_to_fp16"), val = fp16(0x1.6ap-4)];
+            tensor<fp16, [1, 16, 128, 1]> var_2413_cast_fp16 = mul(x = mh_q_33_cast_fp16, y = var_2412_to_fp16)[name = string("op_2413_cast_fp16")];
+            bool mh_w_21_transpose_x_0 = const()[name = string("mh_w_21_transpose_x_0"), val = bool(true)];
+            bool mh_w_21_transpose_y_0 = const()[name = string("mh_w_21_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 1, 256]> mh_w_21_cast_fp16 = matmul(transpose_x = mh_w_21_transpose_x_0, transpose_y = mh_w_21_transpose_y_0, x = var_2413_cast_fp16, y = key_heads_23_cast_fp16)[name = string("mh_w_21_cast_fp16")];
+            tensor<fp16, [1, 16, 1, 256]> mh_w_23_cast_fp16 = add(x = mh_w_21_cast_fp16, y = var_487_cast_fp16)[name = string("mh_w_23_cast_fp16")];
+            tensor<fp16, [1, 16, 1, 256]> var_2425_cast_fp16 = softmax(axis = var_2097, x = mh_w_23_cast_fp16)[name = string("op_2425_cast_fp16")];
+            bool attn_11_transpose_x_0 = const()[name = string("attn_11_transpose_x_0"), val = bool(false)];
+            bool attn_11_transpose_y_0 = const()[name = string("attn_11_transpose_y_0"), val = bool(true)];
+            tensor<fp16, [1, 16, 128, 1]> attn_11_cast_fp16 = matmul(transpose_x = attn_11_transpose_x_0, transpose_y = attn_11_transpose_y_0, x = value_heads_23_cast_fp16, y = var_2425_cast_fp16)[name = string("attn_11_cast_fp16")];
+            tensor<int32, [4]> var_2430 = const()[name = string("op_2430"), val = tensor<int32, [4]>([1, -1, 1, 1])];
+            tensor<fp16, [1, 2048, 1, 1]> input_41_cast_fp16 = reshape(shape = var_2430, x = attn_11_cast_fp16)[name = string("input_41_cast_fp16")];
+            string obj_51_pad_type_0 = const()[name = string("obj_51_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> obj_51_strides_0 = const()[name = string("obj_51_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> obj_51_pad_0 = const()[name = string("obj_51_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> obj_51_dilations_0 = const()[name = string("obj_51_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 obj_51_groups_0 = const()[name = string("obj_51_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 2048, 1, 1]> layers_5_self_attn_o_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(82962432))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(85059648))))[name = string("layers_5_self_attn_o_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> obj_51_cast_fp16 = conv(bias = layers_0_self_attn_v_proj_bias_to_fp16, dilations = obj_51_dilations_0, groups = obj_51_groups_0, pad = obj_51_pad_0, pad_type = obj_51_pad_type_0, strides = obj_51_strides_0, weight = layers_5_self_attn_o_proj_weight_to_fp16_palettized, x = input_41_cast_fp16)[name = string("obj_51_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> inputs_45_cast_fp16 = add(x = inputs_39_cast_fp16, y = obj_51_cast_fp16)[name = string("inputs_45_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> inputs_sq_47_cast_fp16 = mul(x = inputs_45_cast_fp16, y = inputs_45_cast_fp16)[name = string("inputs_sq_47_cast_fp16")];
+            tensor<int32, [1]> variance_47_axes_0 = const()[name = string("variance_47_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_47_keep_dims_0 = const()[name = string("variance_47_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 1, 1, 1]> variance_47_cast_fp16 = reduce_mean(axes = variance_47_axes_0, keep_dims = variance_47_keep_dims_0, x = inputs_sq_47_cast_fp16)[name = string("variance_47_cast_fp16")];
+            fp16 var_2448_to_fp16 = const()[name = string("op_2448_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 1, 1]> var_2449_cast_fp16 = add(x = variance_47_cast_fp16, y = var_2448_to_fp16)[name = string("op_2449_cast_fp16")];
+            fp32 var_2450_epsilon_0 = const()[name = string("op_2450_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 1, 1, 1]> var_2450_cast_fp16 = rsqrt(epsilon = var_2450_epsilon_0, x = var_2449_cast_fp16)[name = string("op_2450_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> hidden_states_57_cast_fp16 = mul(x = inputs_45_cast_fp16, y = var_2450_cast_fp16)[name = string("hidden_states_57_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> w_47_to_fp16 = const()[name = string("w_47_to_fp16"), val = tensor<fp16, [1, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(85060224)))];
+            tensor<fp16, [1, 1024, 1, 1]> input_43_cast_fp16 = mul(x = w_47_to_fp16, y = hidden_states_57_cast_fp16)[name = string("input_43_cast_fp16")];
+            string input_45_pad_type_0 = const()[name = string("input_45_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> input_45_strides_0 = const()[name = string("input_45_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> input_45_pad_0 = const()[name = string("input_45_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> input_45_dilations_0 = const()[name = string("input_45_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 input_45_groups_0 = const()[name = string("input_45_groups_0"), val = int32(1)];
+            tensor<fp16, [3072, 1024, 1, 1]> layers_5_mlp_gate_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [3072, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(85062336))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(88208128))))[name = string("layers_5_mlp_gate_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 3072, 1, 1]> input_45_cast_fp16 = conv(dilations = input_45_dilations_0, groups = input_45_groups_0, pad = input_45_pad_0, pad_type = input_45_pad_type_0, strides = input_45_strides_0, weight = layers_5_mlp_gate_proj_weight_to_fp16_palettized, x = input_43_cast_fp16)[name = string("input_45_cast_fp16")];
+            tensor<fp16, [1, 3072, 1, 1]> var_2464_cast_fp16 = silu(x = input_45_cast_fp16)[name = string("op_2464_cast_fp16")];
+            string var_2470_pad_type_0 = const()[name = string("op_2470_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_2470_strides_0 = const()[name = string("op_2470_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_2470_pad_0 = const()[name = string("op_2470_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_2470_dilations_0 = const()[name = string("op_2470_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_2470_groups_0 = const()[name = string("op_2470_groups_0"), val = int32(1)];
+            tensor<fp16, [3072, 1024, 1, 1]> layers_5_mlp_up_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [3072, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(88208704))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(91354496))))[name = string("layers_5_mlp_up_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 3072, 1, 1]> var_2470_cast_fp16 = conv(dilations = var_2470_dilations_0, groups = var_2470_groups_0, pad = var_2470_pad_0, pad_type = var_2470_pad_type_0, strides = var_2470_strides_0, weight = layers_5_mlp_up_proj_weight_to_fp16_palettized, x = input_43_cast_fp16)[name = string("op_2470_cast_fp16")];
+            tensor<fp16, [1, 3072, 1, 1]> input_47_cast_fp16 = mul(x = var_2464_cast_fp16, y = var_2470_cast_fp16)[name = string("input_47_cast_fp16")];
+            string hidden_states_59_pad_type_0 = const()[name = string("hidden_states_59_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> hidden_states_59_strides_0 = const()[name = string("hidden_states_59_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> hidden_states_59_pad_0 = const()[name = string("hidden_states_59_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> hidden_states_59_dilations_0 = const()[name = string("hidden_states_59_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 hidden_states_59_groups_0 = const()[name = string("hidden_states_59_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 3072, 1, 1]> layers_5_mlp_down_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 3072, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(91355072))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(94500864))))[name = string("layers_5_mlp_down_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> hidden_states_59_cast_fp16 = conv(dilations = hidden_states_59_dilations_0, groups = hidden_states_59_groups_0, pad = hidden_states_59_pad_0, pad_type = hidden_states_59_pad_type_0, strides = hidden_states_59_strides_0, weight = layers_5_mlp_down_proj_weight_to_fp16_palettized, x = input_47_cast_fp16)[name = string("hidden_states_59_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> inputs_47_cast_fp16 = add(x = inputs_45_cast_fp16, y = hidden_states_59_cast_fp16)[name = string("inputs_47_cast_fp16")];
+            int32 var_2484 = const()[name = string("op_2484"), val = int32(3)];
+            int32 var_2494 = const()[name = string("op_2494"), val = int32(-2)];
+            int32 var_2502 = const()[name = string("op_2502"), val = int32(1)];
+            tensor<fp16, [1, 1024, 1, 1]> inputs_sq_49_cast_fp16 = mul(x = inputs_47_cast_fp16, y = inputs_47_cast_fp16)[name = string("inputs_sq_49_cast_fp16")];
+            tensor<int32, [1]> variance_49_axes_0 = const()[name = string("variance_49_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_49_keep_dims_0 = const()[name = string("variance_49_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 1, 1, 1]> variance_49_cast_fp16 = reduce_mean(axes = variance_49_axes_0, keep_dims = variance_49_keep_dims_0, x = inputs_sq_49_cast_fp16)[name = string("variance_49_cast_fp16")];
+            fp16 var_2514_to_fp16 = const()[name = string("op_2514_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 1, 1]> var_2515_cast_fp16 = add(x = variance_49_cast_fp16, y = var_2514_to_fp16)[name = string("op_2515_cast_fp16")];
+            fp32 var_2516_epsilon_0 = const()[name = string("op_2516_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 1, 1, 1]> var_2516_cast_fp16 = rsqrt(epsilon = var_2516_epsilon_0, x = var_2515_cast_fp16)[name = string("op_2516_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> hidden_states_61_cast_fp16 = mul(x = inputs_47_cast_fp16, y = var_2516_cast_fp16)[name = string("hidden_states_61_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> w_49_to_fp16 = const()[name = string("w_49_to_fp16"), val = tensor<fp16, [1, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(94501440)))];
+            tensor<fp16, [1, 1024, 1, 1]> obj_53_cast_fp16 = mul(x = w_49_to_fp16, y = hidden_states_61_cast_fp16)[name = string("obj_53_cast_fp16")];
+            string query_37_pad_type_0 = const()[name = string("query_37_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> query_37_strides_0 = const()[name = string("query_37_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> query_37_pad_0 = const()[name = string("query_37_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> query_37_dilations_0 = const()[name = string("query_37_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 query_37_groups_0 = const()[name = string("query_37_groups_0"), val = int32(1)];
+            tensor<fp16, [2048, 1024, 1, 1]> layers_6_self_attn_q_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [2048, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(94503552))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(96600768))))[name = string("layers_6_self_attn_q_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1, 1]> query_37_cast_fp16 = conv(bias = layers_0_self_attn_q_proj_bias_to_fp16, dilations = query_37_dilations_0, groups = query_37_groups_0, pad = query_37_pad_0, pad_type = query_37_pad_type_0, strides = query_37_strides_0, weight = layers_6_self_attn_q_proj_weight_to_fp16_palettized, x = obj_53_cast_fp16)[name = string("query_37_cast_fp16")];
+            string current_key_25_pad_type_0 = const()[name = string("current_key_25_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> current_key_25_strides_0 = const()[name = string("current_key_25_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> current_key_25_pad_0 = const()[name = string("current_key_25_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> current_key_25_dilations_0 = const()[name = string("current_key_25_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 current_key_25_groups_0 = const()[name = string("current_key_25_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_6_self_attn_k_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(96601344))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(97649984))))[name = string("layers_6_self_attn_k_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> current_key_25_cast_fp16 = conv(dilations = current_key_25_dilations_0, groups = current_key_25_groups_0, pad = current_key_25_pad_0, pad_type = current_key_25_pad_type_0, strides = current_key_25_strides_0, weight = layers_6_self_attn_k_proj_weight_to_fp16_palettized, x = obj_53_cast_fp16)[name = string("current_key_25_cast_fp16")];
+            string current_value_13_pad_type_0 = const()[name = string("current_value_13_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> current_value_13_strides_0 = const()[name = string("current_value_13_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> current_value_13_pad_0 = const()[name = string("current_value_13_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> current_value_13_dilations_0 = const()[name = string("current_value_13_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 current_value_13_groups_0 = const()[name = string("current_value_13_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_6_self_attn_v_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(97650560))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(98699200))))[name = string("layers_6_self_attn_v_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> current_value_13_cast_fp16 = conv(bias = layers_0_self_attn_v_proj_bias_to_fp16, dilations = current_value_13_dilations_0, groups = current_value_13_groups_0, pad = current_value_13_pad_0, pad_type = current_value_13_pad_type_0, strides = current_value_13_strides_0, weight = layers_6_self_attn_v_proj_weight_to_fp16_palettized, x = obj_53_cast_fp16)[name = string("current_value_13_cast_fp16")];
+            tensor<int32, [4]> var_2553 = const()[name = string("op_2553"), val = tensor<int32, [4]>([16, 128, 1, 1])];
+            tensor<fp16, [16, 128, 1, 1]> inputs_49_cast_fp16 = reshape(shape = var_2553, x = query_37_cast_fp16)[name = string("inputs_49_cast_fp16")];
+            tensor<fp16, [16, 128, 1, 1]> inputs_sq_51_cast_fp16 = mul(x = inputs_49_cast_fp16, y = inputs_49_cast_fp16)[name = string("inputs_sq_51_cast_fp16")];
+            tensor<int32, [1]> variance_51_axes_0 = const()[name = string("variance_51_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_51_keep_dims_0 = const()[name = string("variance_51_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [16, 1, 1, 1]> variance_51_cast_fp16 = reduce_mean(axes = variance_51_axes_0, keep_dims = variance_51_keep_dims_0, x = inputs_sq_51_cast_fp16)[name = string("variance_51_cast_fp16")];
+            fp16 var_2559_to_fp16 = const()[name = string("op_2559_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [16, 1, 1, 1]> var_2560_cast_fp16 = add(x = variance_51_cast_fp16, y = var_2559_to_fp16)[name = string("op_2560_cast_fp16")];
+            fp32 var_2561_epsilon_0 = const()[name = string("op_2561_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [16, 1, 1, 1]> var_2561_cast_fp16 = rsqrt(epsilon = var_2561_epsilon_0, x = var_2560_cast_fp16)[name = string("op_2561_cast_fp16")];
+            tensor<fp16, [16, 128, 1, 1]> hidden_states_63_cast_fp16 = mul(x = inputs_49_cast_fp16, y = var_2561_cast_fp16)[name = string("hidden_states_63_cast_fp16")];
+            tensor<fp16, [1, 128, 1, 1]> w_51_to_fp16 = const()[name = string("w_51_to_fp16"), val = tensor<fp16, [1, 128, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(98699776)))];
+            tensor<fp16, [16, 128, 1, 1]> query_normed_13_cast_fp16 = mul(x = w_51_to_fp16, y = hidden_states_63_cast_fp16)[name = string("query_normed_13_cast_fp16")];
+            tensor<int32, [4]> var_2569 = const()[name = string("op_2569"), val = tensor<int32, [4]>([8, 128, 1, 1])];
+            tensor<fp16, [8, 128, 1, 1]> inputs_51_cast_fp16 = reshape(shape = var_2569, x = current_key_25_cast_fp16)[name = string("inputs_51_cast_fp16")];
+            tensor<fp16, [8, 128, 1, 1]> inputs_sq_53_cast_fp16 = mul(x = inputs_51_cast_fp16, y = inputs_51_cast_fp16)[name = string("inputs_sq_53_cast_fp16")];
+            tensor<int32, [1]> variance_53_axes_0 = const()[name = string("variance_53_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_53_keep_dims_0 = const()[name = string("variance_53_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [8, 1, 1, 1]> variance_53_cast_fp16 = reduce_mean(axes = variance_53_axes_0, keep_dims = variance_53_keep_dims_0, x = inputs_sq_53_cast_fp16)[name = string("variance_53_cast_fp16")];
+            fp16 var_2575_to_fp16 = const()[name = string("op_2575_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [8, 1, 1, 1]> var_2576_cast_fp16 = add(x = variance_53_cast_fp16, y = var_2575_to_fp16)[name = string("op_2576_cast_fp16")];
+            fp32 var_2577_epsilon_0 = const()[name = string("op_2577_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [8, 1, 1, 1]> var_2577_cast_fp16 = rsqrt(epsilon = var_2577_epsilon_0, x = var_2576_cast_fp16)[name = string("op_2577_cast_fp16")];
+            tensor<fp16, [8, 128, 1, 1]> hidden_states_65_cast_fp16 = mul(x = inputs_51_cast_fp16, y = var_2577_cast_fp16)[name = string("hidden_states_65_cast_fp16")];
+            tensor<fp16, [1, 128, 1, 1]> w_53_to_fp16 = const()[name = string("w_53_to_fp16"), val = tensor<fp16, [1, 128, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(98700096)))];
+            tensor<fp16, [8, 128, 1, 1]> current_key_normed_13_cast_fp16 = mul(x = w_53_to_fp16, y = hidden_states_65_cast_fp16)[name = string("current_key_normed_13_cast_fp16")];
+            tensor<int32, [4]> var_2595 = const()[name = string("op_2595"), val = tensor<int32, [4]>([1, 16, 128, -1])];
+            tensor<fp16, [1, 16, 128, 1]> mh_q_37_cast_fp16 = reshape(shape = var_2595, x = query_normed_13_cast_fp16)[name = string("mh_q_37_cast_fp16")];
+            tensor<int32, [4]> var_2597 = const()[name = string("op_2597"), val = tensor<int32, [4]>([1, 8, 128, -1])];
+            tensor<fp16, [1, 8, 128, 1]> mh_k_25_cast_fp16 = reshape(shape = var_2597, x = current_key_normed_13_cast_fp16)[name = string("mh_k_25_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> var_2601_cast_fp16 = mul(x = mh_q_37_cast_fp16, y = cos_1_cast_fp16)[name = string("op_2601_cast_fp16")];
+            tensor<int32, [4]> var_2606_begin_0 = const()[name = string("op_2606_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_2606_end_0 = const()[name = string("op_2606_end_0"), val = tensor<int32, [4]>([1, 16, 64, 1])];
+            tensor<bool, [4]> var_2606_end_mask_0 = const()[name = string("op_2606_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 16, 64, 1]> var_2606_cast_fp16 = slice_by_index(begin = var_2606_begin_0, end = var_2606_end_0, end_mask = var_2606_end_mask_0, x = mh_q_37_cast_fp16)[name = string("op_2606_cast_fp16")];
+            tensor<int32, [4]> var_2612_begin_0 = const()[name = string("op_2612_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_2612_end_0 = const()[name = string("op_2612_end_0"), val = tensor<int32, [4]>([1, 16, 128, 1])];
+            tensor<bool, [4]> var_2612_end_mask_0 = const()[name = string("op_2612_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 16, 64, 1]> var_2612_cast_fp16 = slice_by_index(begin = var_2612_begin_0, end = var_2612_end_0, end_mask = var_2612_end_mask_0, x = mh_q_37_cast_fp16)[name = string("op_2612_cast_fp16")];
+            fp16 const_155_promoted_to_fp16 = const()[name = string("const_155_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 16, 64, 1]> var_2614_cast_fp16 = mul(x = var_2612_cast_fp16, y = const_155_promoted_to_fp16)[name = string("op_2614_cast_fp16")];
+            bool var_2616_interleave_0 = const()[name = string("op_2616_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 1]> var_2616_cast_fp16 = concat(axis = var_2494, interleave = var_2616_interleave_0, values = (var_2614_cast_fp16, var_2606_cast_fp16))[name = string("op_2616_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> var_2617_cast_fp16 = mul(x = var_2616_cast_fp16, y = sin_1_cast_fp16)[name = string("op_2617_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> mh_q_39_cast_fp16 = add(x = var_2601_cast_fp16, y = var_2617_cast_fp16)[name = string("mh_q_39_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> var_2619_cast_fp16 = mul(x = mh_k_25_cast_fp16, y = cos_1_cast_fp16)[name = string("op_2619_cast_fp16")];
+            tensor<int32, [4]> var_2624_begin_0 = const()[name = string("op_2624_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_2624_end_0 = const()[name = string("op_2624_end_0"), val = tensor<int32, [4]>([1, 8, 64, 1])];
+            tensor<bool, [4]> var_2624_end_mask_0 = const()[name = string("op_2624_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 8, 64, 1]> var_2624_cast_fp16 = slice_by_index(begin = var_2624_begin_0, end = var_2624_end_0, end_mask = var_2624_end_mask_0, x = mh_k_25_cast_fp16)[name = string("op_2624_cast_fp16")];
+            tensor<int32, [4]> var_2630_begin_0 = const()[name = string("op_2630_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_2630_end_0 = const()[name = string("op_2630_end_0"), val = tensor<int32, [4]>([1, 8, 128, 1])];
+            tensor<bool, [4]> var_2630_end_mask_0 = const()[name = string("op_2630_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 8, 64, 1]> var_2630_cast_fp16 = slice_by_index(begin = var_2630_begin_0, end = var_2630_end_0, end_mask = var_2630_end_mask_0, x = mh_k_25_cast_fp16)[name = string("op_2630_cast_fp16")];
+            fp16 const_158_promoted_to_fp16 = const()[name = string("const_158_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 64, 1]> var_2632_cast_fp16 = mul(x = var_2630_cast_fp16, y = const_158_promoted_to_fp16)[name = string("op_2632_cast_fp16")];
+            bool var_2634_interleave_0 = const()[name = string("op_2634_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 128, 1]> var_2634_cast_fp16 = concat(axis = var_2494, interleave = var_2634_interleave_0, values = (var_2632_cast_fp16, var_2624_cast_fp16))[name = string("op_2634_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> var_2635_cast_fp16 = mul(x = var_2634_cast_fp16, y = sin_1_cast_fp16)[name = string("op_2635_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> mh_k_27_cast_fp16 = add(x = var_2619_cast_fp16, y = var_2635_cast_fp16)[name = string("mh_k_27_cast_fp16")];
+            tensor<int32, [4]> var_2639 = const()[name = string("op_2639"), val = tensor<int32, [4]>([1, 1024, 1, 1])];
+            tensor<fp16, [1, 1024, 1, 1]> current_key_27_cast_fp16 = reshape(shape = var_2639, x = mh_k_27_cast_fp16)[name = string("current_key_27_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_2646_cast_fp16 = mul(x = var_101_cast_fp16_6, y = var_323_cast_fp16)[name = string("op_2646_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_2647_cast_fp16 = mul(x = current_key_27_cast_fp16, y = var_321_cast_fp16)[name = string("op_2647_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> key_39_cast_fp16 = add(x = var_2646_cast_fp16, y = var_2647_cast_fp16)[name = string("key_39_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_2650_cast_fp16 = mul(x = var_132_cast_fp16_6, y = var_323_cast_fp16)[name = string("op_2650_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_2651_cast_fp16 = mul(x = current_value_13_cast_fp16, y = var_321_cast_fp16)[name = string("op_2651_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> value_25_cast_fp16 = add(x = var_2650_cast_fp16, y = var_2651_cast_fp16)[name = string("value_25_cast_fp16")];
+            tensor<int32, [4]> var_2655 = const()[name = string("op_2655"), val = tensor<int32, [4]>([1, 8, 128, 256])];
+            tensor<fp16, [1, 8, 128, 256]> key_heads_25_cast_fp16 = reshape(shape = var_2655, x = key_39_cast_fp16)[name = string("key_heads_25_cast_fp16")];
+            tensor<int32, [4]> var_2657 = const()[name = string("op_2657"), val = tensor<int32, [4]>([1, 8, 128, 256])];
+            tensor<fp16, [1, 8, 128, 256]> value_heads_25_cast_fp16 = reshape(shape = var_2657, x = value_25_cast_fp16)[name = string("value_heads_25_cast_fp16")];
+            tensor<int32, [4]> var_2660_begin_0 = const()[name = string("op_2660_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_2660_end_0 = const()[name = string("op_2660_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_2660_end_mask_0 = const()[name = string("op_2660_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_2660_cast_fp16 = slice_by_index(begin = var_2660_begin_0, end = var_2660_end_0, end_mask = var_2660_end_mask_0, x = key_heads_25_cast_fp16)[name = string("op_2660_cast_fp16")];
+            tensor<int32, [4]> var_2664_begin_0 = const()[name = string("op_2664_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_2664_end_0 = const()[name = string("op_2664_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_2664_end_mask_0 = const()[name = string("op_2664_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_2664_cast_fp16 = slice_by_index(begin = var_2664_begin_0, end = var_2664_end_0, end_mask = var_2664_end_mask_0, x = value_heads_25_cast_fp16)[name = string("op_2664_cast_fp16")];
+            tensor<int32, [4]> var_2676_begin_0 = const()[name = string("op_2676_begin_0"), val = tensor<int32, [4]>([0, 1, 0, 0])];
+            tensor<int32, [4]> var_2676_end_0 = const()[name = string("op_2676_end_0"), val = tensor<int32, [4]>([1, 2, 128, 256])];
+            tensor<bool, [4]> var_2676_end_mask_0 = const()[name = string("op_2676_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_2676_cast_fp16 = slice_by_index(begin = var_2676_begin_0, end = var_2676_end_0, end_mask = var_2676_end_mask_0, x = key_heads_25_cast_fp16)[name = string("op_2676_cast_fp16")];
+            tensor<int32, [4]> var_2680_begin_0 = const()[name = string("op_2680_begin_0"), val = tensor<int32, [4]>([0, 1, 0, 0])];
+            tensor<int32, [4]> var_2680_end_0 = const()[name = string("op_2680_end_0"), val = tensor<int32, [4]>([1, 2, 128, 256])];
+            tensor<bool, [4]> var_2680_end_mask_0 = const()[name = string("op_2680_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_2680_cast_fp16 = slice_by_index(begin = var_2680_begin_0, end = var_2680_end_0, end_mask = var_2680_end_mask_0, x = value_heads_25_cast_fp16)[name = string("op_2680_cast_fp16")];
+            tensor<int32, [4]> var_2692_begin_0 = const()[name = string("op_2692_begin_0"), val = tensor<int32, [4]>([0, 2, 0, 0])];
+            tensor<int32, [4]> var_2692_end_0 = const()[name = string("op_2692_end_0"), val = tensor<int32, [4]>([1, 3, 128, 256])];
+            tensor<bool, [4]> var_2692_end_mask_0 = const()[name = string("op_2692_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_2692_cast_fp16 = slice_by_index(begin = var_2692_begin_0, end = var_2692_end_0, end_mask = var_2692_end_mask_0, x = key_heads_25_cast_fp16)[name = string("op_2692_cast_fp16")];
+            tensor<int32, [4]> var_2696_begin_0 = const()[name = string("op_2696_begin_0"), val = tensor<int32, [4]>([0, 2, 0, 0])];
+            tensor<int32, [4]> var_2696_end_0 = const()[name = string("op_2696_end_0"), val = tensor<int32, [4]>([1, 3, 128, 256])];
+            tensor<bool, [4]> var_2696_end_mask_0 = const()[name = string("op_2696_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_2696_cast_fp16 = slice_by_index(begin = var_2696_begin_0, end = var_2696_end_0, end_mask = var_2696_end_mask_0, x = value_heads_25_cast_fp16)[name = string("op_2696_cast_fp16")];
+            tensor<int32, [4]> var_2708_begin_0 = const()[name = string("op_2708_begin_0"), val = tensor<int32, [4]>([0, 3, 0, 0])];
+            tensor<int32, [4]> var_2708_end_0 = const()[name = string("op_2708_end_0"), val = tensor<int32, [4]>([1, 4, 128, 256])];
+            tensor<bool, [4]> var_2708_end_mask_0 = const()[name = string("op_2708_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_2708_cast_fp16 = slice_by_index(begin = var_2708_begin_0, end = var_2708_end_0, end_mask = var_2708_end_mask_0, x = key_heads_25_cast_fp16)[name = string("op_2708_cast_fp16")];
+            tensor<int32, [4]> var_2712_begin_0 = const()[name = string("op_2712_begin_0"), val = tensor<int32, [4]>([0, 3, 0, 0])];
+            tensor<int32, [4]> var_2712_end_0 = const()[name = string("op_2712_end_0"), val = tensor<int32, [4]>([1, 4, 128, 256])];
+            tensor<bool, [4]> var_2712_end_mask_0 = const()[name = string("op_2712_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_2712_cast_fp16 = slice_by_index(begin = var_2712_begin_0, end = var_2712_end_0, end_mask = var_2712_end_mask_0, x = value_heads_25_cast_fp16)[name = string("op_2712_cast_fp16")];
+            tensor<int32, [4]> var_2724_begin_0 = const()[name = string("op_2724_begin_0"), val = tensor<int32, [4]>([0, 4, 0, 0])];
+            tensor<int32, [4]> var_2724_end_0 = const()[name = string("op_2724_end_0"), val = tensor<int32, [4]>([1, 5, 128, 256])];
+            tensor<bool, [4]> var_2724_end_mask_0 = const()[name = string("op_2724_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_2724_cast_fp16 = slice_by_index(begin = var_2724_begin_0, end = var_2724_end_0, end_mask = var_2724_end_mask_0, x = key_heads_25_cast_fp16)[name = string("op_2724_cast_fp16")];
+            tensor<int32, [4]> var_2728_begin_0 = const()[name = string("op_2728_begin_0"), val = tensor<int32, [4]>([0, 4, 0, 0])];
+            tensor<int32, [4]> var_2728_end_0 = const()[name = string("op_2728_end_0"), val = tensor<int32, [4]>([1, 5, 128, 256])];
+            tensor<bool, [4]> var_2728_end_mask_0 = const()[name = string("op_2728_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_2728_cast_fp16 = slice_by_index(begin = var_2728_begin_0, end = var_2728_end_0, end_mask = var_2728_end_mask_0, x = value_heads_25_cast_fp16)[name = string("op_2728_cast_fp16")];
+            tensor<int32, [4]> var_2740_begin_0 = const()[name = string("op_2740_begin_0"), val = tensor<int32, [4]>([0, 5, 0, 0])];
+            tensor<int32, [4]> var_2740_end_0 = const()[name = string("op_2740_end_0"), val = tensor<int32, [4]>([1, 6, 128, 256])];
+            tensor<bool, [4]> var_2740_end_mask_0 = const()[name = string("op_2740_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_2740_cast_fp16 = slice_by_index(begin = var_2740_begin_0, end = var_2740_end_0, end_mask = var_2740_end_mask_0, x = key_heads_25_cast_fp16)[name = string("op_2740_cast_fp16")];
+            tensor<int32, [4]> var_2744_begin_0 = const()[name = string("op_2744_begin_0"), val = tensor<int32, [4]>([0, 5, 0, 0])];
+            tensor<int32, [4]> var_2744_end_0 = const()[name = string("op_2744_end_0"), val = tensor<int32, [4]>([1, 6, 128, 256])];
+            tensor<bool, [4]> var_2744_end_mask_0 = const()[name = string("op_2744_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_2744_cast_fp16 = slice_by_index(begin = var_2744_begin_0, end = var_2744_end_0, end_mask = var_2744_end_mask_0, x = value_heads_25_cast_fp16)[name = string("op_2744_cast_fp16")];
+            tensor<int32, [4]> var_2756_begin_0 = const()[name = string("op_2756_begin_0"), val = tensor<int32, [4]>([0, 6, 0, 0])];
+            tensor<int32, [4]> var_2756_end_0 = const()[name = string("op_2756_end_0"), val = tensor<int32, [4]>([1, 7, 128, 256])];
+            tensor<bool, [4]> var_2756_end_mask_0 = const()[name = string("op_2756_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_2756_cast_fp16 = slice_by_index(begin = var_2756_begin_0, end = var_2756_end_0, end_mask = var_2756_end_mask_0, x = key_heads_25_cast_fp16)[name = string("op_2756_cast_fp16")];
+            tensor<int32, [4]> var_2760_begin_0 = const()[name = string("op_2760_begin_0"), val = tensor<int32, [4]>([0, 6, 0, 0])];
+            tensor<int32, [4]> var_2760_end_0 = const()[name = string("op_2760_end_0"), val = tensor<int32, [4]>([1, 7, 128, 256])];
+            tensor<bool, [4]> var_2760_end_mask_0 = const()[name = string("op_2760_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_2760_cast_fp16 = slice_by_index(begin = var_2760_begin_0, end = var_2760_end_0, end_mask = var_2760_end_mask_0, x = value_heads_25_cast_fp16)[name = string("op_2760_cast_fp16")];
+            tensor<int32, [4]> var_2772_begin_0 = const()[name = string("op_2772_begin_0"), val = tensor<int32, [4]>([0, 7, 0, 0])];
+            tensor<int32, [4]> var_2772_end_0 = const()[name = string("op_2772_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_2772_end_mask_0 = const()[name = string("op_2772_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_2772_cast_fp16 = slice_by_index(begin = var_2772_begin_0, end = var_2772_end_0, end_mask = var_2772_end_mask_0, x = key_heads_25_cast_fp16)[name = string("op_2772_cast_fp16")];
+            tensor<int32, [4]> var_2776_begin_0 = const()[name = string("op_2776_begin_0"), val = tensor<int32, [4]>([0, 7, 0, 0])];
+            tensor<int32, [4]> var_2776_end_0 = const()[name = string("op_2776_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_2776_end_mask_0 = const()[name = string("op_2776_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_2776_cast_fp16 = slice_by_index(begin = var_2776_begin_0, end = var_2776_end_0, end_mask = var_2776_end_mask_0, x = value_heads_25_cast_fp16)[name = string("op_2776_cast_fp16")];
+            bool key_heads_27_interleave_0 = const()[name = string("key_heads_27_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 256]> key_heads_27_cast_fp16 = concat(axis = var_2502, interleave = key_heads_27_interleave_0, values = (var_2660_cast_fp16, var_2660_cast_fp16, var_2676_cast_fp16, var_2676_cast_fp16, var_2692_cast_fp16, var_2692_cast_fp16, var_2708_cast_fp16, var_2708_cast_fp16, var_2724_cast_fp16, var_2724_cast_fp16, var_2740_cast_fp16, var_2740_cast_fp16, var_2756_cast_fp16, var_2756_cast_fp16, var_2772_cast_fp16, var_2772_cast_fp16))[name = string("key_heads_27_cast_fp16")];
+            bool value_heads_27_interleave_0 = const()[name = string("value_heads_27_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 256]> value_heads_27_cast_fp16 = concat(axis = var_2502, interleave = value_heads_27_interleave_0, values = (var_2664_cast_fp16, var_2664_cast_fp16, var_2680_cast_fp16, var_2680_cast_fp16, var_2696_cast_fp16, var_2696_cast_fp16, var_2712_cast_fp16, var_2712_cast_fp16, var_2728_cast_fp16, var_2728_cast_fp16, var_2744_cast_fp16, var_2744_cast_fp16, var_2760_cast_fp16, var_2760_cast_fp16, var_2776_cast_fp16, var_2776_cast_fp16))[name = string("value_heads_27_cast_fp16")];
+            fp16 var_2799_to_fp16 = const()[name = string("op_2799_to_fp16"), val = fp16(0x1.6ap-4)];
+            tensor<fp16, [1, 16, 128, 1]> var_2800_cast_fp16 = mul(x = mh_q_39_cast_fp16, y = var_2799_to_fp16)[name = string("op_2800_cast_fp16")];
+            bool mh_w_25_transpose_x_0 = const()[name = string("mh_w_25_transpose_x_0"), val = bool(true)];
+            bool mh_w_25_transpose_y_0 = const()[name = string("mh_w_25_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 1, 256]> mh_w_25_cast_fp16 = matmul(transpose_x = mh_w_25_transpose_x_0, transpose_y = mh_w_25_transpose_y_0, x = var_2800_cast_fp16, y = key_heads_27_cast_fp16)[name = string("mh_w_25_cast_fp16")];
+            tensor<fp16, [1, 16, 1, 256]> mh_w_27_cast_fp16 = add(x = mh_w_25_cast_fp16, y = var_487_cast_fp16)[name = string("mh_w_27_cast_fp16")];
+            tensor<fp16, [1, 16, 1, 256]> var_2812_cast_fp16 = softmax(axis = var_2484, x = mh_w_27_cast_fp16)[name = string("op_2812_cast_fp16")];
+            bool attn_13_transpose_x_0 = const()[name = string("attn_13_transpose_x_0"), val = bool(false)];
+            bool attn_13_transpose_y_0 = const()[name = string("attn_13_transpose_y_0"), val = bool(true)];
+            tensor<fp16, [1, 16, 128, 1]> attn_13_cast_fp16 = matmul(transpose_x = attn_13_transpose_x_0, transpose_y = attn_13_transpose_y_0, x = value_heads_27_cast_fp16, y = var_2812_cast_fp16)[name = string("attn_13_cast_fp16")];
+            tensor<int32, [4]> var_2817 = const()[name = string("op_2817"), val = tensor<int32, [4]>([1, -1, 1, 1])];
+            tensor<fp16, [1, 2048, 1, 1]> input_49_cast_fp16 = reshape(shape = var_2817, x = attn_13_cast_fp16)[name = string("input_49_cast_fp16")];
+            string obj_59_pad_type_0 = const()[name = string("obj_59_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> obj_59_strides_0 = const()[name = string("obj_59_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> obj_59_pad_0 = const()[name = string("obj_59_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> obj_59_dilations_0 = const()[name = string("obj_59_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 obj_59_groups_0 = const()[name = string("obj_59_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 2048, 1, 1]> layers_6_self_attn_o_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(98700416))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(100797632))))[name = string("layers_6_self_attn_o_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> obj_59_cast_fp16 = conv(bias = layers_0_self_attn_v_proj_bias_to_fp16, dilations = obj_59_dilations_0, groups = obj_59_groups_0, pad = obj_59_pad_0, pad_type = obj_59_pad_type_0, strides = obj_59_strides_0, weight = layers_6_self_attn_o_proj_weight_to_fp16_palettized, x = input_49_cast_fp16)[name = string("obj_59_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> inputs_53_cast_fp16 = add(x = inputs_47_cast_fp16, y = obj_59_cast_fp16)[name = string("inputs_53_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> inputs_sq_55_cast_fp16 = mul(x = inputs_53_cast_fp16, y = inputs_53_cast_fp16)[name = string("inputs_sq_55_cast_fp16")];
+            tensor<int32, [1]> variance_55_axes_0 = const()[name = string("variance_55_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_55_keep_dims_0 = const()[name = string("variance_55_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 1, 1, 1]> variance_55_cast_fp16 = reduce_mean(axes = variance_55_axes_0, keep_dims = variance_55_keep_dims_0, x = inputs_sq_55_cast_fp16)[name = string("variance_55_cast_fp16")];
+            fp16 var_2835_to_fp16 = const()[name = string("op_2835_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 1, 1]> var_2836_cast_fp16 = add(x = variance_55_cast_fp16, y = var_2835_to_fp16)[name = string("op_2836_cast_fp16")];
+            fp32 var_2837_epsilon_0 = const()[name = string("op_2837_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 1, 1, 1]> var_2837_cast_fp16 = rsqrt(epsilon = var_2837_epsilon_0, x = var_2836_cast_fp16)[name = string("op_2837_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> hidden_states_67_cast_fp16 = mul(x = inputs_53_cast_fp16, y = var_2837_cast_fp16)[name = string("hidden_states_67_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> w_55_to_fp16 = const()[name = string("w_55_to_fp16"), val = tensor<fp16, [1, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(100798208)))];
+            tensor<fp16, [1, 1024, 1, 1]> input_51_cast_fp16 = mul(x = w_55_to_fp16, y = hidden_states_67_cast_fp16)[name = string("input_51_cast_fp16")];
+            string input_53_pad_type_0 = const()[name = string("input_53_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> input_53_strides_0 = const()[name = string("input_53_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> input_53_pad_0 = const()[name = string("input_53_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> input_53_dilations_0 = const()[name = string("input_53_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 input_53_groups_0 = const()[name = string("input_53_groups_0"), val = int32(1)];
+            tensor<fp16, [3072, 1024, 1, 1]> layers_6_mlp_gate_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [3072, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(100800320))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(103946112))))[name = string("layers_6_mlp_gate_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 3072, 1, 1]> input_53_cast_fp16 = conv(dilations = input_53_dilations_0, groups = input_53_groups_0, pad = input_53_pad_0, pad_type = input_53_pad_type_0, strides = input_53_strides_0, weight = layers_6_mlp_gate_proj_weight_to_fp16_palettized, x = input_51_cast_fp16)[name = string("input_53_cast_fp16")];
+            tensor<fp16, [1, 3072, 1, 1]> var_2851_cast_fp16 = silu(x = input_53_cast_fp16)[name = string("op_2851_cast_fp16")];
+            string var_2857_pad_type_0 = const()[name = string("op_2857_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_2857_strides_0 = const()[name = string("op_2857_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_2857_pad_0 = const()[name = string("op_2857_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_2857_dilations_0 = const()[name = string("op_2857_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_2857_groups_0 = const()[name = string("op_2857_groups_0"), val = int32(1)];
+            tensor<fp16, [3072, 1024, 1, 1]> layers_6_mlp_up_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [3072, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(103946688))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(107092480))))[name = string("layers_6_mlp_up_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 3072, 1, 1]> var_2857_cast_fp16 = conv(dilations = var_2857_dilations_0, groups = var_2857_groups_0, pad = var_2857_pad_0, pad_type = var_2857_pad_type_0, strides = var_2857_strides_0, weight = layers_6_mlp_up_proj_weight_to_fp16_palettized, x = input_51_cast_fp16)[name = string("op_2857_cast_fp16")];
+            tensor<fp16, [1, 3072, 1, 1]> input_55_cast_fp16 = mul(x = var_2851_cast_fp16, y = var_2857_cast_fp16)[name = string("input_55_cast_fp16")];
+            string hidden_states_69_pad_type_0 = const()[name = string("hidden_states_69_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> hidden_states_69_strides_0 = const()[name = string("hidden_states_69_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> hidden_states_69_pad_0 = const()[name = string("hidden_states_69_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> hidden_states_69_dilations_0 = const()[name = string("hidden_states_69_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 hidden_states_69_groups_0 = const()[name = string("hidden_states_69_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 3072, 1, 1]> layers_6_mlp_down_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 3072, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(107093056))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(110238848))))[name = string("layers_6_mlp_down_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> hidden_states_69_cast_fp16 = conv(dilations = hidden_states_69_dilations_0, groups = hidden_states_69_groups_0, pad = hidden_states_69_pad_0, pad_type = hidden_states_69_pad_type_0, strides = hidden_states_69_strides_0, weight = layers_6_mlp_down_proj_weight_to_fp16_palettized, x = input_55_cast_fp16)[name = string("hidden_states_69_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> inputs_55_cast_fp16 = add(x = inputs_53_cast_fp16, y = hidden_states_69_cast_fp16)[name = string("inputs_55_cast_fp16")];
+            int32 var_2871 = const()[name = string("op_2871"), val = int32(3)];
+            int32 var_2881 = const()[name = string("op_2881"), val = int32(-2)];
+            int32 var_2889 = const()[name = string("op_2889"), val = int32(1)];
+            tensor<fp16, [1, 1024, 1, 1]> inputs_sq_57_cast_fp16 = mul(x = inputs_55_cast_fp16, y = inputs_55_cast_fp16)[name = string("inputs_sq_57_cast_fp16")];
+            tensor<int32, [1]> variance_57_axes_0 = const()[name = string("variance_57_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_57_keep_dims_0 = const()[name = string("variance_57_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 1, 1, 1]> variance_57_cast_fp16 = reduce_mean(axes = variance_57_axes_0, keep_dims = variance_57_keep_dims_0, x = inputs_sq_57_cast_fp16)[name = string("variance_57_cast_fp16")];
+            fp16 var_2901_to_fp16 = const()[name = string("op_2901_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 1, 1]> var_2902_cast_fp16 = add(x = variance_57_cast_fp16, y = var_2901_to_fp16)[name = string("op_2902_cast_fp16")];
+            fp32 var_2903_epsilon_0 = const()[name = string("op_2903_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 1, 1, 1]> var_2903_cast_fp16 = rsqrt(epsilon = var_2903_epsilon_0, x = var_2902_cast_fp16)[name = string("op_2903_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> hidden_states_71_cast_fp16 = mul(x = inputs_55_cast_fp16, y = var_2903_cast_fp16)[name = string("hidden_states_71_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> w_57_to_fp16 = const()[name = string("w_57_to_fp16"), val = tensor<fp16, [1, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(110239424)))];
+            tensor<fp16, [1, 1024, 1, 1]> obj_61_cast_fp16 = mul(x = w_57_to_fp16, y = hidden_states_71_cast_fp16)[name = string("obj_61_cast_fp16")];
+            string query_43_pad_type_0 = const()[name = string("query_43_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> query_43_strides_0 = const()[name = string("query_43_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> query_43_pad_0 = const()[name = string("query_43_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> query_43_dilations_0 = const()[name = string("query_43_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 query_43_groups_0 = const()[name = string("query_43_groups_0"), val = int32(1)];
+            tensor<fp16, [2048, 1024, 1, 1]> layers_7_self_attn_q_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [2048, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(110241536))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(112338752))))[name = string("layers_7_self_attn_q_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1, 1]> query_43_cast_fp16 = conv(bias = layers_0_self_attn_q_proj_bias_to_fp16, dilations = query_43_dilations_0, groups = query_43_groups_0, pad = query_43_pad_0, pad_type = query_43_pad_type_0, strides = query_43_strides_0, weight = layers_7_self_attn_q_proj_weight_to_fp16_palettized, x = obj_61_cast_fp16)[name = string("query_43_cast_fp16")];
+            string current_key_29_pad_type_0 = const()[name = string("current_key_29_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> current_key_29_strides_0 = const()[name = string("current_key_29_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> current_key_29_pad_0 = const()[name = string("current_key_29_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> current_key_29_dilations_0 = const()[name = string("current_key_29_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 current_key_29_groups_0 = const()[name = string("current_key_29_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_7_self_attn_k_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(112339328))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(113387968))))[name = string("layers_7_self_attn_k_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> current_key_29_cast_fp16 = conv(dilations = current_key_29_dilations_0, groups = current_key_29_groups_0, pad = current_key_29_pad_0, pad_type = current_key_29_pad_type_0, strides = current_key_29_strides_0, weight = layers_7_self_attn_k_proj_weight_to_fp16_palettized, x = obj_61_cast_fp16)[name = string("current_key_29_cast_fp16")];
+            string current_value_15_pad_type_0 = const()[name = string("current_value_15_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> current_value_15_strides_0 = const()[name = string("current_value_15_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> current_value_15_pad_0 = const()[name = string("current_value_15_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> current_value_15_dilations_0 = const()[name = string("current_value_15_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 current_value_15_groups_0 = const()[name = string("current_value_15_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_7_self_attn_v_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(113388544))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(114437184))))[name = string("layers_7_self_attn_v_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> current_value_15_cast_fp16 = conv(bias = layers_0_self_attn_v_proj_bias_to_fp16, dilations = current_value_15_dilations_0, groups = current_value_15_groups_0, pad = current_value_15_pad_0, pad_type = current_value_15_pad_type_0, strides = current_value_15_strides_0, weight = layers_7_self_attn_v_proj_weight_to_fp16_palettized, x = obj_61_cast_fp16)[name = string("current_value_15_cast_fp16")];
+            tensor<int32, [4]> var_2940 = const()[name = string("op_2940"), val = tensor<int32, [4]>([16, 128, 1, 1])];
+            tensor<fp16, [16, 128, 1, 1]> inputs_57_cast_fp16 = reshape(shape = var_2940, x = query_43_cast_fp16)[name = string("inputs_57_cast_fp16")];
+            tensor<fp16, [16, 128, 1, 1]> inputs_sq_59_cast_fp16 = mul(x = inputs_57_cast_fp16, y = inputs_57_cast_fp16)[name = string("inputs_sq_59_cast_fp16")];
+            tensor<int32, [1]> variance_59_axes_0 = const()[name = string("variance_59_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_59_keep_dims_0 = const()[name = string("variance_59_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [16, 1, 1, 1]> variance_59_cast_fp16 = reduce_mean(axes = variance_59_axes_0, keep_dims = variance_59_keep_dims_0, x = inputs_sq_59_cast_fp16)[name = string("variance_59_cast_fp16")];
+            fp16 var_2946_to_fp16 = const()[name = string("op_2946_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [16, 1, 1, 1]> var_2947_cast_fp16 = add(x = variance_59_cast_fp16, y = var_2946_to_fp16)[name = string("op_2947_cast_fp16")];
+            fp32 var_2948_epsilon_0 = const()[name = string("op_2948_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [16, 1, 1, 1]> var_2948_cast_fp16 = rsqrt(epsilon = var_2948_epsilon_0, x = var_2947_cast_fp16)[name = string("op_2948_cast_fp16")];
+            tensor<fp16, [16, 128, 1, 1]> hidden_states_73_cast_fp16 = mul(x = inputs_57_cast_fp16, y = var_2948_cast_fp16)[name = string("hidden_states_73_cast_fp16")];
+            tensor<fp16, [1, 128, 1, 1]> w_59_to_fp16 = const()[name = string("w_59_to_fp16"), val = tensor<fp16, [1, 128, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(114437760)))];
+            tensor<fp16, [16, 128, 1, 1]> query_normed_15_cast_fp16 = mul(x = w_59_to_fp16, y = hidden_states_73_cast_fp16)[name = string("query_normed_15_cast_fp16")];
+            tensor<int32, [4]> var_2956 = const()[name = string("op_2956"), val = tensor<int32, [4]>([8, 128, 1, 1])];
+            tensor<fp16, [8, 128, 1, 1]> inputs_59_cast_fp16 = reshape(shape = var_2956, x = current_key_29_cast_fp16)[name = string("inputs_59_cast_fp16")];
+            tensor<fp16, [8, 128, 1, 1]> inputs_sq_61_cast_fp16 = mul(x = inputs_59_cast_fp16, y = inputs_59_cast_fp16)[name = string("inputs_sq_61_cast_fp16")];
+            tensor<int32, [1]> variance_61_axes_0 = const()[name = string("variance_61_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_61_keep_dims_0 = const()[name = string("variance_61_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [8, 1, 1, 1]> variance_61_cast_fp16 = reduce_mean(axes = variance_61_axes_0, keep_dims = variance_61_keep_dims_0, x = inputs_sq_61_cast_fp16)[name = string("variance_61_cast_fp16")];
+            fp16 var_2962_to_fp16 = const()[name = string("op_2962_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [8, 1, 1, 1]> var_2963_cast_fp16 = add(x = variance_61_cast_fp16, y = var_2962_to_fp16)[name = string("op_2963_cast_fp16")];
+            fp32 var_2964_epsilon_0 = const()[name = string("op_2964_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [8, 1, 1, 1]> var_2964_cast_fp16 = rsqrt(epsilon = var_2964_epsilon_0, x = var_2963_cast_fp16)[name = string("op_2964_cast_fp16")];
+            tensor<fp16, [8, 128, 1, 1]> hidden_states_75_cast_fp16 = mul(x = inputs_59_cast_fp16, y = var_2964_cast_fp16)[name = string("hidden_states_75_cast_fp16")];
+            tensor<fp16, [1, 128, 1, 1]> w_61_to_fp16 = const()[name = string("w_61_to_fp16"), val = tensor<fp16, [1, 128, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(114438080)))];
+            tensor<fp16, [8, 128, 1, 1]> current_key_normed_15_cast_fp16 = mul(x = w_61_to_fp16, y = hidden_states_75_cast_fp16)[name = string("current_key_normed_15_cast_fp16")];
+            tensor<int32, [4]> var_2982 = const()[name = string("op_2982"), val = tensor<int32, [4]>([1, 16, 128, -1])];
+            tensor<fp16, [1, 16, 128, 1]> mh_q_43_cast_fp16 = reshape(shape = var_2982, x = query_normed_15_cast_fp16)[name = string("mh_q_43_cast_fp16")];
+            tensor<int32, [4]> var_2984 = const()[name = string("op_2984"), val = tensor<int32, [4]>([1, 8, 128, -1])];
+            tensor<fp16, [1, 8, 128, 1]> mh_k_29_cast_fp16 = reshape(shape = var_2984, x = current_key_normed_15_cast_fp16)[name = string("mh_k_29_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> var_2988_cast_fp16 = mul(x = mh_q_43_cast_fp16, y = cos_1_cast_fp16)[name = string("op_2988_cast_fp16")];
+            tensor<int32, [4]> var_2993_begin_0 = const()[name = string("op_2993_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_2993_end_0 = const()[name = string("op_2993_end_0"), val = tensor<int32, [4]>([1, 16, 64, 1])];
+            tensor<bool, [4]> var_2993_end_mask_0 = const()[name = string("op_2993_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 16, 64, 1]> var_2993_cast_fp16 = slice_by_index(begin = var_2993_begin_0, end = var_2993_end_0, end_mask = var_2993_end_mask_0, x = mh_q_43_cast_fp16)[name = string("op_2993_cast_fp16")];
+            tensor<int32, [4]> var_2999_begin_0 = const()[name = string("op_2999_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_2999_end_0 = const()[name = string("op_2999_end_0"), val = tensor<int32, [4]>([1, 16, 128, 1])];
+            tensor<bool, [4]> var_2999_end_mask_0 = const()[name = string("op_2999_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 16, 64, 1]> var_2999_cast_fp16 = slice_by_index(begin = var_2999_begin_0, end = var_2999_end_0, end_mask = var_2999_end_mask_0, x = mh_q_43_cast_fp16)[name = string("op_2999_cast_fp16")];
+            fp16 const_178_promoted_to_fp16 = const()[name = string("const_178_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 16, 64, 1]> var_3001_cast_fp16 = mul(x = var_2999_cast_fp16, y = const_178_promoted_to_fp16)[name = string("op_3001_cast_fp16")];
+            bool var_3003_interleave_0 = const()[name = string("op_3003_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 1]> var_3003_cast_fp16 = concat(axis = var_2881, interleave = var_3003_interleave_0, values = (var_3001_cast_fp16, var_2993_cast_fp16))[name = string("op_3003_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> var_3004_cast_fp16 = mul(x = var_3003_cast_fp16, y = sin_1_cast_fp16)[name = string("op_3004_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> mh_q_45_cast_fp16 = add(x = var_2988_cast_fp16, y = var_3004_cast_fp16)[name = string("mh_q_45_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> var_3006_cast_fp16 = mul(x = mh_k_29_cast_fp16, y = cos_1_cast_fp16)[name = string("op_3006_cast_fp16")];
+            tensor<int32, [4]> var_3011_begin_0 = const()[name = string("op_3011_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_3011_end_0 = const()[name = string("op_3011_end_0"), val = tensor<int32, [4]>([1, 8, 64, 1])];
+            tensor<bool, [4]> var_3011_end_mask_0 = const()[name = string("op_3011_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 8, 64, 1]> var_3011_cast_fp16 = slice_by_index(begin = var_3011_begin_0, end = var_3011_end_0, end_mask = var_3011_end_mask_0, x = mh_k_29_cast_fp16)[name = string("op_3011_cast_fp16")];
+            tensor<int32, [4]> var_3017_begin_0 = const()[name = string("op_3017_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_3017_end_0 = const()[name = string("op_3017_end_0"), val = tensor<int32, [4]>([1, 8, 128, 1])];
+            tensor<bool, [4]> var_3017_end_mask_0 = const()[name = string("op_3017_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 8, 64, 1]> var_3017_cast_fp16 = slice_by_index(begin = var_3017_begin_0, end = var_3017_end_0, end_mask = var_3017_end_mask_0, x = mh_k_29_cast_fp16)[name = string("op_3017_cast_fp16")];
+            fp16 const_181_promoted_to_fp16 = const()[name = string("const_181_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 64, 1]> var_3019_cast_fp16 = mul(x = var_3017_cast_fp16, y = const_181_promoted_to_fp16)[name = string("op_3019_cast_fp16")];
+            bool var_3021_interleave_0 = const()[name = string("op_3021_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 128, 1]> var_3021_cast_fp16 = concat(axis = var_2881, interleave = var_3021_interleave_0, values = (var_3019_cast_fp16, var_3011_cast_fp16))[name = string("op_3021_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> var_3022_cast_fp16 = mul(x = var_3021_cast_fp16, y = sin_1_cast_fp16)[name = string("op_3022_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> mh_k_31_cast_fp16 = add(x = var_3006_cast_fp16, y = var_3022_cast_fp16)[name = string("mh_k_31_cast_fp16")];
+            tensor<int32, [4]> var_3026 = const()[name = string("op_3026"), val = tensor<int32, [4]>([1, 1024, 1, 1])];
+            tensor<fp16, [1, 1024, 1, 1]> current_key_31_cast_fp16 = reshape(shape = var_3026, x = mh_k_31_cast_fp16)[name = string("current_key_31_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_3033_cast_fp16 = mul(x = var_101_cast_fp16_7, y = var_323_cast_fp16)[name = string("op_3033_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_3034_cast_fp16 = mul(x = current_key_31_cast_fp16, y = var_321_cast_fp16)[name = string("op_3034_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> key_45_cast_fp16 = add(x = var_3033_cast_fp16, y = var_3034_cast_fp16)[name = string("key_45_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_3037_cast_fp16 = mul(x = var_132_cast_fp16_7, y = var_323_cast_fp16)[name = string("op_3037_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_3038_cast_fp16 = mul(x = current_value_15_cast_fp16, y = var_321_cast_fp16)[name = string("op_3038_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> value_29_cast_fp16 = add(x = var_3037_cast_fp16, y = var_3038_cast_fp16)[name = string("value_29_cast_fp16")];
+            tensor<int32, [4]> var_3042 = const()[name = string("op_3042"), val = tensor<int32, [4]>([1, 8, 128, 256])];
+            tensor<fp16, [1, 8, 128, 256]> key_heads_29_cast_fp16 = reshape(shape = var_3042, x = key_45_cast_fp16)[name = string("key_heads_29_cast_fp16")];
+            tensor<int32, [4]> var_3044 = const()[name = string("op_3044"), val = tensor<int32, [4]>([1, 8, 128, 256])];
+            tensor<fp16, [1, 8, 128, 256]> value_heads_29_cast_fp16 = reshape(shape = var_3044, x = value_29_cast_fp16)[name = string("value_heads_29_cast_fp16")];
+            tensor<int32, [4]> var_3047_begin_0 = const()[name = string("op_3047_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_3047_end_0 = const()[name = string("op_3047_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_3047_end_mask_0 = const()[name = string("op_3047_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_3047_cast_fp16 = slice_by_index(begin = var_3047_begin_0, end = var_3047_end_0, end_mask = var_3047_end_mask_0, x = key_heads_29_cast_fp16)[name = string("op_3047_cast_fp16")];
+            tensor<int32, [4]> var_3051_begin_0 = const()[name = string("op_3051_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_3051_end_0 = const()[name = string("op_3051_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_3051_end_mask_0 = const()[name = string("op_3051_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_3051_cast_fp16 = slice_by_index(begin = var_3051_begin_0, end = var_3051_end_0, end_mask = var_3051_end_mask_0, x = value_heads_29_cast_fp16)[name = string("op_3051_cast_fp16")];
+            tensor<int32, [4]> var_3063_begin_0 = const()[name = string("op_3063_begin_0"), val = tensor<int32, [4]>([0, 1, 0, 0])];
+            tensor<int32, [4]> var_3063_end_0 = const()[name = string("op_3063_end_0"), val = tensor<int32, [4]>([1, 2, 128, 256])];
+            tensor<bool, [4]> var_3063_end_mask_0 = const()[name = string("op_3063_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_3063_cast_fp16 = slice_by_index(begin = var_3063_begin_0, end = var_3063_end_0, end_mask = var_3063_end_mask_0, x = key_heads_29_cast_fp16)[name = string("op_3063_cast_fp16")];
+            tensor<int32, [4]> var_3067_begin_0 = const()[name = string("op_3067_begin_0"), val = tensor<int32, [4]>([0, 1, 0, 0])];
+            tensor<int32, [4]> var_3067_end_0 = const()[name = string("op_3067_end_0"), val = tensor<int32, [4]>([1, 2, 128, 256])];
+            tensor<bool, [4]> var_3067_end_mask_0 = const()[name = string("op_3067_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_3067_cast_fp16 = slice_by_index(begin = var_3067_begin_0, end = var_3067_end_0, end_mask = var_3067_end_mask_0, x = value_heads_29_cast_fp16)[name = string("op_3067_cast_fp16")];
+            tensor<int32, [4]> var_3079_begin_0 = const()[name = string("op_3079_begin_0"), val = tensor<int32, [4]>([0, 2, 0, 0])];
+            tensor<int32, [4]> var_3079_end_0 = const()[name = string("op_3079_end_0"), val = tensor<int32, [4]>([1, 3, 128, 256])];
+            tensor<bool, [4]> var_3079_end_mask_0 = const()[name = string("op_3079_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_3079_cast_fp16 = slice_by_index(begin = var_3079_begin_0, end = var_3079_end_0, end_mask = var_3079_end_mask_0, x = key_heads_29_cast_fp16)[name = string("op_3079_cast_fp16")];
+            tensor<int32, [4]> var_3083_begin_0 = const()[name = string("op_3083_begin_0"), val = tensor<int32, [4]>([0, 2, 0, 0])];
+            tensor<int32, [4]> var_3083_end_0 = const()[name = string("op_3083_end_0"), val = tensor<int32, [4]>([1, 3, 128, 256])];
+            tensor<bool, [4]> var_3083_end_mask_0 = const()[name = string("op_3083_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_3083_cast_fp16 = slice_by_index(begin = var_3083_begin_0, end = var_3083_end_0, end_mask = var_3083_end_mask_0, x = value_heads_29_cast_fp16)[name = string("op_3083_cast_fp16")];
+            tensor<int32, [4]> var_3095_begin_0 = const()[name = string("op_3095_begin_0"), val = tensor<int32, [4]>([0, 3, 0, 0])];
+            tensor<int32, [4]> var_3095_end_0 = const()[name = string("op_3095_end_0"), val = tensor<int32, [4]>([1, 4, 128, 256])];
+            tensor<bool, [4]> var_3095_end_mask_0 = const()[name = string("op_3095_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_3095_cast_fp16 = slice_by_index(begin = var_3095_begin_0, end = var_3095_end_0, end_mask = var_3095_end_mask_0, x = key_heads_29_cast_fp16)[name = string("op_3095_cast_fp16")];
+            tensor<int32, [4]> var_3099_begin_0 = const()[name = string("op_3099_begin_0"), val = tensor<int32, [4]>([0, 3, 0, 0])];
+            tensor<int32, [4]> var_3099_end_0 = const()[name = string("op_3099_end_0"), val = tensor<int32, [4]>([1, 4, 128, 256])];
+            tensor<bool, [4]> var_3099_end_mask_0 = const()[name = string("op_3099_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_3099_cast_fp16 = slice_by_index(begin = var_3099_begin_0, end = var_3099_end_0, end_mask = var_3099_end_mask_0, x = value_heads_29_cast_fp16)[name = string("op_3099_cast_fp16")];
+            tensor<int32, [4]> var_3111_begin_0 = const()[name = string("op_3111_begin_0"), val = tensor<int32, [4]>([0, 4, 0, 0])];
+            tensor<int32, [4]> var_3111_end_0 = const()[name = string("op_3111_end_0"), val = tensor<int32, [4]>([1, 5, 128, 256])];
+            tensor<bool, [4]> var_3111_end_mask_0 = const()[name = string("op_3111_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_3111_cast_fp16 = slice_by_index(begin = var_3111_begin_0, end = var_3111_end_0, end_mask = var_3111_end_mask_0, x = key_heads_29_cast_fp16)[name = string("op_3111_cast_fp16")];
+            tensor<int32, [4]> var_3115_begin_0 = const()[name = string("op_3115_begin_0"), val = tensor<int32, [4]>([0, 4, 0, 0])];
+            tensor<int32, [4]> var_3115_end_0 = const()[name = string("op_3115_end_0"), val = tensor<int32, [4]>([1, 5, 128, 256])];
+            tensor<bool, [4]> var_3115_end_mask_0 = const()[name = string("op_3115_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_3115_cast_fp16 = slice_by_index(begin = var_3115_begin_0, end = var_3115_end_0, end_mask = var_3115_end_mask_0, x = value_heads_29_cast_fp16)[name = string("op_3115_cast_fp16")];
+            tensor<int32, [4]> var_3127_begin_0 = const()[name = string("op_3127_begin_0"), val = tensor<int32, [4]>([0, 5, 0, 0])];
+            tensor<int32, [4]> var_3127_end_0 = const()[name = string("op_3127_end_0"), val = tensor<int32, [4]>([1, 6, 128, 256])];
+            tensor<bool, [4]> var_3127_end_mask_0 = const()[name = string("op_3127_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_3127_cast_fp16 = slice_by_index(begin = var_3127_begin_0, end = var_3127_end_0, end_mask = var_3127_end_mask_0, x = key_heads_29_cast_fp16)[name = string("op_3127_cast_fp16")];
+            tensor<int32, [4]> var_3131_begin_0 = const()[name = string("op_3131_begin_0"), val = tensor<int32, [4]>([0, 5, 0, 0])];
+            tensor<int32, [4]> var_3131_end_0 = const()[name = string("op_3131_end_0"), val = tensor<int32, [4]>([1, 6, 128, 256])];
+            tensor<bool, [4]> var_3131_end_mask_0 = const()[name = string("op_3131_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_3131_cast_fp16 = slice_by_index(begin = var_3131_begin_0, end = var_3131_end_0, end_mask = var_3131_end_mask_0, x = value_heads_29_cast_fp16)[name = string("op_3131_cast_fp16")];
+            tensor<int32, [4]> var_3143_begin_0 = const()[name = string("op_3143_begin_0"), val = tensor<int32, [4]>([0, 6, 0, 0])];
+            tensor<int32, [4]> var_3143_end_0 = const()[name = string("op_3143_end_0"), val = tensor<int32, [4]>([1, 7, 128, 256])];
+            tensor<bool, [4]> var_3143_end_mask_0 = const()[name = string("op_3143_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_3143_cast_fp16 = slice_by_index(begin = var_3143_begin_0, end = var_3143_end_0, end_mask = var_3143_end_mask_0, x = key_heads_29_cast_fp16)[name = string("op_3143_cast_fp16")];
+            tensor<int32, [4]> var_3147_begin_0 = const()[name = string("op_3147_begin_0"), val = tensor<int32, [4]>([0, 6, 0, 0])];
+            tensor<int32, [4]> var_3147_end_0 = const()[name = string("op_3147_end_0"), val = tensor<int32, [4]>([1, 7, 128, 256])];
+            tensor<bool, [4]> var_3147_end_mask_0 = const()[name = string("op_3147_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_3147_cast_fp16 = slice_by_index(begin = var_3147_begin_0, end = var_3147_end_0, end_mask = var_3147_end_mask_0, x = value_heads_29_cast_fp16)[name = string("op_3147_cast_fp16")];
+            tensor<int32, [4]> var_3159_begin_0 = const()[name = string("op_3159_begin_0"), val = tensor<int32, [4]>([0, 7, 0, 0])];
+            tensor<int32, [4]> var_3159_end_0 = const()[name = string("op_3159_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_3159_end_mask_0 = const()[name = string("op_3159_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_3159_cast_fp16 = slice_by_index(begin = var_3159_begin_0, end = var_3159_end_0, end_mask = var_3159_end_mask_0, x = key_heads_29_cast_fp16)[name = string("op_3159_cast_fp16")];
+            tensor<int32, [4]> var_3163_begin_0 = const()[name = string("op_3163_begin_0"), val = tensor<int32, [4]>([0, 7, 0, 0])];
+            tensor<int32, [4]> var_3163_end_0 = const()[name = string("op_3163_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_3163_end_mask_0 = const()[name = string("op_3163_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_3163_cast_fp16 = slice_by_index(begin = var_3163_begin_0, end = var_3163_end_0, end_mask = var_3163_end_mask_0, x = value_heads_29_cast_fp16)[name = string("op_3163_cast_fp16")];
+            bool key_heads_31_interleave_0 = const()[name = string("key_heads_31_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 256]> key_heads_31_cast_fp16 = concat(axis = var_2889, interleave = key_heads_31_interleave_0, values = (var_3047_cast_fp16, var_3047_cast_fp16, var_3063_cast_fp16, var_3063_cast_fp16, var_3079_cast_fp16, var_3079_cast_fp16, var_3095_cast_fp16, var_3095_cast_fp16, var_3111_cast_fp16, var_3111_cast_fp16, var_3127_cast_fp16, var_3127_cast_fp16, var_3143_cast_fp16, var_3143_cast_fp16, var_3159_cast_fp16, var_3159_cast_fp16))[name = string("key_heads_31_cast_fp16")];
+            bool value_heads_31_interleave_0 = const()[name = string("value_heads_31_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 256]> value_heads_31_cast_fp16 = concat(axis = var_2889, interleave = value_heads_31_interleave_0, values = (var_3051_cast_fp16, var_3051_cast_fp16, var_3067_cast_fp16, var_3067_cast_fp16, var_3083_cast_fp16, var_3083_cast_fp16, var_3099_cast_fp16, var_3099_cast_fp16, var_3115_cast_fp16, var_3115_cast_fp16, var_3131_cast_fp16, var_3131_cast_fp16, var_3147_cast_fp16, var_3147_cast_fp16, var_3163_cast_fp16, var_3163_cast_fp16))[name = string("value_heads_31_cast_fp16")];
+            fp16 var_3186_to_fp16 = const()[name = string("op_3186_to_fp16"), val = fp16(0x1.6ap-4)];
+            tensor<fp16, [1, 16, 128, 1]> var_3187_cast_fp16 = mul(x = mh_q_45_cast_fp16, y = var_3186_to_fp16)[name = string("op_3187_cast_fp16")];
+            bool mh_w_29_transpose_x_0 = const()[name = string("mh_w_29_transpose_x_0"), val = bool(true)];
+            bool mh_w_29_transpose_y_0 = const()[name = string("mh_w_29_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 1, 256]> mh_w_29_cast_fp16 = matmul(transpose_x = mh_w_29_transpose_x_0, transpose_y = mh_w_29_transpose_y_0, x = var_3187_cast_fp16, y = key_heads_31_cast_fp16)[name = string("mh_w_29_cast_fp16")];
+            tensor<fp16, [1, 16, 1, 256]> mh_w_31_cast_fp16 = add(x = mh_w_29_cast_fp16, y = var_487_cast_fp16)[name = string("mh_w_31_cast_fp16")];
+            tensor<fp16, [1, 16, 1, 256]> var_3199_cast_fp16 = softmax(axis = var_2871, x = mh_w_31_cast_fp16)[name = string("op_3199_cast_fp16")];
+            bool attn_15_transpose_x_0 = const()[name = string("attn_15_transpose_x_0"), val = bool(false)];
+            bool attn_15_transpose_y_0 = const()[name = string("attn_15_transpose_y_0"), val = bool(true)];
+            tensor<fp16, [1, 16, 128, 1]> attn_15_cast_fp16 = matmul(transpose_x = attn_15_transpose_x_0, transpose_y = attn_15_transpose_y_0, x = value_heads_31_cast_fp16, y = var_3199_cast_fp16)[name = string("attn_15_cast_fp16")];
+            tensor<int32, [4]> var_3204 = const()[name = string("op_3204"), val = tensor<int32, [4]>([1, -1, 1, 1])];
+            tensor<fp16, [1, 2048, 1, 1]> input_57_cast_fp16 = reshape(shape = var_3204, x = attn_15_cast_fp16)[name = string("input_57_cast_fp16")];
+            string obj_67_pad_type_0 = const()[name = string("obj_67_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> obj_67_strides_0 = const()[name = string("obj_67_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> obj_67_pad_0 = const()[name = string("obj_67_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> obj_67_dilations_0 = const()[name = string("obj_67_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 obj_67_groups_0 = const()[name = string("obj_67_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 2048, 1, 1]> layers_7_self_attn_o_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(114438400))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(116535616))))[name = string("layers_7_self_attn_o_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> obj_67_cast_fp16 = conv(bias = layers_0_self_attn_v_proj_bias_to_fp16, dilations = obj_67_dilations_0, groups = obj_67_groups_0, pad = obj_67_pad_0, pad_type = obj_67_pad_type_0, strides = obj_67_strides_0, weight = layers_7_self_attn_o_proj_weight_to_fp16_palettized, x = input_57_cast_fp16)[name = string("obj_67_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> inputs_61_cast_fp16 = add(x = inputs_55_cast_fp16, y = obj_67_cast_fp16)[name = string("inputs_61_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> inputs_sq_63_cast_fp16 = mul(x = inputs_61_cast_fp16, y = inputs_61_cast_fp16)[name = string("inputs_sq_63_cast_fp16")];
+            tensor<int32, [1]> variance_63_axes_0 = const()[name = string("variance_63_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_63_keep_dims_0 = const()[name = string("variance_63_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 1, 1, 1]> variance_63_cast_fp16 = reduce_mean(axes = variance_63_axes_0, keep_dims = variance_63_keep_dims_0, x = inputs_sq_63_cast_fp16)[name = string("variance_63_cast_fp16")];
+            fp16 var_3222_to_fp16 = const()[name = string("op_3222_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 1, 1]> var_3223_cast_fp16 = add(x = variance_63_cast_fp16, y = var_3222_to_fp16)[name = string("op_3223_cast_fp16")];
+            fp32 var_3224_epsilon_0 = const()[name = string("op_3224_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 1, 1, 1]> var_3224_cast_fp16 = rsqrt(epsilon = var_3224_epsilon_0, x = var_3223_cast_fp16)[name = string("op_3224_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> hidden_states_77_cast_fp16 = mul(x = inputs_61_cast_fp16, y = var_3224_cast_fp16)[name = string("hidden_states_77_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> w_63_to_fp16 = const()[name = string("w_63_to_fp16"), val = tensor<fp16, [1, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(116536192)))];
+            tensor<fp16, [1, 1024, 1, 1]> input_59_cast_fp16 = mul(x = w_63_to_fp16, y = hidden_states_77_cast_fp16)[name = string("input_59_cast_fp16")];
+            string input_61_pad_type_0 = const()[name = string("input_61_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> input_61_strides_0 = const()[name = string("input_61_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> input_61_pad_0 = const()[name = string("input_61_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> input_61_dilations_0 = const()[name = string("input_61_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 input_61_groups_0 = const()[name = string("input_61_groups_0"), val = int32(1)];
+            tensor<fp16, [3072, 1024, 1, 1]> layers_7_mlp_gate_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [3072, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(116538304))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(119684096))))[name = string("layers_7_mlp_gate_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 3072, 1, 1]> input_61_cast_fp16 = conv(dilations = input_61_dilations_0, groups = input_61_groups_0, pad = input_61_pad_0, pad_type = input_61_pad_type_0, strides = input_61_strides_0, weight = layers_7_mlp_gate_proj_weight_to_fp16_palettized, x = input_59_cast_fp16)[name = string("input_61_cast_fp16")];
+            tensor<fp16, [1, 3072, 1, 1]> var_3238_cast_fp16 = silu(x = input_61_cast_fp16)[name = string("op_3238_cast_fp16")];
+            string var_3244_pad_type_0 = const()[name = string("op_3244_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_3244_strides_0 = const()[name = string("op_3244_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_3244_pad_0 = const()[name = string("op_3244_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_3244_dilations_0 = const()[name = string("op_3244_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_3244_groups_0 = const()[name = string("op_3244_groups_0"), val = int32(1)];
+            tensor<fp16, [3072, 1024, 1, 1]> layers_7_mlp_up_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [3072, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(119684672))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(122830464))))[name = string("layers_7_mlp_up_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 3072, 1, 1]> var_3244_cast_fp16 = conv(dilations = var_3244_dilations_0, groups = var_3244_groups_0, pad = var_3244_pad_0, pad_type = var_3244_pad_type_0, strides = var_3244_strides_0, weight = layers_7_mlp_up_proj_weight_to_fp16_palettized, x = input_59_cast_fp16)[name = string("op_3244_cast_fp16")];
+            tensor<fp16, [1, 3072, 1, 1]> input_63_cast_fp16 = mul(x = var_3238_cast_fp16, y = var_3244_cast_fp16)[name = string("input_63_cast_fp16")];
+            string hidden_states_79_pad_type_0 = const()[name = string("hidden_states_79_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> hidden_states_79_strides_0 = const()[name = string("hidden_states_79_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> hidden_states_79_pad_0 = const()[name = string("hidden_states_79_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> hidden_states_79_dilations_0 = const()[name = string("hidden_states_79_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 hidden_states_79_groups_0 = const()[name = string("hidden_states_79_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 3072, 1, 1]> layers_7_mlp_down_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 3072, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(122831040))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(125976832))))[name = string("layers_7_mlp_down_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> hidden_states_79_cast_fp16 = conv(dilations = hidden_states_79_dilations_0, groups = hidden_states_79_groups_0, pad = hidden_states_79_pad_0, pad_type = hidden_states_79_pad_type_0, strides = hidden_states_79_strides_0, weight = layers_7_mlp_down_proj_weight_to_fp16_palettized, x = input_63_cast_fp16)[name = string("hidden_states_79_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> inputs_63_cast_fp16 = add(x = inputs_61_cast_fp16, y = hidden_states_79_cast_fp16)[name = string("inputs_63_cast_fp16")];
+            int32 var_3258 = const()[name = string("op_3258"), val = int32(3)];
+            int32 var_3268 = const()[name = string("op_3268"), val = int32(-2)];
+            int32 var_3276 = const()[name = string("op_3276"), val = int32(1)];
+            tensor<fp16, [1, 1024, 1, 1]> inputs_sq_65_cast_fp16 = mul(x = inputs_63_cast_fp16, y = inputs_63_cast_fp16)[name = string("inputs_sq_65_cast_fp16")];
+            tensor<int32, [1]> variance_65_axes_0 = const()[name = string("variance_65_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_65_keep_dims_0 = const()[name = string("variance_65_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 1, 1, 1]> variance_65_cast_fp16 = reduce_mean(axes = variance_65_axes_0, keep_dims = variance_65_keep_dims_0, x = inputs_sq_65_cast_fp16)[name = string("variance_65_cast_fp16")];
+            fp16 var_3288_to_fp16 = const()[name = string("op_3288_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 1, 1]> var_3289_cast_fp16 = add(x = variance_65_cast_fp16, y = var_3288_to_fp16)[name = string("op_3289_cast_fp16")];
+            fp32 var_3290_epsilon_0 = const()[name = string("op_3290_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 1, 1, 1]> var_3290_cast_fp16 = rsqrt(epsilon = var_3290_epsilon_0, x = var_3289_cast_fp16)[name = string("op_3290_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> hidden_states_81_cast_fp16 = mul(x = inputs_63_cast_fp16, y = var_3290_cast_fp16)[name = string("hidden_states_81_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> w_65_to_fp16 = const()[name = string("w_65_to_fp16"), val = tensor<fp16, [1, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(125977408)))];
+            tensor<fp16, [1, 1024, 1, 1]> obj_69_cast_fp16 = mul(x = w_65_to_fp16, y = hidden_states_81_cast_fp16)[name = string("obj_69_cast_fp16")];
+            string query_49_pad_type_0 = const()[name = string("query_49_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> query_49_strides_0 = const()[name = string("query_49_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> query_49_pad_0 = const()[name = string("query_49_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> query_49_dilations_0 = const()[name = string("query_49_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 query_49_groups_0 = const()[name = string("query_49_groups_0"), val = int32(1)];
+            tensor<fp16, [2048, 1024, 1, 1]> layers_8_self_attn_q_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [2048, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(125979520))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(128076736))))[name = string("layers_8_self_attn_q_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1, 1]> query_49_cast_fp16 = conv(bias = layers_0_self_attn_q_proj_bias_to_fp16, dilations = query_49_dilations_0, groups = query_49_groups_0, pad = query_49_pad_0, pad_type = query_49_pad_type_0, strides = query_49_strides_0, weight = layers_8_self_attn_q_proj_weight_to_fp16_palettized, x = obj_69_cast_fp16)[name = string("query_49_cast_fp16")];
+            string current_key_33_pad_type_0 = const()[name = string("current_key_33_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> current_key_33_strides_0 = const()[name = string("current_key_33_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> current_key_33_pad_0 = const()[name = string("current_key_33_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> current_key_33_dilations_0 = const()[name = string("current_key_33_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 current_key_33_groups_0 = const()[name = string("current_key_33_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_8_self_attn_k_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(128077312))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(129125952))))[name = string("layers_8_self_attn_k_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> current_key_33_cast_fp16 = conv(dilations = current_key_33_dilations_0, groups = current_key_33_groups_0, pad = current_key_33_pad_0, pad_type = current_key_33_pad_type_0, strides = current_key_33_strides_0, weight = layers_8_self_attn_k_proj_weight_to_fp16_palettized, x = obj_69_cast_fp16)[name = string("current_key_33_cast_fp16")];
+            string current_value_17_pad_type_0 = const()[name = string("current_value_17_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> current_value_17_strides_0 = const()[name = string("current_value_17_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> current_value_17_pad_0 = const()[name = string("current_value_17_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> current_value_17_dilations_0 = const()[name = string("current_value_17_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 current_value_17_groups_0 = const()[name = string("current_value_17_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_8_self_attn_v_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(129126528))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(130175168))))[name = string("layers_8_self_attn_v_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> current_value_17_cast_fp16 = conv(bias = layers_0_self_attn_v_proj_bias_to_fp16, dilations = current_value_17_dilations_0, groups = current_value_17_groups_0, pad = current_value_17_pad_0, pad_type = current_value_17_pad_type_0, strides = current_value_17_strides_0, weight = layers_8_self_attn_v_proj_weight_to_fp16_palettized, x = obj_69_cast_fp16)[name = string("current_value_17_cast_fp16")];
+            tensor<int32, [4]> var_3327 = const()[name = string("op_3327"), val = tensor<int32, [4]>([16, 128, 1, 1])];
+            tensor<fp16, [16, 128, 1, 1]> inputs_65_cast_fp16 = reshape(shape = var_3327, x = query_49_cast_fp16)[name = string("inputs_65_cast_fp16")];
+            tensor<fp16, [16, 128, 1, 1]> inputs_sq_67_cast_fp16 = mul(x = inputs_65_cast_fp16, y = inputs_65_cast_fp16)[name = string("inputs_sq_67_cast_fp16")];
+            tensor<int32, [1]> variance_67_axes_0 = const()[name = string("variance_67_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_67_keep_dims_0 = const()[name = string("variance_67_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [16, 1, 1, 1]> variance_67_cast_fp16 = reduce_mean(axes = variance_67_axes_0, keep_dims = variance_67_keep_dims_0, x = inputs_sq_67_cast_fp16)[name = string("variance_67_cast_fp16")];
+            fp16 var_3333_to_fp16 = const()[name = string("op_3333_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [16, 1, 1, 1]> var_3334_cast_fp16 = add(x = variance_67_cast_fp16, y = var_3333_to_fp16)[name = string("op_3334_cast_fp16")];
+            fp32 var_3335_epsilon_0 = const()[name = string("op_3335_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [16, 1, 1, 1]> var_3335_cast_fp16 = rsqrt(epsilon = var_3335_epsilon_0, x = var_3334_cast_fp16)[name = string("op_3335_cast_fp16")];
+            tensor<fp16, [16, 128, 1, 1]> hidden_states_83_cast_fp16 = mul(x = inputs_65_cast_fp16, y = var_3335_cast_fp16)[name = string("hidden_states_83_cast_fp16")];
+            tensor<fp16, [1, 128, 1, 1]> w_67_to_fp16 = const()[name = string("w_67_to_fp16"), val = tensor<fp16, [1, 128, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(130175744)))];
+            tensor<fp16, [16, 128, 1, 1]> query_normed_17_cast_fp16 = mul(x = w_67_to_fp16, y = hidden_states_83_cast_fp16)[name = string("query_normed_17_cast_fp16")];
+            tensor<int32, [4]> var_3343 = const()[name = string("op_3343"), val = tensor<int32, [4]>([8, 128, 1, 1])];
+            tensor<fp16, [8, 128, 1, 1]> inputs_67_cast_fp16 = reshape(shape = var_3343, x = current_key_33_cast_fp16)[name = string("inputs_67_cast_fp16")];
+            tensor<fp16, [8, 128, 1, 1]> inputs_sq_69_cast_fp16 = mul(x = inputs_67_cast_fp16, y = inputs_67_cast_fp16)[name = string("inputs_sq_69_cast_fp16")];
+            tensor<int32, [1]> variance_69_axes_0 = const()[name = string("variance_69_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_69_keep_dims_0 = const()[name = string("variance_69_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [8, 1, 1, 1]> variance_69_cast_fp16 = reduce_mean(axes = variance_69_axes_0, keep_dims = variance_69_keep_dims_0, x = inputs_sq_69_cast_fp16)[name = string("variance_69_cast_fp16")];
+            fp16 var_3349_to_fp16 = const()[name = string("op_3349_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [8, 1, 1, 1]> var_3350_cast_fp16 = add(x = variance_69_cast_fp16, y = var_3349_to_fp16)[name = string("op_3350_cast_fp16")];
+            fp32 var_3351_epsilon_0 = const()[name = string("op_3351_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [8, 1, 1, 1]> var_3351_cast_fp16 = rsqrt(epsilon = var_3351_epsilon_0, x = var_3350_cast_fp16)[name = string("op_3351_cast_fp16")];
+            tensor<fp16, [8, 128, 1, 1]> hidden_states_85_cast_fp16 = mul(x = inputs_67_cast_fp16, y = var_3351_cast_fp16)[name = string("hidden_states_85_cast_fp16")];
+            tensor<fp16, [1, 128, 1, 1]> w_69_to_fp16 = const()[name = string("w_69_to_fp16"), val = tensor<fp16, [1, 128, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(130176064)))];
+            tensor<fp16, [8, 128, 1, 1]> current_key_normed_17_cast_fp16 = mul(x = w_69_to_fp16, y = hidden_states_85_cast_fp16)[name = string("current_key_normed_17_cast_fp16")];
+            tensor<int32, [4]> var_3369 = const()[name = string("op_3369"), val = tensor<int32, [4]>([1, 16, 128, -1])];
+            tensor<fp16, [1, 16, 128, 1]> mh_q_49_cast_fp16 = reshape(shape = var_3369, x = query_normed_17_cast_fp16)[name = string("mh_q_49_cast_fp16")];
+            tensor<int32, [4]> var_3371 = const()[name = string("op_3371"), val = tensor<int32, [4]>([1, 8, 128, -1])];
+            tensor<fp16, [1, 8, 128, 1]> mh_k_33_cast_fp16 = reshape(shape = var_3371, x = current_key_normed_17_cast_fp16)[name = string("mh_k_33_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> var_3375_cast_fp16 = mul(x = mh_q_49_cast_fp16, y = cos_1_cast_fp16)[name = string("op_3375_cast_fp16")];
+            tensor<int32, [4]> var_3380_begin_0 = const()[name = string("op_3380_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_3380_end_0 = const()[name = string("op_3380_end_0"), val = tensor<int32, [4]>([1, 16, 64, 1])];
+            tensor<bool, [4]> var_3380_end_mask_0 = const()[name = string("op_3380_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 16, 64, 1]> var_3380_cast_fp16 = slice_by_index(begin = var_3380_begin_0, end = var_3380_end_0, end_mask = var_3380_end_mask_0, x = mh_q_49_cast_fp16)[name = string("op_3380_cast_fp16")];
+            tensor<int32, [4]> var_3386_begin_0 = const()[name = string("op_3386_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_3386_end_0 = const()[name = string("op_3386_end_0"), val = tensor<int32, [4]>([1, 16, 128, 1])];
+            tensor<bool, [4]> var_3386_end_mask_0 = const()[name = string("op_3386_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 16, 64, 1]> var_3386_cast_fp16 = slice_by_index(begin = var_3386_begin_0, end = var_3386_end_0, end_mask = var_3386_end_mask_0, x = mh_q_49_cast_fp16)[name = string("op_3386_cast_fp16")];
+            fp16 const_201_promoted_to_fp16 = const()[name = string("const_201_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 16, 64, 1]> var_3388_cast_fp16 = mul(x = var_3386_cast_fp16, y = const_201_promoted_to_fp16)[name = string("op_3388_cast_fp16")];
+            bool var_3390_interleave_0 = const()[name = string("op_3390_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 1]> var_3390_cast_fp16 = concat(axis = var_3268, interleave = var_3390_interleave_0, values = (var_3388_cast_fp16, var_3380_cast_fp16))[name = string("op_3390_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> var_3391_cast_fp16 = mul(x = var_3390_cast_fp16, y = sin_1_cast_fp16)[name = string("op_3391_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> mh_q_51_cast_fp16 = add(x = var_3375_cast_fp16, y = var_3391_cast_fp16)[name = string("mh_q_51_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> var_3393_cast_fp16 = mul(x = mh_k_33_cast_fp16, y = cos_1_cast_fp16)[name = string("op_3393_cast_fp16")];
+            tensor<int32, [4]> var_3398_begin_0 = const()[name = string("op_3398_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_3398_end_0 = const()[name = string("op_3398_end_0"), val = tensor<int32, [4]>([1, 8, 64, 1])];
+            tensor<bool, [4]> var_3398_end_mask_0 = const()[name = string("op_3398_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 8, 64, 1]> var_3398_cast_fp16 = slice_by_index(begin = var_3398_begin_0, end = var_3398_end_0, end_mask = var_3398_end_mask_0, x = mh_k_33_cast_fp16)[name = string("op_3398_cast_fp16")];
+            tensor<int32, [4]> var_3404_begin_0 = const()[name = string("op_3404_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_3404_end_0 = const()[name = string("op_3404_end_0"), val = tensor<int32, [4]>([1, 8, 128, 1])];
+            tensor<bool, [4]> var_3404_end_mask_0 = const()[name = string("op_3404_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 8, 64, 1]> var_3404_cast_fp16 = slice_by_index(begin = var_3404_begin_0, end = var_3404_end_0, end_mask = var_3404_end_mask_0, x = mh_k_33_cast_fp16)[name = string("op_3404_cast_fp16")];
+            fp16 const_204_promoted_to_fp16 = const()[name = string("const_204_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 64, 1]> var_3406_cast_fp16 = mul(x = var_3404_cast_fp16, y = const_204_promoted_to_fp16)[name = string("op_3406_cast_fp16")];
+            bool var_3408_interleave_0 = const()[name = string("op_3408_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 128, 1]> var_3408_cast_fp16 = concat(axis = var_3268, interleave = var_3408_interleave_0, values = (var_3406_cast_fp16, var_3398_cast_fp16))[name = string("op_3408_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> var_3409_cast_fp16 = mul(x = var_3408_cast_fp16, y = sin_1_cast_fp16)[name = string("op_3409_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> mh_k_35_cast_fp16 = add(x = var_3393_cast_fp16, y = var_3409_cast_fp16)[name = string("mh_k_35_cast_fp16")];
+            tensor<int32, [4]> var_3413 = const()[name = string("op_3413"), val = tensor<int32, [4]>([1, 1024, 1, 1])];
+            tensor<fp16, [1, 1024, 1, 1]> current_key_35_cast_fp16 = reshape(shape = var_3413, x = mh_k_35_cast_fp16)[name = string("current_key_35_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_3420_cast_fp16 = mul(x = var_101_cast_fp16_8, y = var_323_cast_fp16)[name = string("op_3420_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_3421_cast_fp16 = mul(x = current_key_35_cast_fp16, y = var_321_cast_fp16)[name = string("op_3421_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> key_51_cast_fp16 = add(x = var_3420_cast_fp16, y = var_3421_cast_fp16)[name = string("key_51_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_3424_cast_fp16 = mul(x = var_132_cast_fp16_8, y = var_323_cast_fp16)[name = string("op_3424_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_3425_cast_fp16 = mul(x = current_value_17_cast_fp16, y = var_321_cast_fp16)[name = string("op_3425_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> value_33_cast_fp16 = add(x = var_3424_cast_fp16, y = var_3425_cast_fp16)[name = string("value_33_cast_fp16")];
+            tensor<int32, [4]> var_3429 = const()[name = string("op_3429"), val = tensor<int32, [4]>([1, 8, 128, 256])];
+            tensor<fp16, [1, 8, 128, 256]> key_heads_33_cast_fp16 = reshape(shape = var_3429, x = key_51_cast_fp16)[name = string("key_heads_33_cast_fp16")];
+            tensor<int32, [4]> var_3431 = const()[name = string("op_3431"), val = tensor<int32, [4]>([1, 8, 128, 256])];
+            tensor<fp16, [1, 8, 128, 256]> value_heads_33_cast_fp16 = reshape(shape = var_3431, x = value_33_cast_fp16)[name = string("value_heads_33_cast_fp16")];
+            tensor<int32, [4]> var_3434_begin_0 = const()[name = string("op_3434_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_3434_end_0 = const()[name = string("op_3434_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_3434_end_mask_0 = const()[name = string("op_3434_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_3434_cast_fp16 = slice_by_index(begin = var_3434_begin_0, end = var_3434_end_0, end_mask = var_3434_end_mask_0, x = key_heads_33_cast_fp16)[name = string("op_3434_cast_fp16")];
+            tensor<int32, [4]> var_3438_begin_0 = const()[name = string("op_3438_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_3438_end_0 = const()[name = string("op_3438_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_3438_end_mask_0 = const()[name = string("op_3438_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_3438_cast_fp16 = slice_by_index(begin = var_3438_begin_0, end = var_3438_end_0, end_mask = var_3438_end_mask_0, x = value_heads_33_cast_fp16)[name = string("op_3438_cast_fp16")];
+            tensor<int32, [4]> var_3450_begin_0 = const()[name = string("op_3450_begin_0"), val = tensor<int32, [4]>([0, 1, 0, 0])];
+            tensor<int32, [4]> var_3450_end_0 = const()[name = string("op_3450_end_0"), val = tensor<int32, [4]>([1, 2, 128, 256])];
+            tensor<bool, [4]> var_3450_end_mask_0 = const()[name = string("op_3450_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_3450_cast_fp16 = slice_by_index(begin = var_3450_begin_0, end = var_3450_end_0, end_mask = var_3450_end_mask_0, x = key_heads_33_cast_fp16)[name = string("op_3450_cast_fp16")];
+            tensor<int32, [4]> var_3454_begin_0 = const()[name = string("op_3454_begin_0"), val = tensor<int32, [4]>([0, 1, 0, 0])];
+            tensor<int32, [4]> var_3454_end_0 = const()[name = string("op_3454_end_0"), val = tensor<int32, [4]>([1, 2, 128, 256])];
+            tensor<bool, [4]> var_3454_end_mask_0 = const()[name = string("op_3454_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_3454_cast_fp16 = slice_by_index(begin = var_3454_begin_0, end = var_3454_end_0, end_mask = var_3454_end_mask_0, x = value_heads_33_cast_fp16)[name = string("op_3454_cast_fp16")];
+            tensor<int32, [4]> var_3466_begin_0 = const()[name = string("op_3466_begin_0"), val = tensor<int32, [4]>([0, 2, 0, 0])];
+            tensor<int32, [4]> var_3466_end_0 = const()[name = string("op_3466_end_0"), val = tensor<int32, [4]>([1, 3, 128, 256])];
+            tensor<bool, [4]> var_3466_end_mask_0 = const()[name = string("op_3466_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_3466_cast_fp16 = slice_by_index(begin = var_3466_begin_0, end = var_3466_end_0, end_mask = var_3466_end_mask_0, x = key_heads_33_cast_fp16)[name = string("op_3466_cast_fp16")];
+            tensor<int32, [4]> var_3470_begin_0 = const()[name = string("op_3470_begin_0"), val = tensor<int32, [4]>([0, 2, 0, 0])];
+            tensor<int32, [4]> var_3470_end_0 = const()[name = string("op_3470_end_0"), val = tensor<int32, [4]>([1, 3, 128, 256])];
+            tensor<bool, [4]> var_3470_end_mask_0 = const()[name = string("op_3470_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_3470_cast_fp16 = slice_by_index(begin = var_3470_begin_0, end = var_3470_end_0, end_mask = var_3470_end_mask_0, x = value_heads_33_cast_fp16)[name = string("op_3470_cast_fp16")];
+            tensor<int32, [4]> var_3482_begin_0 = const()[name = string("op_3482_begin_0"), val = tensor<int32, [4]>([0, 3, 0, 0])];
+            tensor<int32, [4]> var_3482_end_0 = const()[name = string("op_3482_end_0"), val = tensor<int32, [4]>([1, 4, 128, 256])];
+            tensor<bool, [4]> var_3482_end_mask_0 = const()[name = string("op_3482_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_3482_cast_fp16 = slice_by_index(begin = var_3482_begin_0, end = var_3482_end_0, end_mask = var_3482_end_mask_0, x = key_heads_33_cast_fp16)[name = string("op_3482_cast_fp16")];
+            tensor<int32, [4]> var_3486_begin_0 = const()[name = string("op_3486_begin_0"), val = tensor<int32, [4]>([0, 3, 0, 0])];
+            tensor<int32, [4]> var_3486_end_0 = const()[name = string("op_3486_end_0"), val = tensor<int32, [4]>([1, 4, 128, 256])];
+            tensor<bool, [4]> var_3486_end_mask_0 = const()[name = string("op_3486_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_3486_cast_fp16 = slice_by_index(begin = var_3486_begin_0, end = var_3486_end_0, end_mask = var_3486_end_mask_0, x = value_heads_33_cast_fp16)[name = string("op_3486_cast_fp16")];
+            tensor<int32, [4]> var_3498_begin_0 = const()[name = string("op_3498_begin_0"), val = tensor<int32, [4]>([0, 4, 0, 0])];
+            tensor<int32, [4]> var_3498_end_0 = const()[name = string("op_3498_end_0"), val = tensor<int32, [4]>([1, 5, 128, 256])];
+            tensor<bool, [4]> var_3498_end_mask_0 = const()[name = string("op_3498_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_3498_cast_fp16 = slice_by_index(begin = var_3498_begin_0, end = var_3498_end_0, end_mask = var_3498_end_mask_0, x = key_heads_33_cast_fp16)[name = string("op_3498_cast_fp16")];
+            tensor<int32, [4]> var_3502_begin_0 = const()[name = string("op_3502_begin_0"), val = tensor<int32, [4]>([0, 4, 0, 0])];
+            tensor<int32, [4]> var_3502_end_0 = const()[name = string("op_3502_end_0"), val = tensor<int32, [4]>([1, 5, 128, 256])];
+            tensor<bool, [4]> var_3502_end_mask_0 = const()[name = string("op_3502_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_3502_cast_fp16 = slice_by_index(begin = var_3502_begin_0, end = var_3502_end_0, end_mask = var_3502_end_mask_0, x = value_heads_33_cast_fp16)[name = string("op_3502_cast_fp16")];
+            tensor<int32, [4]> var_3514_begin_0 = const()[name = string("op_3514_begin_0"), val = tensor<int32, [4]>([0, 5, 0, 0])];
+            tensor<int32, [4]> var_3514_end_0 = const()[name = string("op_3514_end_0"), val = tensor<int32, [4]>([1, 6, 128, 256])];
+            tensor<bool, [4]> var_3514_end_mask_0 = const()[name = string("op_3514_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_3514_cast_fp16 = slice_by_index(begin = var_3514_begin_0, end = var_3514_end_0, end_mask = var_3514_end_mask_0, x = key_heads_33_cast_fp16)[name = string("op_3514_cast_fp16")];
+            tensor<int32, [4]> var_3518_begin_0 = const()[name = string("op_3518_begin_0"), val = tensor<int32, [4]>([0, 5, 0, 0])];
+            tensor<int32, [4]> var_3518_end_0 = const()[name = string("op_3518_end_0"), val = tensor<int32, [4]>([1, 6, 128, 256])];
+            tensor<bool, [4]> var_3518_end_mask_0 = const()[name = string("op_3518_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_3518_cast_fp16 = slice_by_index(begin = var_3518_begin_0, end = var_3518_end_0, end_mask = var_3518_end_mask_0, x = value_heads_33_cast_fp16)[name = string("op_3518_cast_fp16")];
+            tensor<int32, [4]> var_3530_begin_0 = const()[name = string("op_3530_begin_0"), val = tensor<int32, [4]>([0, 6, 0, 0])];
+            tensor<int32, [4]> var_3530_end_0 = const()[name = string("op_3530_end_0"), val = tensor<int32, [4]>([1, 7, 128, 256])];
+            tensor<bool, [4]> var_3530_end_mask_0 = const()[name = string("op_3530_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_3530_cast_fp16 = slice_by_index(begin = var_3530_begin_0, end = var_3530_end_0, end_mask = var_3530_end_mask_0, x = key_heads_33_cast_fp16)[name = string("op_3530_cast_fp16")];
+            tensor<int32, [4]> var_3534_begin_0 = const()[name = string("op_3534_begin_0"), val = tensor<int32, [4]>([0, 6, 0, 0])];
+            tensor<int32, [4]> var_3534_end_0 = const()[name = string("op_3534_end_0"), val = tensor<int32, [4]>([1, 7, 128, 256])];
+            tensor<bool, [4]> var_3534_end_mask_0 = const()[name = string("op_3534_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_3534_cast_fp16 = slice_by_index(begin = var_3534_begin_0, end = var_3534_end_0, end_mask = var_3534_end_mask_0, x = value_heads_33_cast_fp16)[name = string("op_3534_cast_fp16")];
+            tensor<int32, [4]> var_3546_begin_0 = const()[name = string("op_3546_begin_0"), val = tensor<int32, [4]>([0, 7, 0, 0])];
+            tensor<int32, [4]> var_3546_end_0 = const()[name = string("op_3546_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_3546_end_mask_0 = const()[name = string("op_3546_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_3546_cast_fp16 = slice_by_index(begin = var_3546_begin_0, end = var_3546_end_0, end_mask = var_3546_end_mask_0, x = key_heads_33_cast_fp16)[name = string("op_3546_cast_fp16")];
+            tensor<int32, [4]> var_3550_begin_0 = const()[name = string("op_3550_begin_0"), val = tensor<int32, [4]>([0, 7, 0, 0])];
+            tensor<int32, [4]> var_3550_end_0 = const()[name = string("op_3550_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_3550_end_mask_0 = const()[name = string("op_3550_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_3550_cast_fp16 = slice_by_index(begin = var_3550_begin_0, end = var_3550_end_0, end_mask = var_3550_end_mask_0, x = value_heads_33_cast_fp16)[name = string("op_3550_cast_fp16")];
+            bool key_heads_35_interleave_0 = const()[name = string("key_heads_35_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 256]> key_heads_35_cast_fp16 = concat(axis = var_3276, interleave = key_heads_35_interleave_0, values = (var_3434_cast_fp16, var_3434_cast_fp16, var_3450_cast_fp16, var_3450_cast_fp16, var_3466_cast_fp16, var_3466_cast_fp16, var_3482_cast_fp16, var_3482_cast_fp16, var_3498_cast_fp16, var_3498_cast_fp16, var_3514_cast_fp16, var_3514_cast_fp16, var_3530_cast_fp16, var_3530_cast_fp16, var_3546_cast_fp16, var_3546_cast_fp16))[name = string("key_heads_35_cast_fp16")];
+            bool value_heads_35_interleave_0 = const()[name = string("value_heads_35_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 256]> value_heads_35_cast_fp16 = concat(axis = var_3276, interleave = value_heads_35_interleave_0, values = (var_3438_cast_fp16, var_3438_cast_fp16, var_3454_cast_fp16, var_3454_cast_fp16, var_3470_cast_fp16, var_3470_cast_fp16, var_3486_cast_fp16, var_3486_cast_fp16, var_3502_cast_fp16, var_3502_cast_fp16, var_3518_cast_fp16, var_3518_cast_fp16, var_3534_cast_fp16, var_3534_cast_fp16, var_3550_cast_fp16, var_3550_cast_fp16))[name = string("value_heads_35_cast_fp16")];
+            fp16 var_3573_to_fp16 = const()[name = string("op_3573_to_fp16"), val = fp16(0x1.6ap-4)];
+            tensor<fp16, [1, 16, 128, 1]> var_3574_cast_fp16 = mul(x = mh_q_51_cast_fp16, y = var_3573_to_fp16)[name = string("op_3574_cast_fp16")];
+            bool mh_w_33_transpose_x_0 = const()[name = string("mh_w_33_transpose_x_0"), val = bool(true)];
+            bool mh_w_33_transpose_y_0 = const()[name = string("mh_w_33_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 1, 256]> mh_w_33_cast_fp16 = matmul(transpose_x = mh_w_33_transpose_x_0, transpose_y = mh_w_33_transpose_y_0, x = var_3574_cast_fp16, y = key_heads_35_cast_fp16)[name = string("mh_w_33_cast_fp16")];
+            tensor<fp16, [1, 16, 1, 256]> mh_w_35_cast_fp16 = add(x = mh_w_33_cast_fp16, y = var_487_cast_fp16)[name = string("mh_w_35_cast_fp16")];
+            tensor<fp16, [1, 16, 1, 256]> var_3586_cast_fp16 = softmax(axis = var_3258, x = mh_w_35_cast_fp16)[name = string("op_3586_cast_fp16")];
+            bool attn_17_transpose_x_0 = const()[name = string("attn_17_transpose_x_0"), val = bool(false)];
+            bool attn_17_transpose_y_0 = const()[name = string("attn_17_transpose_y_0"), val = bool(true)];
+            tensor<fp16, [1, 16, 128, 1]> attn_17_cast_fp16 = matmul(transpose_x = attn_17_transpose_x_0, transpose_y = attn_17_transpose_y_0, x = value_heads_35_cast_fp16, y = var_3586_cast_fp16)[name = string("attn_17_cast_fp16")];
+            tensor<int32, [4]> var_3591 = const()[name = string("op_3591"), val = tensor<int32, [4]>([1, -1, 1, 1])];
+            tensor<fp16, [1, 2048, 1, 1]> input_65_cast_fp16 = reshape(shape = var_3591, x = attn_17_cast_fp16)[name = string("input_65_cast_fp16")];
+            string obj_75_pad_type_0 = const()[name = string("obj_75_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> obj_75_strides_0 = const()[name = string("obj_75_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> obj_75_pad_0 = const()[name = string("obj_75_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> obj_75_dilations_0 = const()[name = string("obj_75_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 obj_75_groups_0 = const()[name = string("obj_75_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 2048, 1, 1]> layers_8_self_attn_o_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(130176384))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(132273600))))[name = string("layers_8_self_attn_o_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> obj_75_cast_fp16 = conv(bias = layers_0_self_attn_v_proj_bias_to_fp16, dilations = obj_75_dilations_0, groups = obj_75_groups_0, pad = obj_75_pad_0, pad_type = obj_75_pad_type_0, strides = obj_75_strides_0, weight = layers_8_self_attn_o_proj_weight_to_fp16_palettized, x = input_65_cast_fp16)[name = string("obj_75_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> inputs_69_cast_fp16 = add(x = inputs_63_cast_fp16, y = obj_75_cast_fp16)[name = string("inputs_69_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> inputs_sq_71_cast_fp16 = mul(x = inputs_69_cast_fp16, y = inputs_69_cast_fp16)[name = string("inputs_sq_71_cast_fp16")];
+            tensor<int32, [1]> variance_71_axes_0 = const()[name = string("variance_71_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_71_keep_dims_0 = const()[name = string("variance_71_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 1, 1, 1]> variance_71_cast_fp16 = reduce_mean(axes = variance_71_axes_0, keep_dims = variance_71_keep_dims_0, x = inputs_sq_71_cast_fp16)[name = string("variance_71_cast_fp16")];
+            fp16 var_3609_to_fp16 = const()[name = string("op_3609_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 1, 1]> var_3610_cast_fp16 = add(x = variance_71_cast_fp16, y = var_3609_to_fp16)[name = string("op_3610_cast_fp16")];
+            fp32 var_3611_epsilon_0 = const()[name = string("op_3611_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 1, 1, 1]> var_3611_cast_fp16 = rsqrt(epsilon = var_3611_epsilon_0, x = var_3610_cast_fp16)[name = string("op_3611_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> hidden_states_87_cast_fp16 = mul(x = inputs_69_cast_fp16, y = var_3611_cast_fp16)[name = string("hidden_states_87_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> w_71_to_fp16 = const()[name = string("w_71_to_fp16"), val = tensor<fp16, [1, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(132274176)))];
+            tensor<fp16, [1, 1024, 1, 1]> input_67_cast_fp16 = mul(x = w_71_to_fp16, y = hidden_states_87_cast_fp16)[name = string("input_67_cast_fp16")];
+            string input_69_pad_type_0 = const()[name = string("input_69_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> input_69_strides_0 = const()[name = string("input_69_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> input_69_pad_0 = const()[name = string("input_69_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> input_69_dilations_0 = const()[name = string("input_69_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 input_69_groups_0 = const()[name = string("input_69_groups_0"), val = int32(1)];
+            tensor<fp16, [3072, 1024, 1, 1]> layers_8_mlp_gate_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [3072, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(132276288))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(135422080))))[name = string("layers_8_mlp_gate_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 3072, 1, 1]> input_69_cast_fp16 = conv(dilations = input_69_dilations_0, groups = input_69_groups_0, pad = input_69_pad_0, pad_type = input_69_pad_type_0, strides = input_69_strides_0, weight = layers_8_mlp_gate_proj_weight_to_fp16_palettized, x = input_67_cast_fp16)[name = string("input_69_cast_fp16")];
+            tensor<fp16, [1, 3072, 1, 1]> var_3625_cast_fp16 = silu(x = input_69_cast_fp16)[name = string("op_3625_cast_fp16")];
+            string var_3631_pad_type_0 = const()[name = string("op_3631_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_3631_strides_0 = const()[name = string("op_3631_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_3631_pad_0 = const()[name = string("op_3631_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_3631_dilations_0 = const()[name = string("op_3631_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_3631_groups_0 = const()[name = string("op_3631_groups_0"), val = int32(1)];
+            tensor<fp16, [3072, 1024, 1, 1]> layers_8_mlp_up_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [3072, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(135422656))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(138568448))))[name = string("layers_8_mlp_up_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 3072, 1, 1]> var_3631_cast_fp16 = conv(dilations = var_3631_dilations_0, groups = var_3631_groups_0, pad = var_3631_pad_0, pad_type = var_3631_pad_type_0, strides = var_3631_strides_0, weight = layers_8_mlp_up_proj_weight_to_fp16_palettized, x = input_67_cast_fp16)[name = string("op_3631_cast_fp16")];
+            tensor<fp16, [1, 3072, 1, 1]> input_71_cast_fp16 = mul(x = var_3625_cast_fp16, y = var_3631_cast_fp16)[name = string("input_71_cast_fp16")];
+            string hidden_states_89_pad_type_0 = const()[name = string("hidden_states_89_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> hidden_states_89_strides_0 = const()[name = string("hidden_states_89_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> hidden_states_89_pad_0 = const()[name = string("hidden_states_89_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> hidden_states_89_dilations_0 = const()[name = string("hidden_states_89_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 hidden_states_89_groups_0 = const()[name = string("hidden_states_89_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 3072, 1, 1]> layers_8_mlp_down_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 3072, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(138569024))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(141714816))))[name = string("layers_8_mlp_down_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> hidden_states_89_cast_fp16 = conv(dilations = hidden_states_89_dilations_0, groups = hidden_states_89_groups_0, pad = hidden_states_89_pad_0, pad_type = hidden_states_89_pad_type_0, strides = hidden_states_89_strides_0, weight = layers_8_mlp_down_proj_weight_to_fp16_palettized, x = input_71_cast_fp16)[name = string("hidden_states_89_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> inputs_71_cast_fp16 = add(x = inputs_69_cast_fp16, y = hidden_states_89_cast_fp16)[name = string("inputs_71_cast_fp16")];
+            int32 var_3645 = const()[name = string("op_3645"), val = int32(3)];
+            int32 var_3655 = const()[name = string("op_3655"), val = int32(-2)];
+            int32 var_3663 = const()[name = string("op_3663"), val = int32(1)];
+            tensor<fp16, [1, 1024, 1, 1]> inputs_sq_73_cast_fp16 = mul(x = inputs_71_cast_fp16, y = inputs_71_cast_fp16)[name = string("inputs_sq_73_cast_fp16")];
+            tensor<int32, [1]> variance_73_axes_0 = const()[name = string("variance_73_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_73_keep_dims_0 = const()[name = string("variance_73_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 1, 1, 1]> variance_73_cast_fp16 = reduce_mean(axes = variance_73_axes_0, keep_dims = variance_73_keep_dims_0, x = inputs_sq_73_cast_fp16)[name = string("variance_73_cast_fp16")];
+            fp16 var_3675_to_fp16 = const()[name = string("op_3675_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 1, 1]> var_3676_cast_fp16 = add(x = variance_73_cast_fp16, y = var_3675_to_fp16)[name = string("op_3676_cast_fp16")];
+            fp32 var_3677_epsilon_0 = const()[name = string("op_3677_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 1, 1, 1]> var_3677_cast_fp16 = rsqrt(epsilon = var_3677_epsilon_0, x = var_3676_cast_fp16)[name = string("op_3677_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> hidden_states_91_cast_fp16 = mul(x = inputs_71_cast_fp16, y = var_3677_cast_fp16)[name = string("hidden_states_91_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> w_73_to_fp16 = const()[name = string("w_73_to_fp16"), val = tensor<fp16, [1, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(141715392)))];
+            tensor<fp16, [1, 1024, 1, 1]> obj_77_cast_fp16 = mul(x = w_73_to_fp16, y = hidden_states_91_cast_fp16)[name = string("obj_77_cast_fp16")];
+            string query_55_pad_type_0 = const()[name = string("query_55_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> query_55_strides_0 = const()[name = string("query_55_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> query_55_pad_0 = const()[name = string("query_55_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> query_55_dilations_0 = const()[name = string("query_55_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 query_55_groups_0 = const()[name = string("query_55_groups_0"), val = int32(1)];
+            tensor<fp16, [2048, 1024, 1, 1]> layers_9_self_attn_q_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [2048, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(141717504))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(143814720))))[name = string("layers_9_self_attn_q_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1, 1]> query_55_cast_fp16 = conv(bias = layers_0_self_attn_q_proj_bias_to_fp16, dilations = query_55_dilations_0, groups = query_55_groups_0, pad = query_55_pad_0, pad_type = query_55_pad_type_0, strides = query_55_strides_0, weight = layers_9_self_attn_q_proj_weight_to_fp16_palettized, x = obj_77_cast_fp16)[name = string("query_55_cast_fp16")];
+            string current_key_37_pad_type_0 = const()[name = string("current_key_37_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> current_key_37_strides_0 = const()[name = string("current_key_37_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> current_key_37_pad_0 = const()[name = string("current_key_37_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> current_key_37_dilations_0 = const()[name = string("current_key_37_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 current_key_37_groups_0 = const()[name = string("current_key_37_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_9_self_attn_k_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(143815296))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(144863936))))[name = string("layers_9_self_attn_k_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> current_key_37_cast_fp16 = conv(dilations = current_key_37_dilations_0, groups = current_key_37_groups_0, pad = current_key_37_pad_0, pad_type = current_key_37_pad_type_0, strides = current_key_37_strides_0, weight = layers_9_self_attn_k_proj_weight_to_fp16_palettized, x = obj_77_cast_fp16)[name = string("current_key_37_cast_fp16")];
+            string current_value_19_pad_type_0 = const()[name = string("current_value_19_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> current_value_19_strides_0 = const()[name = string("current_value_19_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> current_value_19_pad_0 = const()[name = string("current_value_19_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> current_value_19_dilations_0 = const()[name = string("current_value_19_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 current_value_19_groups_0 = const()[name = string("current_value_19_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_9_self_attn_v_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(144864512))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(145913152))))[name = string("layers_9_self_attn_v_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> current_value_19_cast_fp16 = conv(bias = layers_0_self_attn_v_proj_bias_to_fp16, dilations = current_value_19_dilations_0, groups = current_value_19_groups_0, pad = current_value_19_pad_0, pad_type = current_value_19_pad_type_0, strides = current_value_19_strides_0, weight = layers_9_self_attn_v_proj_weight_to_fp16_palettized, x = obj_77_cast_fp16)[name = string("current_value_19_cast_fp16")];
+            tensor<int32, [4]> var_3714 = const()[name = string("op_3714"), val = tensor<int32, [4]>([16, 128, 1, 1])];
+            tensor<fp16, [16, 128, 1, 1]> inputs_73_cast_fp16 = reshape(shape = var_3714, x = query_55_cast_fp16)[name = string("inputs_73_cast_fp16")];
+            tensor<fp16, [16, 128, 1, 1]> inputs_sq_75_cast_fp16 = mul(x = inputs_73_cast_fp16, y = inputs_73_cast_fp16)[name = string("inputs_sq_75_cast_fp16")];
+            tensor<int32, [1]> variance_75_axes_0 = const()[name = string("variance_75_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_75_keep_dims_0 = const()[name = string("variance_75_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [16, 1, 1, 1]> variance_75_cast_fp16 = reduce_mean(axes = variance_75_axes_0, keep_dims = variance_75_keep_dims_0, x = inputs_sq_75_cast_fp16)[name = string("variance_75_cast_fp16")];
+            fp16 var_3720_to_fp16 = const()[name = string("op_3720_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [16, 1, 1, 1]> var_3721_cast_fp16 = add(x = variance_75_cast_fp16, y = var_3720_to_fp16)[name = string("op_3721_cast_fp16")];
+            fp32 var_3722_epsilon_0 = const()[name = string("op_3722_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [16, 1, 1, 1]> var_3722_cast_fp16 = rsqrt(epsilon = var_3722_epsilon_0, x = var_3721_cast_fp16)[name = string("op_3722_cast_fp16")];
+            tensor<fp16, [16, 128, 1, 1]> hidden_states_93_cast_fp16 = mul(x = inputs_73_cast_fp16, y = var_3722_cast_fp16)[name = string("hidden_states_93_cast_fp16")];
+            tensor<fp16, [1, 128, 1, 1]> w_75_to_fp16 = const()[name = string("w_75_to_fp16"), val = tensor<fp16, [1, 128, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(145913728)))];
+            tensor<fp16, [16, 128, 1, 1]> query_normed_19_cast_fp16 = mul(x = w_75_to_fp16, y = hidden_states_93_cast_fp16)[name = string("query_normed_19_cast_fp16")];
+            tensor<int32, [4]> var_3730 = const()[name = string("op_3730"), val = tensor<int32, [4]>([8, 128, 1, 1])];
+            tensor<fp16, [8, 128, 1, 1]> inputs_75_cast_fp16 = reshape(shape = var_3730, x = current_key_37_cast_fp16)[name = string("inputs_75_cast_fp16")];
+            tensor<fp16, [8, 128, 1, 1]> inputs_sq_77_cast_fp16 = mul(x = inputs_75_cast_fp16, y = inputs_75_cast_fp16)[name = string("inputs_sq_77_cast_fp16")];
+            tensor<int32, [1]> variance_77_axes_0 = const()[name = string("variance_77_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_77_keep_dims_0 = const()[name = string("variance_77_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [8, 1, 1, 1]> variance_77_cast_fp16 = reduce_mean(axes = variance_77_axes_0, keep_dims = variance_77_keep_dims_0, x = inputs_sq_77_cast_fp16)[name = string("variance_77_cast_fp16")];
+            fp16 var_3736_to_fp16 = const()[name = string("op_3736_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [8, 1, 1, 1]> var_3737_cast_fp16 = add(x = variance_77_cast_fp16, y = var_3736_to_fp16)[name = string("op_3737_cast_fp16")];
+            fp32 var_3738_epsilon_0 = const()[name = string("op_3738_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [8, 1, 1, 1]> var_3738_cast_fp16 = rsqrt(epsilon = var_3738_epsilon_0, x = var_3737_cast_fp16)[name = string("op_3738_cast_fp16")];
+            tensor<fp16, [8, 128, 1, 1]> hidden_states_95_cast_fp16 = mul(x = inputs_75_cast_fp16, y = var_3738_cast_fp16)[name = string("hidden_states_95_cast_fp16")];
+            tensor<fp16, [1, 128, 1, 1]> w_77_to_fp16 = const()[name = string("w_77_to_fp16"), val = tensor<fp16, [1, 128, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(145914048)))];
+            tensor<fp16, [8, 128, 1, 1]> current_key_normed_19_cast_fp16 = mul(x = w_77_to_fp16, y = hidden_states_95_cast_fp16)[name = string("current_key_normed_19_cast_fp16")];
+            tensor<int32, [4]> var_3756 = const()[name = string("op_3756"), val = tensor<int32, [4]>([1, 16, 128, -1])];
+            tensor<fp16, [1, 16, 128, 1]> mh_q_55_cast_fp16 = reshape(shape = var_3756, x = query_normed_19_cast_fp16)[name = string("mh_q_55_cast_fp16")];
+            tensor<int32, [4]> var_3758 = const()[name = string("op_3758"), val = tensor<int32, [4]>([1, 8, 128, -1])];
+            tensor<fp16, [1, 8, 128, 1]> mh_k_37_cast_fp16 = reshape(shape = var_3758, x = current_key_normed_19_cast_fp16)[name = string("mh_k_37_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> var_3762_cast_fp16 = mul(x = mh_q_55_cast_fp16, y = cos_1_cast_fp16)[name = string("op_3762_cast_fp16")];
+            tensor<int32, [4]> var_3767_begin_0 = const()[name = string("op_3767_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_3767_end_0 = const()[name = string("op_3767_end_0"), val = tensor<int32, [4]>([1, 16, 64, 1])];
+            tensor<bool, [4]> var_3767_end_mask_0 = const()[name = string("op_3767_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 16, 64, 1]> var_3767_cast_fp16 = slice_by_index(begin = var_3767_begin_0, end = var_3767_end_0, end_mask = var_3767_end_mask_0, x = mh_q_55_cast_fp16)[name = string("op_3767_cast_fp16")];
+            tensor<int32, [4]> var_3773_begin_0 = const()[name = string("op_3773_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_3773_end_0 = const()[name = string("op_3773_end_0"), val = tensor<int32, [4]>([1, 16, 128, 1])];
+            tensor<bool, [4]> var_3773_end_mask_0 = const()[name = string("op_3773_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 16, 64, 1]> var_3773_cast_fp16 = slice_by_index(begin = var_3773_begin_0, end = var_3773_end_0, end_mask = var_3773_end_mask_0, x = mh_q_55_cast_fp16)[name = string("op_3773_cast_fp16")];
+            fp16 const_224_promoted_to_fp16 = const()[name = string("const_224_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 16, 64, 1]> var_3775_cast_fp16 = mul(x = var_3773_cast_fp16, y = const_224_promoted_to_fp16)[name = string("op_3775_cast_fp16")];
+            bool var_3777_interleave_0 = const()[name = string("op_3777_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 1]> var_3777_cast_fp16 = concat(axis = var_3655, interleave = var_3777_interleave_0, values = (var_3775_cast_fp16, var_3767_cast_fp16))[name = string("op_3777_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> var_3778_cast_fp16 = mul(x = var_3777_cast_fp16, y = sin_1_cast_fp16)[name = string("op_3778_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> mh_q_57_cast_fp16 = add(x = var_3762_cast_fp16, y = var_3778_cast_fp16)[name = string("mh_q_57_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> var_3780_cast_fp16 = mul(x = mh_k_37_cast_fp16, y = cos_1_cast_fp16)[name = string("op_3780_cast_fp16")];
+            tensor<int32, [4]> var_3785_begin_0 = const()[name = string("op_3785_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_3785_end_0 = const()[name = string("op_3785_end_0"), val = tensor<int32, [4]>([1, 8, 64, 1])];
+            tensor<bool, [4]> var_3785_end_mask_0 = const()[name = string("op_3785_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 8, 64, 1]> var_3785_cast_fp16 = slice_by_index(begin = var_3785_begin_0, end = var_3785_end_0, end_mask = var_3785_end_mask_0, x = mh_k_37_cast_fp16)[name = string("op_3785_cast_fp16")];
+            tensor<int32, [4]> var_3791_begin_0 = const()[name = string("op_3791_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_3791_end_0 = const()[name = string("op_3791_end_0"), val = tensor<int32, [4]>([1, 8, 128, 1])];
+            tensor<bool, [4]> var_3791_end_mask_0 = const()[name = string("op_3791_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 8, 64, 1]> var_3791_cast_fp16 = slice_by_index(begin = var_3791_begin_0, end = var_3791_end_0, end_mask = var_3791_end_mask_0, x = mh_k_37_cast_fp16)[name = string("op_3791_cast_fp16")];
+            fp16 const_227_promoted_to_fp16 = const()[name = string("const_227_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 64, 1]> var_3793_cast_fp16 = mul(x = var_3791_cast_fp16, y = const_227_promoted_to_fp16)[name = string("op_3793_cast_fp16")];
+            bool var_3795_interleave_0 = const()[name = string("op_3795_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 128, 1]> var_3795_cast_fp16 = concat(axis = var_3655, interleave = var_3795_interleave_0, values = (var_3793_cast_fp16, var_3785_cast_fp16))[name = string("op_3795_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> var_3796_cast_fp16 = mul(x = var_3795_cast_fp16, y = sin_1_cast_fp16)[name = string("op_3796_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> mh_k_39_cast_fp16 = add(x = var_3780_cast_fp16, y = var_3796_cast_fp16)[name = string("mh_k_39_cast_fp16")];
+            tensor<int32, [4]> var_3800 = const()[name = string("op_3800"), val = tensor<int32, [4]>([1, 1024, 1, 1])];
+            tensor<fp16, [1, 1024, 1, 1]> current_key_39_cast_fp16 = reshape(shape = var_3800, x = mh_k_39_cast_fp16)[name = string("current_key_39_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_3807_cast_fp16 = mul(x = var_101_cast_fp16_9, y = var_323_cast_fp16)[name = string("op_3807_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_3808_cast_fp16 = mul(x = current_key_39_cast_fp16, y = var_321_cast_fp16)[name = string("op_3808_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> key_57_cast_fp16 = add(x = var_3807_cast_fp16, y = var_3808_cast_fp16)[name = string("key_57_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_3811_cast_fp16 = mul(x = var_132_cast_fp16_9, y = var_323_cast_fp16)[name = string("op_3811_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_3812_cast_fp16 = mul(x = current_value_19_cast_fp16, y = var_321_cast_fp16)[name = string("op_3812_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> value_37_cast_fp16 = add(x = var_3811_cast_fp16, y = var_3812_cast_fp16)[name = string("value_37_cast_fp16")];
+            tensor<int32, [4]> var_3816 = const()[name = string("op_3816"), val = tensor<int32, [4]>([1, 8, 128, 256])];
+            tensor<fp16, [1, 8, 128, 256]> key_heads_37_cast_fp16 = reshape(shape = var_3816, x = key_57_cast_fp16)[name = string("key_heads_37_cast_fp16")];
+            tensor<int32, [4]> var_3818 = const()[name = string("op_3818"), val = tensor<int32, [4]>([1, 8, 128, 256])];
+            tensor<fp16, [1, 8, 128, 256]> value_heads_37_cast_fp16 = reshape(shape = var_3818, x = value_37_cast_fp16)[name = string("value_heads_37_cast_fp16")];
+            tensor<int32, [4]> var_3821_begin_0 = const()[name = string("op_3821_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_3821_end_0 = const()[name = string("op_3821_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_3821_end_mask_0 = const()[name = string("op_3821_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_3821_cast_fp16 = slice_by_index(begin = var_3821_begin_0, end = var_3821_end_0, end_mask = var_3821_end_mask_0, x = key_heads_37_cast_fp16)[name = string("op_3821_cast_fp16")];
+            tensor<int32, [4]> var_3825_begin_0 = const()[name = string("op_3825_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_3825_end_0 = const()[name = string("op_3825_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_3825_end_mask_0 = const()[name = string("op_3825_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_3825_cast_fp16 = slice_by_index(begin = var_3825_begin_0, end = var_3825_end_0, end_mask = var_3825_end_mask_0, x = value_heads_37_cast_fp16)[name = string("op_3825_cast_fp16")];
+            tensor<int32, [4]> var_3837_begin_0 = const()[name = string("op_3837_begin_0"), val = tensor<int32, [4]>([0, 1, 0, 0])];
+            tensor<int32, [4]> var_3837_end_0 = const()[name = string("op_3837_end_0"), val = tensor<int32, [4]>([1, 2, 128, 256])];
+            tensor<bool, [4]> var_3837_end_mask_0 = const()[name = string("op_3837_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_3837_cast_fp16 = slice_by_index(begin = var_3837_begin_0, end = var_3837_end_0, end_mask = var_3837_end_mask_0, x = key_heads_37_cast_fp16)[name = string("op_3837_cast_fp16")];
+            tensor<int32, [4]> var_3841_begin_0 = const()[name = string("op_3841_begin_0"), val = tensor<int32, [4]>([0, 1, 0, 0])];
+            tensor<int32, [4]> var_3841_end_0 = const()[name = string("op_3841_end_0"), val = tensor<int32, [4]>([1, 2, 128, 256])];
+            tensor<bool, [4]> var_3841_end_mask_0 = const()[name = string("op_3841_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_3841_cast_fp16 = slice_by_index(begin = var_3841_begin_0, end = var_3841_end_0, end_mask = var_3841_end_mask_0, x = value_heads_37_cast_fp16)[name = string("op_3841_cast_fp16")];
+            tensor<int32, [4]> var_3853_begin_0 = const()[name = string("op_3853_begin_0"), val = tensor<int32, [4]>([0, 2, 0, 0])];
+            tensor<int32, [4]> var_3853_end_0 = const()[name = string("op_3853_end_0"), val = tensor<int32, [4]>([1, 3, 128, 256])];
+            tensor<bool, [4]> var_3853_end_mask_0 = const()[name = string("op_3853_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_3853_cast_fp16 = slice_by_index(begin = var_3853_begin_0, end = var_3853_end_0, end_mask = var_3853_end_mask_0, x = key_heads_37_cast_fp16)[name = string("op_3853_cast_fp16")];
+            tensor<int32, [4]> var_3857_begin_0 = const()[name = string("op_3857_begin_0"), val = tensor<int32, [4]>([0, 2, 0, 0])];
+            tensor<int32, [4]> var_3857_end_0 = const()[name = string("op_3857_end_0"), val = tensor<int32, [4]>([1, 3, 128, 256])];
+            tensor<bool, [4]> var_3857_end_mask_0 = const()[name = string("op_3857_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_3857_cast_fp16 = slice_by_index(begin = var_3857_begin_0, end = var_3857_end_0, end_mask = var_3857_end_mask_0, x = value_heads_37_cast_fp16)[name = string("op_3857_cast_fp16")];
+            tensor<int32, [4]> var_3869_begin_0 = const()[name = string("op_3869_begin_0"), val = tensor<int32, [4]>([0, 3, 0, 0])];
+            tensor<int32, [4]> var_3869_end_0 = const()[name = string("op_3869_end_0"), val = tensor<int32, [4]>([1, 4, 128, 256])];
+            tensor<bool, [4]> var_3869_end_mask_0 = const()[name = string("op_3869_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_3869_cast_fp16 = slice_by_index(begin = var_3869_begin_0, end = var_3869_end_0, end_mask = var_3869_end_mask_0, x = key_heads_37_cast_fp16)[name = string("op_3869_cast_fp16")];
+            tensor<int32, [4]> var_3873_begin_0 = const()[name = string("op_3873_begin_0"), val = tensor<int32, [4]>([0, 3, 0, 0])];
+            tensor<int32, [4]> var_3873_end_0 = const()[name = string("op_3873_end_0"), val = tensor<int32, [4]>([1, 4, 128, 256])];
+            tensor<bool, [4]> var_3873_end_mask_0 = const()[name = string("op_3873_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_3873_cast_fp16 = slice_by_index(begin = var_3873_begin_0, end = var_3873_end_0, end_mask = var_3873_end_mask_0, x = value_heads_37_cast_fp16)[name = string("op_3873_cast_fp16")];
+            tensor<int32, [4]> var_3885_begin_0 = const()[name = string("op_3885_begin_0"), val = tensor<int32, [4]>([0, 4, 0, 0])];
+            tensor<int32, [4]> var_3885_end_0 = const()[name = string("op_3885_end_0"), val = tensor<int32, [4]>([1, 5, 128, 256])];
+            tensor<bool, [4]> var_3885_end_mask_0 = const()[name = string("op_3885_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_3885_cast_fp16 = slice_by_index(begin = var_3885_begin_0, end = var_3885_end_0, end_mask = var_3885_end_mask_0, x = key_heads_37_cast_fp16)[name = string("op_3885_cast_fp16")];
+            tensor<int32, [4]> var_3889_begin_0 = const()[name = string("op_3889_begin_0"), val = tensor<int32, [4]>([0, 4, 0, 0])];
+            tensor<int32, [4]> var_3889_end_0 = const()[name = string("op_3889_end_0"), val = tensor<int32, [4]>([1, 5, 128, 256])];
+            tensor<bool, [4]> var_3889_end_mask_0 = const()[name = string("op_3889_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_3889_cast_fp16 = slice_by_index(begin = var_3889_begin_0, end = var_3889_end_0, end_mask = var_3889_end_mask_0, x = value_heads_37_cast_fp16)[name = string("op_3889_cast_fp16")];
+            tensor<int32, [4]> var_3901_begin_0 = const()[name = string("op_3901_begin_0"), val = tensor<int32, [4]>([0, 5, 0, 0])];
+            tensor<int32, [4]> var_3901_end_0 = const()[name = string("op_3901_end_0"), val = tensor<int32, [4]>([1, 6, 128, 256])];
+            tensor<bool, [4]> var_3901_end_mask_0 = const()[name = string("op_3901_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_3901_cast_fp16 = slice_by_index(begin = var_3901_begin_0, end = var_3901_end_0, end_mask = var_3901_end_mask_0, x = key_heads_37_cast_fp16)[name = string("op_3901_cast_fp16")];
+            tensor<int32, [4]> var_3905_begin_0 = const()[name = string("op_3905_begin_0"), val = tensor<int32, [4]>([0, 5, 0, 0])];
+            tensor<int32, [4]> var_3905_end_0 = const()[name = string("op_3905_end_0"), val = tensor<int32, [4]>([1, 6, 128, 256])];
+            tensor<bool, [4]> var_3905_end_mask_0 = const()[name = string("op_3905_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_3905_cast_fp16 = slice_by_index(begin = var_3905_begin_0, end = var_3905_end_0, end_mask = var_3905_end_mask_0, x = value_heads_37_cast_fp16)[name = string("op_3905_cast_fp16")];
+            tensor<int32, [4]> var_3917_begin_0 = const()[name = string("op_3917_begin_0"), val = tensor<int32, [4]>([0, 6, 0, 0])];
+            tensor<int32, [4]> var_3917_end_0 = const()[name = string("op_3917_end_0"), val = tensor<int32, [4]>([1, 7, 128, 256])];
+            tensor<bool, [4]> var_3917_end_mask_0 = const()[name = string("op_3917_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_3917_cast_fp16 = slice_by_index(begin = var_3917_begin_0, end = var_3917_end_0, end_mask = var_3917_end_mask_0, x = key_heads_37_cast_fp16)[name = string("op_3917_cast_fp16")];
+            tensor<int32, [4]> var_3921_begin_0 = const()[name = string("op_3921_begin_0"), val = tensor<int32, [4]>([0, 6, 0, 0])];
+            tensor<int32, [4]> var_3921_end_0 = const()[name = string("op_3921_end_0"), val = tensor<int32, [4]>([1, 7, 128, 256])];
+            tensor<bool, [4]> var_3921_end_mask_0 = const()[name = string("op_3921_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_3921_cast_fp16 = slice_by_index(begin = var_3921_begin_0, end = var_3921_end_0, end_mask = var_3921_end_mask_0, x = value_heads_37_cast_fp16)[name = string("op_3921_cast_fp16")];
+            tensor<int32, [4]> var_3933_begin_0 = const()[name = string("op_3933_begin_0"), val = tensor<int32, [4]>([0, 7, 0, 0])];
+            tensor<int32, [4]> var_3933_end_0 = const()[name = string("op_3933_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_3933_end_mask_0 = const()[name = string("op_3933_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_3933_cast_fp16 = slice_by_index(begin = var_3933_begin_0, end = var_3933_end_0, end_mask = var_3933_end_mask_0, x = key_heads_37_cast_fp16)[name = string("op_3933_cast_fp16")];
+            tensor<int32, [4]> var_3937_begin_0 = const()[name = string("op_3937_begin_0"), val = tensor<int32, [4]>([0, 7, 0, 0])];
+            tensor<int32, [4]> var_3937_end_0 = const()[name = string("op_3937_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_3937_end_mask_0 = const()[name = string("op_3937_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_3937_cast_fp16 = slice_by_index(begin = var_3937_begin_0, end = var_3937_end_0, end_mask = var_3937_end_mask_0, x = value_heads_37_cast_fp16)[name = string("op_3937_cast_fp16")];
+            bool key_heads_39_interleave_0 = const()[name = string("key_heads_39_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 256]> key_heads_39_cast_fp16 = concat(axis = var_3663, interleave = key_heads_39_interleave_0, values = (var_3821_cast_fp16, var_3821_cast_fp16, var_3837_cast_fp16, var_3837_cast_fp16, var_3853_cast_fp16, var_3853_cast_fp16, var_3869_cast_fp16, var_3869_cast_fp16, var_3885_cast_fp16, var_3885_cast_fp16, var_3901_cast_fp16, var_3901_cast_fp16, var_3917_cast_fp16, var_3917_cast_fp16, var_3933_cast_fp16, var_3933_cast_fp16))[name = string("key_heads_39_cast_fp16")];
+            bool value_heads_39_interleave_0 = const()[name = string("value_heads_39_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 256]> value_heads_39_cast_fp16 = concat(axis = var_3663, interleave = value_heads_39_interleave_0, values = (var_3825_cast_fp16, var_3825_cast_fp16, var_3841_cast_fp16, var_3841_cast_fp16, var_3857_cast_fp16, var_3857_cast_fp16, var_3873_cast_fp16, var_3873_cast_fp16, var_3889_cast_fp16, var_3889_cast_fp16, var_3905_cast_fp16, var_3905_cast_fp16, var_3921_cast_fp16, var_3921_cast_fp16, var_3937_cast_fp16, var_3937_cast_fp16))[name = string("value_heads_39_cast_fp16")];
+            fp16 var_3960_to_fp16 = const()[name = string("op_3960_to_fp16"), val = fp16(0x1.6ap-4)];
+            tensor<fp16, [1, 16, 128, 1]> var_3961_cast_fp16 = mul(x = mh_q_57_cast_fp16, y = var_3960_to_fp16)[name = string("op_3961_cast_fp16")];
+            bool mh_w_37_transpose_x_0 = const()[name = string("mh_w_37_transpose_x_0"), val = bool(true)];
+            bool mh_w_37_transpose_y_0 = const()[name = string("mh_w_37_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 1, 256]> mh_w_37_cast_fp16 = matmul(transpose_x = mh_w_37_transpose_x_0, transpose_y = mh_w_37_transpose_y_0, x = var_3961_cast_fp16, y = key_heads_39_cast_fp16)[name = string("mh_w_37_cast_fp16")];
+            tensor<fp16, [1, 16, 1, 256]> mh_w_39_cast_fp16 = add(x = mh_w_37_cast_fp16, y = var_487_cast_fp16)[name = string("mh_w_39_cast_fp16")];
+            tensor<fp16, [1, 16, 1, 256]> var_3973_cast_fp16 = softmax(axis = var_3645, x = mh_w_39_cast_fp16)[name = string("op_3973_cast_fp16")];
+            bool attn_19_transpose_x_0 = const()[name = string("attn_19_transpose_x_0"), val = bool(false)];
+            bool attn_19_transpose_y_0 = const()[name = string("attn_19_transpose_y_0"), val = bool(true)];
+            tensor<fp16, [1, 16, 128, 1]> attn_19_cast_fp16 = matmul(transpose_x = attn_19_transpose_x_0, transpose_y = attn_19_transpose_y_0, x = value_heads_39_cast_fp16, y = var_3973_cast_fp16)[name = string("attn_19_cast_fp16")];
+            tensor<int32, [4]> var_3978 = const()[name = string("op_3978"), val = tensor<int32, [4]>([1, -1, 1, 1])];
+            tensor<fp16, [1, 2048, 1, 1]> input_73_cast_fp16 = reshape(shape = var_3978, x = attn_19_cast_fp16)[name = string("input_73_cast_fp16")];
+            string obj_83_pad_type_0 = const()[name = string("obj_83_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> obj_83_strides_0 = const()[name = string("obj_83_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> obj_83_pad_0 = const()[name = string("obj_83_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> obj_83_dilations_0 = const()[name = string("obj_83_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 obj_83_groups_0 = const()[name = string("obj_83_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 2048, 1, 1]> layers_9_self_attn_o_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(145914368))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(148011584))))[name = string("layers_9_self_attn_o_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> obj_83_cast_fp16 = conv(bias = layers_0_self_attn_v_proj_bias_to_fp16, dilations = obj_83_dilations_0, groups = obj_83_groups_0, pad = obj_83_pad_0, pad_type = obj_83_pad_type_0, strides = obj_83_strides_0, weight = layers_9_self_attn_o_proj_weight_to_fp16_palettized, x = input_73_cast_fp16)[name = string("obj_83_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> inputs_77_cast_fp16 = add(x = inputs_71_cast_fp16, y = obj_83_cast_fp16)[name = string("inputs_77_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> inputs_sq_79_cast_fp16 = mul(x = inputs_77_cast_fp16, y = inputs_77_cast_fp16)[name = string("inputs_sq_79_cast_fp16")];
+            tensor<int32, [1]> variance_79_axes_0 = const()[name = string("variance_79_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_79_keep_dims_0 = const()[name = string("variance_79_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 1, 1, 1]> variance_79_cast_fp16 = reduce_mean(axes = variance_79_axes_0, keep_dims = variance_79_keep_dims_0, x = inputs_sq_79_cast_fp16)[name = string("variance_79_cast_fp16")];
+            fp16 var_3996_to_fp16 = const()[name = string("op_3996_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 1, 1]> var_3997_cast_fp16 = add(x = variance_79_cast_fp16, y = var_3996_to_fp16)[name = string("op_3997_cast_fp16")];
+            fp32 var_3998_epsilon_0 = const()[name = string("op_3998_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 1, 1, 1]> var_3998_cast_fp16 = rsqrt(epsilon = var_3998_epsilon_0, x = var_3997_cast_fp16)[name = string("op_3998_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> hidden_states_97_cast_fp16 = mul(x = inputs_77_cast_fp16, y = var_3998_cast_fp16)[name = string("hidden_states_97_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> w_79_to_fp16 = const()[name = string("w_79_to_fp16"), val = tensor<fp16, [1, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(148012160)))];
+            tensor<fp16, [1, 1024, 1, 1]> input_75_cast_fp16 = mul(x = w_79_to_fp16, y = hidden_states_97_cast_fp16)[name = string("input_75_cast_fp16")];
+            string input_77_pad_type_0 = const()[name = string("input_77_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> input_77_strides_0 = const()[name = string("input_77_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> input_77_pad_0 = const()[name = string("input_77_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> input_77_dilations_0 = const()[name = string("input_77_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 input_77_groups_0 = const()[name = string("input_77_groups_0"), val = int32(1)];
+            tensor<fp16, [3072, 1024, 1, 1]> layers_9_mlp_gate_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [3072, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(148014272))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(151160064))))[name = string("layers_9_mlp_gate_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 3072, 1, 1]> input_77_cast_fp16 = conv(dilations = input_77_dilations_0, groups = input_77_groups_0, pad = input_77_pad_0, pad_type = input_77_pad_type_0, strides = input_77_strides_0, weight = layers_9_mlp_gate_proj_weight_to_fp16_palettized, x = input_75_cast_fp16)[name = string("input_77_cast_fp16")];
+            tensor<fp16, [1, 3072, 1, 1]> var_4012_cast_fp16 = silu(x = input_77_cast_fp16)[name = string("op_4012_cast_fp16")];
+            string var_4018_pad_type_0 = const()[name = string("op_4018_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_4018_strides_0 = const()[name = string("op_4018_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_4018_pad_0 = const()[name = string("op_4018_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_4018_dilations_0 = const()[name = string("op_4018_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_4018_groups_0 = const()[name = string("op_4018_groups_0"), val = int32(1)];
+            tensor<fp16, [3072, 1024, 1, 1]> layers_9_mlp_up_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [3072, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(151160640))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(154306432))))[name = string("layers_9_mlp_up_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 3072, 1, 1]> var_4018_cast_fp16 = conv(dilations = var_4018_dilations_0, groups = var_4018_groups_0, pad = var_4018_pad_0, pad_type = var_4018_pad_type_0, strides = var_4018_strides_0, weight = layers_9_mlp_up_proj_weight_to_fp16_palettized, x = input_75_cast_fp16)[name = string("op_4018_cast_fp16")];
+            tensor<fp16, [1, 3072, 1, 1]> input_79_cast_fp16 = mul(x = var_4012_cast_fp16, y = var_4018_cast_fp16)[name = string("input_79_cast_fp16")];
+            string hidden_states_99_pad_type_0 = const()[name = string("hidden_states_99_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> hidden_states_99_strides_0 = const()[name = string("hidden_states_99_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> hidden_states_99_pad_0 = const()[name = string("hidden_states_99_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> hidden_states_99_dilations_0 = const()[name = string("hidden_states_99_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 hidden_states_99_groups_0 = const()[name = string("hidden_states_99_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 3072, 1, 1]> layers_9_mlp_down_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 3072, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(154307008))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(157452800))))[name = string("layers_9_mlp_down_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> hidden_states_99_cast_fp16 = conv(dilations = hidden_states_99_dilations_0, groups = hidden_states_99_groups_0, pad = hidden_states_99_pad_0, pad_type = hidden_states_99_pad_type_0, strides = hidden_states_99_strides_0, weight = layers_9_mlp_down_proj_weight_to_fp16_palettized, x = input_79_cast_fp16)[name = string("hidden_states_99_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> inputs_79_cast_fp16 = add(x = inputs_77_cast_fp16, y = hidden_states_99_cast_fp16)[name = string("inputs_79_cast_fp16")];
+            int32 var_4032 = const()[name = string("op_4032"), val = int32(3)];
+            int32 var_4042 = const()[name = string("op_4042"), val = int32(-2)];
+            int32 var_4050 = const()[name = string("op_4050"), val = int32(1)];
+            tensor<fp16, [1, 1024, 1, 1]> inputs_sq_81_cast_fp16 = mul(x = inputs_79_cast_fp16, y = inputs_79_cast_fp16)[name = string("inputs_sq_81_cast_fp16")];
+            tensor<int32, [1]> variance_81_axes_0 = const()[name = string("variance_81_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_81_keep_dims_0 = const()[name = string("variance_81_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 1, 1, 1]> variance_81_cast_fp16 = reduce_mean(axes = variance_81_axes_0, keep_dims = variance_81_keep_dims_0, x = inputs_sq_81_cast_fp16)[name = string("variance_81_cast_fp16")];
+            fp16 var_4062_to_fp16 = const()[name = string("op_4062_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 1, 1]> var_4063_cast_fp16 = add(x = variance_81_cast_fp16, y = var_4062_to_fp16)[name = string("op_4063_cast_fp16")];
+            fp32 var_4064_epsilon_0 = const()[name = string("op_4064_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 1, 1, 1]> var_4064_cast_fp16 = rsqrt(epsilon = var_4064_epsilon_0, x = var_4063_cast_fp16)[name = string("op_4064_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> hidden_states_101_cast_fp16 = mul(x = inputs_79_cast_fp16, y = var_4064_cast_fp16)[name = string("hidden_states_101_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> w_81_to_fp16 = const()[name = string("w_81_to_fp16"), val = tensor<fp16, [1, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(157453376)))];
+            tensor<fp16, [1, 1024, 1, 1]> obj_85_cast_fp16 = mul(x = w_81_to_fp16, y = hidden_states_101_cast_fp16)[name = string("obj_85_cast_fp16")];
+            string query_61_pad_type_0 = const()[name = string("query_61_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> query_61_strides_0 = const()[name = string("query_61_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> query_61_pad_0 = const()[name = string("query_61_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> query_61_dilations_0 = const()[name = string("query_61_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 query_61_groups_0 = const()[name = string("query_61_groups_0"), val = int32(1)];
+            tensor<fp16, [2048, 1024, 1, 1]> layers_10_self_attn_q_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [2048, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(157455488))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(159552704))))[name = string("layers_10_self_attn_q_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1, 1]> query_61_cast_fp16 = conv(bias = layers_0_self_attn_q_proj_bias_to_fp16, dilations = query_61_dilations_0, groups = query_61_groups_0, pad = query_61_pad_0, pad_type = query_61_pad_type_0, strides = query_61_strides_0, weight = layers_10_self_attn_q_proj_weight_to_fp16_palettized, x = obj_85_cast_fp16)[name = string("query_61_cast_fp16")];
+            string current_key_41_pad_type_0 = const()[name = string("current_key_41_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> current_key_41_strides_0 = const()[name = string("current_key_41_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> current_key_41_pad_0 = const()[name = string("current_key_41_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> current_key_41_dilations_0 = const()[name = string("current_key_41_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 current_key_41_groups_0 = const()[name = string("current_key_41_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_10_self_attn_k_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(159553280))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(160601920))))[name = string("layers_10_self_attn_k_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> current_key_41_cast_fp16 = conv(dilations = current_key_41_dilations_0, groups = current_key_41_groups_0, pad = current_key_41_pad_0, pad_type = current_key_41_pad_type_0, strides = current_key_41_strides_0, weight = layers_10_self_attn_k_proj_weight_to_fp16_palettized, x = obj_85_cast_fp16)[name = string("current_key_41_cast_fp16")];
+            string current_value_21_pad_type_0 = const()[name = string("current_value_21_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> current_value_21_strides_0 = const()[name = string("current_value_21_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> current_value_21_pad_0 = const()[name = string("current_value_21_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> current_value_21_dilations_0 = const()[name = string("current_value_21_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 current_value_21_groups_0 = const()[name = string("current_value_21_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_10_self_attn_v_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(160602496))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(161651136))))[name = string("layers_10_self_attn_v_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> current_value_21_cast_fp16 = conv(bias = layers_0_self_attn_v_proj_bias_to_fp16, dilations = current_value_21_dilations_0, groups = current_value_21_groups_0, pad = current_value_21_pad_0, pad_type = current_value_21_pad_type_0, strides = current_value_21_strides_0, weight = layers_10_self_attn_v_proj_weight_to_fp16_palettized, x = obj_85_cast_fp16)[name = string("current_value_21_cast_fp16")];
+            tensor<int32, [4]> var_4101 = const()[name = string("op_4101"), val = tensor<int32, [4]>([16, 128, 1, 1])];
+            tensor<fp16, [16, 128, 1, 1]> inputs_81_cast_fp16 = reshape(shape = var_4101, x = query_61_cast_fp16)[name = string("inputs_81_cast_fp16")];
+            tensor<fp16, [16, 128, 1, 1]> inputs_sq_83_cast_fp16 = mul(x = inputs_81_cast_fp16, y = inputs_81_cast_fp16)[name = string("inputs_sq_83_cast_fp16")];
+            tensor<int32, [1]> variance_83_axes_0 = const()[name = string("variance_83_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_83_keep_dims_0 = const()[name = string("variance_83_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [16, 1, 1, 1]> variance_83_cast_fp16 = reduce_mean(axes = variance_83_axes_0, keep_dims = variance_83_keep_dims_0, x = inputs_sq_83_cast_fp16)[name = string("variance_83_cast_fp16")];
+            fp16 var_4107_to_fp16 = const()[name = string("op_4107_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [16, 1, 1, 1]> var_4108_cast_fp16 = add(x = variance_83_cast_fp16, y = var_4107_to_fp16)[name = string("op_4108_cast_fp16")];
+            fp32 var_4109_epsilon_0 = const()[name = string("op_4109_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [16, 1, 1, 1]> var_4109_cast_fp16 = rsqrt(epsilon = var_4109_epsilon_0, x = var_4108_cast_fp16)[name = string("op_4109_cast_fp16")];
+            tensor<fp16, [16, 128, 1, 1]> hidden_states_103_cast_fp16 = mul(x = inputs_81_cast_fp16, y = var_4109_cast_fp16)[name = string("hidden_states_103_cast_fp16")];
+            tensor<fp16, [1, 128, 1, 1]> w_83_to_fp16 = const()[name = string("w_83_to_fp16"), val = tensor<fp16, [1, 128, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(161651712)))];
+            tensor<fp16, [16, 128, 1, 1]> query_normed_21_cast_fp16 = mul(x = w_83_to_fp16, y = hidden_states_103_cast_fp16)[name = string("query_normed_21_cast_fp16")];
+            tensor<int32, [4]> var_4117 = const()[name = string("op_4117"), val = tensor<int32, [4]>([8, 128, 1, 1])];
+            tensor<fp16, [8, 128, 1, 1]> inputs_83_cast_fp16 = reshape(shape = var_4117, x = current_key_41_cast_fp16)[name = string("inputs_83_cast_fp16")];
+            tensor<fp16, [8, 128, 1, 1]> inputs_sq_85_cast_fp16 = mul(x = inputs_83_cast_fp16, y = inputs_83_cast_fp16)[name = string("inputs_sq_85_cast_fp16")];
+            tensor<int32, [1]> variance_85_axes_0 = const()[name = string("variance_85_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_85_keep_dims_0 = const()[name = string("variance_85_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [8, 1, 1, 1]> variance_85_cast_fp16 = reduce_mean(axes = variance_85_axes_0, keep_dims = variance_85_keep_dims_0, x = inputs_sq_85_cast_fp16)[name = string("variance_85_cast_fp16")];
+            fp16 var_4123_to_fp16 = const()[name = string("op_4123_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [8, 1, 1, 1]> var_4124_cast_fp16 = add(x = variance_85_cast_fp16, y = var_4123_to_fp16)[name = string("op_4124_cast_fp16")];
+            fp32 var_4125_epsilon_0 = const()[name = string("op_4125_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [8, 1, 1, 1]> var_4125_cast_fp16 = rsqrt(epsilon = var_4125_epsilon_0, x = var_4124_cast_fp16)[name = string("op_4125_cast_fp16")];
+            tensor<fp16, [8, 128, 1, 1]> hidden_states_105_cast_fp16 = mul(x = inputs_83_cast_fp16, y = var_4125_cast_fp16)[name = string("hidden_states_105_cast_fp16")];
+            tensor<fp16, [1, 128, 1, 1]> w_85_to_fp16 = const()[name = string("w_85_to_fp16"), val = tensor<fp16, [1, 128, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(161652032)))];
+            tensor<fp16, [8, 128, 1, 1]> current_key_normed_21_cast_fp16 = mul(x = w_85_to_fp16, y = hidden_states_105_cast_fp16)[name = string("current_key_normed_21_cast_fp16")];
+            tensor<int32, [4]> var_4143 = const()[name = string("op_4143"), val = tensor<int32, [4]>([1, 16, 128, -1])];
+            tensor<fp16, [1, 16, 128, 1]> mh_q_61_cast_fp16 = reshape(shape = var_4143, x = query_normed_21_cast_fp16)[name = string("mh_q_61_cast_fp16")];
+            tensor<int32, [4]> var_4145 = const()[name = string("op_4145"), val = tensor<int32, [4]>([1, 8, 128, -1])];
+            tensor<fp16, [1, 8, 128, 1]> mh_k_41_cast_fp16 = reshape(shape = var_4145, x = current_key_normed_21_cast_fp16)[name = string("mh_k_41_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> var_4149_cast_fp16 = mul(x = mh_q_61_cast_fp16, y = cos_1_cast_fp16)[name = string("op_4149_cast_fp16")];
+            tensor<int32, [4]> var_4154_begin_0 = const()[name = string("op_4154_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_4154_end_0 = const()[name = string("op_4154_end_0"), val = tensor<int32, [4]>([1, 16, 64, 1])];
+            tensor<bool, [4]> var_4154_end_mask_0 = const()[name = string("op_4154_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 16, 64, 1]> var_4154_cast_fp16 = slice_by_index(begin = var_4154_begin_0, end = var_4154_end_0, end_mask = var_4154_end_mask_0, x = mh_q_61_cast_fp16)[name = string("op_4154_cast_fp16")];
+            tensor<int32, [4]> var_4160_begin_0 = const()[name = string("op_4160_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_4160_end_0 = const()[name = string("op_4160_end_0"), val = tensor<int32, [4]>([1, 16, 128, 1])];
+            tensor<bool, [4]> var_4160_end_mask_0 = const()[name = string("op_4160_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 16, 64, 1]> var_4160_cast_fp16 = slice_by_index(begin = var_4160_begin_0, end = var_4160_end_0, end_mask = var_4160_end_mask_0, x = mh_q_61_cast_fp16)[name = string("op_4160_cast_fp16")];
+            fp16 const_247_promoted_to_fp16 = const()[name = string("const_247_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 16, 64, 1]> var_4162_cast_fp16 = mul(x = var_4160_cast_fp16, y = const_247_promoted_to_fp16)[name = string("op_4162_cast_fp16")];
+            bool var_4164_interleave_0 = const()[name = string("op_4164_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 1]> var_4164_cast_fp16 = concat(axis = var_4042, interleave = var_4164_interleave_0, values = (var_4162_cast_fp16, var_4154_cast_fp16))[name = string("op_4164_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> var_4165_cast_fp16 = mul(x = var_4164_cast_fp16, y = sin_1_cast_fp16)[name = string("op_4165_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> mh_q_63_cast_fp16 = add(x = var_4149_cast_fp16, y = var_4165_cast_fp16)[name = string("mh_q_63_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> var_4167_cast_fp16 = mul(x = mh_k_41_cast_fp16, y = cos_1_cast_fp16)[name = string("op_4167_cast_fp16")];
+            tensor<int32, [4]> var_4172_begin_0 = const()[name = string("op_4172_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_4172_end_0 = const()[name = string("op_4172_end_0"), val = tensor<int32, [4]>([1, 8, 64, 1])];
+            tensor<bool, [4]> var_4172_end_mask_0 = const()[name = string("op_4172_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 8, 64, 1]> var_4172_cast_fp16 = slice_by_index(begin = var_4172_begin_0, end = var_4172_end_0, end_mask = var_4172_end_mask_0, x = mh_k_41_cast_fp16)[name = string("op_4172_cast_fp16")];
+            tensor<int32, [4]> var_4178_begin_0 = const()[name = string("op_4178_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_4178_end_0 = const()[name = string("op_4178_end_0"), val = tensor<int32, [4]>([1, 8, 128, 1])];
+            tensor<bool, [4]> var_4178_end_mask_0 = const()[name = string("op_4178_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 8, 64, 1]> var_4178_cast_fp16 = slice_by_index(begin = var_4178_begin_0, end = var_4178_end_0, end_mask = var_4178_end_mask_0, x = mh_k_41_cast_fp16)[name = string("op_4178_cast_fp16")];
+            fp16 const_250_promoted_to_fp16 = const()[name = string("const_250_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 64, 1]> var_4180_cast_fp16 = mul(x = var_4178_cast_fp16, y = const_250_promoted_to_fp16)[name = string("op_4180_cast_fp16")];
+            bool var_4182_interleave_0 = const()[name = string("op_4182_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 128, 1]> var_4182_cast_fp16 = concat(axis = var_4042, interleave = var_4182_interleave_0, values = (var_4180_cast_fp16, var_4172_cast_fp16))[name = string("op_4182_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> var_4183_cast_fp16 = mul(x = var_4182_cast_fp16, y = sin_1_cast_fp16)[name = string("op_4183_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> mh_k_43_cast_fp16 = add(x = var_4167_cast_fp16, y = var_4183_cast_fp16)[name = string("mh_k_43_cast_fp16")];
+            tensor<int32, [4]> var_4187 = const()[name = string("op_4187"), val = tensor<int32, [4]>([1, 1024, 1, 1])];
+            tensor<fp16, [1, 1024, 1, 1]> current_key_43_cast_fp16 = reshape(shape = var_4187, x = mh_k_43_cast_fp16)[name = string("current_key_43_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_4194_cast_fp16 = mul(x = var_101_cast_fp16_10, y = var_323_cast_fp16)[name = string("op_4194_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_4195_cast_fp16 = mul(x = current_key_43_cast_fp16, y = var_321_cast_fp16)[name = string("op_4195_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> key_63_cast_fp16 = add(x = var_4194_cast_fp16, y = var_4195_cast_fp16)[name = string("key_63_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_4198_cast_fp16 = mul(x = var_132_cast_fp16_10, y = var_323_cast_fp16)[name = string("op_4198_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_4199_cast_fp16 = mul(x = current_value_21_cast_fp16, y = var_321_cast_fp16)[name = string("op_4199_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> value_41_cast_fp16 = add(x = var_4198_cast_fp16, y = var_4199_cast_fp16)[name = string("value_41_cast_fp16")];
+            tensor<int32, [4]> var_4203 = const()[name = string("op_4203"), val = tensor<int32, [4]>([1, 8, 128, 256])];
+            tensor<fp16, [1, 8, 128, 256]> key_heads_41_cast_fp16 = reshape(shape = var_4203, x = key_63_cast_fp16)[name = string("key_heads_41_cast_fp16")];
+            tensor<int32, [4]> var_4205 = const()[name = string("op_4205"), val = tensor<int32, [4]>([1, 8, 128, 256])];
+            tensor<fp16, [1, 8, 128, 256]> value_heads_41_cast_fp16 = reshape(shape = var_4205, x = value_41_cast_fp16)[name = string("value_heads_41_cast_fp16")];
+            tensor<int32, [4]> var_4208_begin_0 = const()[name = string("op_4208_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_4208_end_0 = const()[name = string("op_4208_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_4208_end_mask_0 = const()[name = string("op_4208_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_4208_cast_fp16 = slice_by_index(begin = var_4208_begin_0, end = var_4208_end_0, end_mask = var_4208_end_mask_0, x = key_heads_41_cast_fp16)[name = string("op_4208_cast_fp16")];
+            tensor<int32, [4]> var_4212_begin_0 = const()[name = string("op_4212_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_4212_end_0 = const()[name = string("op_4212_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_4212_end_mask_0 = const()[name = string("op_4212_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_4212_cast_fp16 = slice_by_index(begin = var_4212_begin_0, end = var_4212_end_0, end_mask = var_4212_end_mask_0, x = value_heads_41_cast_fp16)[name = string("op_4212_cast_fp16")];
+            tensor<int32, [4]> var_4224_begin_0 = const()[name = string("op_4224_begin_0"), val = tensor<int32, [4]>([0, 1, 0, 0])];
+            tensor<int32, [4]> var_4224_end_0 = const()[name = string("op_4224_end_0"), val = tensor<int32, [4]>([1, 2, 128, 256])];
+            tensor<bool, [4]> var_4224_end_mask_0 = const()[name = string("op_4224_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_4224_cast_fp16 = slice_by_index(begin = var_4224_begin_0, end = var_4224_end_0, end_mask = var_4224_end_mask_0, x = key_heads_41_cast_fp16)[name = string("op_4224_cast_fp16")];
+            tensor<int32, [4]> var_4228_begin_0 = const()[name = string("op_4228_begin_0"), val = tensor<int32, [4]>([0, 1, 0, 0])];
+            tensor<int32, [4]> var_4228_end_0 = const()[name = string("op_4228_end_0"), val = tensor<int32, [4]>([1, 2, 128, 256])];
+            tensor<bool, [4]> var_4228_end_mask_0 = const()[name = string("op_4228_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_4228_cast_fp16 = slice_by_index(begin = var_4228_begin_0, end = var_4228_end_0, end_mask = var_4228_end_mask_0, x = value_heads_41_cast_fp16)[name = string("op_4228_cast_fp16")];
+            tensor<int32, [4]> var_4240_begin_0 = const()[name = string("op_4240_begin_0"), val = tensor<int32, [4]>([0, 2, 0, 0])];
+            tensor<int32, [4]> var_4240_end_0 = const()[name = string("op_4240_end_0"), val = tensor<int32, [4]>([1, 3, 128, 256])];
+            tensor<bool, [4]> var_4240_end_mask_0 = const()[name = string("op_4240_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_4240_cast_fp16 = slice_by_index(begin = var_4240_begin_0, end = var_4240_end_0, end_mask = var_4240_end_mask_0, x = key_heads_41_cast_fp16)[name = string("op_4240_cast_fp16")];
+            tensor<int32, [4]> var_4244_begin_0 = const()[name = string("op_4244_begin_0"), val = tensor<int32, [4]>([0, 2, 0, 0])];
+            tensor<int32, [4]> var_4244_end_0 = const()[name = string("op_4244_end_0"), val = tensor<int32, [4]>([1, 3, 128, 256])];
+            tensor<bool, [4]> var_4244_end_mask_0 = const()[name = string("op_4244_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_4244_cast_fp16 = slice_by_index(begin = var_4244_begin_0, end = var_4244_end_0, end_mask = var_4244_end_mask_0, x = value_heads_41_cast_fp16)[name = string("op_4244_cast_fp16")];
+            tensor<int32, [4]> var_4256_begin_0 = const()[name = string("op_4256_begin_0"), val = tensor<int32, [4]>([0, 3, 0, 0])];
+            tensor<int32, [4]> var_4256_end_0 = const()[name = string("op_4256_end_0"), val = tensor<int32, [4]>([1, 4, 128, 256])];
+            tensor<bool, [4]> var_4256_end_mask_0 = const()[name = string("op_4256_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_4256_cast_fp16 = slice_by_index(begin = var_4256_begin_0, end = var_4256_end_0, end_mask = var_4256_end_mask_0, x = key_heads_41_cast_fp16)[name = string("op_4256_cast_fp16")];
+            tensor<int32, [4]> var_4260_begin_0 = const()[name = string("op_4260_begin_0"), val = tensor<int32, [4]>([0, 3, 0, 0])];
+            tensor<int32, [4]> var_4260_end_0 = const()[name = string("op_4260_end_0"), val = tensor<int32, [4]>([1, 4, 128, 256])];
+            tensor<bool, [4]> var_4260_end_mask_0 = const()[name = string("op_4260_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_4260_cast_fp16 = slice_by_index(begin = var_4260_begin_0, end = var_4260_end_0, end_mask = var_4260_end_mask_0, x = value_heads_41_cast_fp16)[name = string("op_4260_cast_fp16")];
+            tensor<int32, [4]> var_4272_begin_0 = const()[name = string("op_4272_begin_0"), val = tensor<int32, [4]>([0, 4, 0, 0])];
+            tensor<int32, [4]> var_4272_end_0 = const()[name = string("op_4272_end_0"), val = tensor<int32, [4]>([1, 5, 128, 256])];
+            tensor<bool, [4]> var_4272_end_mask_0 = const()[name = string("op_4272_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_4272_cast_fp16 = slice_by_index(begin = var_4272_begin_0, end = var_4272_end_0, end_mask = var_4272_end_mask_0, x = key_heads_41_cast_fp16)[name = string("op_4272_cast_fp16")];
+            tensor<int32, [4]> var_4276_begin_0 = const()[name = string("op_4276_begin_0"), val = tensor<int32, [4]>([0, 4, 0, 0])];
+            tensor<int32, [4]> var_4276_end_0 = const()[name = string("op_4276_end_0"), val = tensor<int32, [4]>([1, 5, 128, 256])];
+            tensor<bool, [4]> var_4276_end_mask_0 = const()[name = string("op_4276_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_4276_cast_fp16 = slice_by_index(begin = var_4276_begin_0, end = var_4276_end_0, end_mask = var_4276_end_mask_0, x = value_heads_41_cast_fp16)[name = string("op_4276_cast_fp16")];
+            tensor<int32, [4]> var_4288_begin_0 = const()[name = string("op_4288_begin_0"), val = tensor<int32, [4]>([0, 5, 0, 0])];
+            tensor<int32, [4]> var_4288_end_0 = const()[name = string("op_4288_end_0"), val = tensor<int32, [4]>([1, 6, 128, 256])];
+            tensor<bool, [4]> var_4288_end_mask_0 = const()[name = string("op_4288_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_4288_cast_fp16 = slice_by_index(begin = var_4288_begin_0, end = var_4288_end_0, end_mask = var_4288_end_mask_0, x = key_heads_41_cast_fp16)[name = string("op_4288_cast_fp16")];
+            tensor<int32, [4]> var_4292_begin_0 = const()[name = string("op_4292_begin_0"), val = tensor<int32, [4]>([0, 5, 0, 0])];
+            tensor<int32, [4]> var_4292_end_0 = const()[name = string("op_4292_end_0"), val = tensor<int32, [4]>([1, 6, 128, 256])];
+            tensor<bool, [4]> var_4292_end_mask_0 = const()[name = string("op_4292_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_4292_cast_fp16 = slice_by_index(begin = var_4292_begin_0, end = var_4292_end_0, end_mask = var_4292_end_mask_0, x = value_heads_41_cast_fp16)[name = string("op_4292_cast_fp16")];
+            tensor<int32, [4]> var_4304_begin_0 = const()[name = string("op_4304_begin_0"), val = tensor<int32, [4]>([0, 6, 0, 0])];
+            tensor<int32, [4]> var_4304_end_0 = const()[name = string("op_4304_end_0"), val = tensor<int32, [4]>([1, 7, 128, 256])];
+            tensor<bool, [4]> var_4304_end_mask_0 = const()[name = string("op_4304_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_4304_cast_fp16 = slice_by_index(begin = var_4304_begin_0, end = var_4304_end_0, end_mask = var_4304_end_mask_0, x = key_heads_41_cast_fp16)[name = string("op_4304_cast_fp16")];
+            tensor<int32, [4]> var_4308_begin_0 = const()[name = string("op_4308_begin_0"), val = tensor<int32, [4]>([0, 6, 0, 0])];
+            tensor<int32, [4]> var_4308_end_0 = const()[name = string("op_4308_end_0"), val = tensor<int32, [4]>([1, 7, 128, 256])];
+            tensor<bool, [4]> var_4308_end_mask_0 = const()[name = string("op_4308_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_4308_cast_fp16 = slice_by_index(begin = var_4308_begin_0, end = var_4308_end_0, end_mask = var_4308_end_mask_0, x = value_heads_41_cast_fp16)[name = string("op_4308_cast_fp16")];
+            tensor<int32, [4]> var_4320_begin_0 = const()[name = string("op_4320_begin_0"), val = tensor<int32, [4]>([0, 7, 0, 0])];
+            tensor<int32, [4]> var_4320_end_0 = const()[name = string("op_4320_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_4320_end_mask_0 = const()[name = string("op_4320_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_4320_cast_fp16 = slice_by_index(begin = var_4320_begin_0, end = var_4320_end_0, end_mask = var_4320_end_mask_0, x = key_heads_41_cast_fp16)[name = string("op_4320_cast_fp16")];
+            tensor<int32, [4]> var_4324_begin_0 = const()[name = string("op_4324_begin_0"), val = tensor<int32, [4]>([0, 7, 0, 0])];
+            tensor<int32, [4]> var_4324_end_0 = const()[name = string("op_4324_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_4324_end_mask_0 = const()[name = string("op_4324_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_4324_cast_fp16 = slice_by_index(begin = var_4324_begin_0, end = var_4324_end_0, end_mask = var_4324_end_mask_0, x = value_heads_41_cast_fp16)[name = string("op_4324_cast_fp16")];
+            bool key_heads_43_interleave_0 = const()[name = string("key_heads_43_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 256]> key_heads_43_cast_fp16 = concat(axis = var_4050, interleave = key_heads_43_interleave_0, values = (var_4208_cast_fp16, var_4208_cast_fp16, var_4224_cast_fp16, var_4224_cast_fp16, var_4240_cast_fp16, var_4240_cast_fp16, var_4256_cast_fp16, var_4256_cast_fp16, var_4272_cast_fp16, var_4272_cast_fp16, var_4288_cast_fp16, var_4288_cast_fp16, var_4304_cast_fp16, var_4304_cast_fp16, var_4320_cast_fp16, var_4320_cast_fp16))[name = string("key_heads_43_cast_fp16")];
+            bool value_heads_43_interleave_0 = const()[name = string("value_heads_43_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 256]> value_heads_43_cast_fp16 = concat(axis = var_4050, interleave = value_heads_43_interleave_0, values = (var_4212_cast_fp16, var_4212_cast_fp16, var_4228_cast_fp16, var_4228_cast_fp16, var_4244_cast_fp16, var_4244_cast_fp16, var_4260_cast_fp16, var_4260_cast_fp16, var_4276_cast_fp16, var_4276_cast_fp16, var_4292_cast_fp16, var_4292_cast_fp16, var_4308_cast_fp16, var_4308_cast_fp16, var_4324_cast_fp16, var_4324_cast_fp16))[name = string("value_heads_43_cast_fp16")];
+            fp16 var_4347_to_fp16 = const()[name = string("op_4347_to_fp16"), val = fp16(0x1.6ap-4)];
+            tensor<fp16, [1, 16, 128, 1]> var_4348_cast_fp16 = mul(x = mh_q_63_cast_fp16, y = var_4347_to_fp16)[name = string("op_4348_cast_fp16")];
+            bool mh_w_41_transpose_x_0 = const()[name = string("mh_w_41_transpose_x_0"), val = bool(true)];
+            bool mh_w_41_transpose_y_0 = const()[name = string("mh_w_41_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 1, 256]> mh_w_41_cast_fp16 = matmul(transpose_x = mh_w_41_transpose_x_0, transpose_y = mh_w_41_transpose_y_0, x = var_4348_cast_fp16, y = key_heads_43_cast_fp16)[name = string("mh_w_41_cast_fp16")];
+            tensor<fp16, [1, 16, 1, 256]> mh_w_43_cast_fp16 = add(x = mh_w_41_cast_fp16, y = var_487_cast_fp16)[name = string("mh_w_43_cast_fp16")];
+            tensor<fp16, [1, 16, 1, 256]> var_4360_cast_fp16 = softmax(axis = var_4032, x = mh_w_43_cast_fp16)[name = string("op_4360_cast_fp16")];
+            bool attn_21_transpose_x_0 = const()[name = string("attn_21_transpose_x_0"), val = bool(false)];
+            bool attn_21_transpose_y_0 = const()[name = string("attn_21_transpose_y_0"), val = bool(true)];
+            tensor<fp16, [1, 16, 128, 1]> attn_21_cast_fp16 = matmul(transpose_x = attn_21_transpose_x_0, transpose_y = attn_21_transpose_y_0, x = value_heads_43_cast_fp16, y = var_4360_cast_fp16)[name = string("attn_21_cast_fp16")];
+            tensor<int32, [4]> var_4365 = const()[name = string("op_4365"), val = tensor<int32, [4]>([1, -1, 1, 1])];
+            tensor<fp16, [1, 2048, 1, 1]> input_81_cast_fp16 = reshape(shape = var_4365, x = attn_21_cast_fp16)[name = string("input_81_cast_fp16")];
+            string obj_91_pad_type_0 = const()[name = string("obj_91_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> obj_91_strides_0 = const()[name = string("obj_91_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> obj_91_pad_0 = const()[name = string("obj_91_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> obj_91_dilations_0 = const()[name = string("obj_91_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 obj_91_groups_0 = const()[name = string("obj_91_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 2048, 1, 1]> layers_10_self_attn_o_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(161652352))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(163749568))))[name = string("layers_10_self_attn_o_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> obj_91_cast_fp16 = conv(bias = layers_0_self_attn_v_proj_bias_to_fp16, dilations = obj_91_dilations_0, groups = obj_91_groups_0, pad = obj_91_pad_0, pad_type = obj_91_pad_type_0, strides = obj_91_strides_0, weight = layers_10_self_attn_o_proj_weight_to_fp16_palettized, x = input_81_cast_fp16)[name = string("obj_91_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> inputs_85_cast_fp16 = add(x = inputs_79_cast_fp16, y = obj_91_cast_fp16)[name = string("inputs_85_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> inputs_sq_87_cast_fp16 = mul(x = inputs_85_cast_fp16, y = inputs_85_cast_fp16)[name = string("inputs_sq_87_cast_fp16")];
+            tensor<int32, [1]> variance_87_axes_0 = const()[name = string("variance_87_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_87_keep_dims_0 = const()[name = string("variance_87_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 1, 1, 1]> variance_87_cast_fp16 = reduce_mean(axes = variance_87_axes_0, keep_dims = variance_87_keep_dims_0, x = inputs_sq_87_cast_fp16)[name = string("variance_87_cast_fp16")];
+            fp16 var_4383_to_fp16 = const()[name = string("op_4383_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 1, 1]> var_4384_cast_fp16 = add(x = variance_87_cast_fp16, y = var_4383_to_fp16)[name = string("op_4384_cast_fp16")];
+            fp32 var_4385_epsilon_0 = const()[name = string("op_4385_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 1, 1, 1]> var_4385_cast_fp16 = rsqrt(epsilon = var_4385_epsilon_0, x = var_4384_cast_fp16)[name = string("op_4385_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> hidden_states_107_cast_fp16 = mul(x = inputs_85_cast_fp16, y = var_4385_cast_fp16)[name = string("hidden_states_107_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> w_87_to_fp16 = const()[name = string("w_87_to_fp16"), val = tensor<fp16, [1, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(163750144)))];
+            tensor<fp16, [1, 1024, 1, 1]> input_83_cast_fp16 = mul(x = w_87_to_fp16, y = hidden_states_107_cast_fp16)[name = string("input_83_cast_fp16")];
+            string input_85_pad_type_0 = const()[name = string("input_85_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> input_85_strides_0 = const()[name = string("input_85_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> input_85_pad_0 = const()[name = string("input_85_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> input_85_dilations_0 = const()[name = string("input_85_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 input_85_groups_0 = const()[name = string("input_85_groups_0"), val = int32(1)];
+            tensor<fp16, [3072, 1024, 1, 1]> layers_10_mlp_gate_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [3072, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(163752256))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(166898048))))[name = string("layers_10_mlp_gate_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 3072, 1, 1]> input_85_cast_fp16 = conv(dilations = input_85_dilations_0, groups = input_85_groups_0, pad = input_85_pad_0, pad_type = input_85_pad_type_0, strides = input_85_strides_0, weight = layers_10_mlp_gate_proj_weight_to_fp16_palettized, x = input_83_cast_fp16)[name = string("input_85_cast_fp16")];
+            tensor<fp16, [1, 3072, 1, 1]> var_4399_cast_fp16 = silu(x = input_85_cast_fp16)[name = string("op_4399_cast_fp16")];
+            string var_4405_pad_type_0 = const()[name = string("op_4405_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_4405_strides_0 = const()[name = string("op_4405_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_4405_pad_0 = const()[name = string("op_4405_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_4405_dilations_0 = const()[name = string("op_4405_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_4405_groups_0 = const()[name = string("op_4405_groups_0"), val = int32(1)];
+            tensor<fp16, [3072, 1024, 1, 1]> layers_10_mlp_up_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [3072, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(166898624))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(170044416))))[name = string("layers_10_mlp_up_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 3072, 1, 1]> var_4405_cast_fp16 = conv(dilations = var_4405_dilations_0, groups = var_4405_groups_0, pad = var_4405_pad_0, pad_type = var_4405_pad_type_0, strides = var_4405_strides_0, weight = layers_10_mlp_up_proj_weight_to_fp16_palettized, x = input_83_cast_fp16)[name = string("op_4405_cast_fp16")];
+            tensor<fp16, [1, 3072, 1, 1]> input_87_cast_fp16 = mul(x = var_4399_cast_fp16, y = var_4405_cast_fp16)[name = string("input_87_cast_fp16")];
+            string hidden_states_109_pad_type_0 = const()[name = string("hidden_states_109_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> hidden_states_109_strides_0 = const()[name = string("hidden_states_109_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> hidden_states_109_pad_0 = const()[name = string("hidden_states_109_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> hidden_states_109_dilations_0 = const()[name = string("hidden_states_109_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 hidden_states_109_groups_0 = const()[name = string("hidden_states_109_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 3072, 1, 1]> layers_10_mlp_down_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 3072, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(170044992))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(173190784))))[name = string("layers_10_mlp_down_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> hidden_states_109_cast_fp16 = conv(dilations = hidden_states_109_dilations_0, groups = hidden_states_109_groups_0, pad = hidden_states_109_pad_0, pad_type = hidden_states_109_pad_type_0, strides = hidden_states_109_strides_0, weight = layers_10_mlp_down_proj_weight_to_fp16_palettized, x = input_87_cast_fp16)[name = string("hidden_states_109_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> inputs_87_cast_fp16 = add(x = inputs_85_cast_fp16, y = hidden_states_109_cast_fp16)[name = string("inputs_87_cast_fp16")];
+            int32 var_4419 = const()[name = string("op_4419"), val = int32(3)];
+            int32 var_4429 = const()[name = string("op_4429"), val = int32(-2)];
+            int32 var_4437 = const()[name = string("op_4437"), val = int32(1)];
+            tensor<fp16, [1, 1024, 1, 1]> inputs_sq_89_cast_fp16 = mul(x = inputs_87_cast_fp16, y = inputs_87_cast_fp16)[name = string("inputs_sq_89_cast_fp16")];
+            tensor<int32, [1]> variance_89_axes_0 = const()[name = string("variance_89_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_89_keep_dims_0 = const()[name = string("variance_89_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 1, 1, 1]> variance_89_cast_fp16 = reduce_mean(axes = variance_89_axes_0, keep_dims = variance_89_keep_dims_0, x = inputs_sq_89_cast_fp16)[name = string("variance_89_cast_fp16")];
+            fp16 var_4449_to_fp16 = const()[name = string("op_4449_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 1, 1]> var_4450_cast_fp16 = add(x = variance_89_cast_fp16, y = var_4449_to_fp16)[name = string("op_4450_cast_fp16")];
+            fp32 var_4451_epsilon_0 = const()[name = string("op_4451_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 1, 1, 1]> var_4451_cast_fp16 = rsqrt(epsilon = var_4451_epsilon_0, x = var_4450_cast_fp16)[name = string("op_4451_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> hidden_states_111_cast_fp16 = mul(x = inputs_87_cast_fp16, y = var_4451_cast_fp16)[name = string("hidden_states_111_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> w_89_to_fp16 = const()[name = string("w_89_to_fp16"), val = tensor<fp16, [1, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(173191360)))];
+            tensor<fp16, [1, 1024, 1, 1]> obj_93_cast_fp16 = mul(x = w_89_to_fp16, y = hidden_states_111_cast_fp16)[name = string("obj_93_cast_fp16")];
+            string query_67_pad_type_0 = const()[name = string("query_67_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> query_67_strides_0 = const()[name = string("query_67_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> query_67_pad_0 = const()[name = string("query_67_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> query_67_dilations_0 = const()[name = string("query_67_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 query_67_groups_0 = const()[name = string("query_67_groups_0"), val = int32(1)];
+            tensor<fp16, [2048, 1024, 1, 1]> layers_11_self_attn_q_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [2048, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(173193472))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(175290688))))[name = string("layers_11_self_attn_q_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1, 1]> query_67_cast_fp16 = conv(bias = layers_0_self_attn_q_proj_bias_to_fp16, dilations = query_67_dilations_0, groups = query_67_groups_0, pad = query_67_pad_0, pad_type = query_67_pad_type_0, strides = query_67_strides_0, weight = layers_11_self_attn_q_proj_weight_to_fp16_palettized, x = obj_93_cast_fp16)[name = string("query_67_cast_fp16")];
+            string current_key_45_pad_type_0 = const()[name = string("current_key_45_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> current_key_45_strides_0 = const()[name = string("current_key_45_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> current_key_45_pad_0 = const()[name = string("current_key_45_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> current_key_45_dilations_0 = const()[name = string("current_key_45_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 current_key_45_groups_0 = const()[name = string("current_key_45_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_11_self_attn_k_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(175291264))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(176339904))))[name = string("layers_11_self_attn_k_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> current_key_45_cast_fp16 = conv(dilations = current_key_45_dilations_0, groups = current_key_45_groups_0, pad = current_key_45_pad_0, pad_type = current_key_45_pad_type_0, strides = current_key_45_strides_0, weight = layers_11_self_attn_k_proj_weight_to_fp16_palettized, x = obj_93_cast_fp16)[name = string("current_key_45_cast_fp16")];
+            string current_value_23_pad_type_0 = const()[name = string("current_value_23_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> current_value_23_strides_0 = const()[name = string("current_value_23_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> current_value_23_pad_0 = const()[name = string("current_value_23_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> current_value_23_dilations_0 = const()[name = string("current_value_23_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 current_value_23_groups_0 = const()[name = string("current_value_23_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_11_self_attn_v_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(176340480))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(177389120))))[name = string("layers_11_self_attn_v_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> current_value_23_cast_fp16 = conv(bias = layers_0_self_attn_v_proj_bias_to_fp16, dilations = current_value_23_dilations_0, groups = current_value_23_groups_0, pad = current_value_23_pad_0, pad_type = current_value_23_pad_type_0, strides = current_value_23_strides_0, weight = layers_11_self_attn_v_proj_weight_to_fp16_palettized, x = obj_93_cast_fp16)[name = string("current_value_23_cast_fp16")];
+            tensor<int32, [4]> var_4488 = const()[name = string("op_4488"), val = tensor<int32, [4]>([16, 128, 1, 1])];
+            tensor<fp16, [16, 128, 1, 1]> inputs_89_cast_fp16 = reshape(shape = var_4488, x = query_67_cast_fp16)[name = string("inputs_89_cast_fp16")];
+            tensor<fp16, [16, 128, 1, 1]> inputs_sq_91_cast_fp16 = mul(x = inputs_89_cast_fp16, y = inputs_89_cast_fp16)[name = string("inputs_sq_91_cast_fp16")];
+            tensor<int32, [1]> variance_91_axes_0 = const()[name = string("variance_91_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_91_keep_dims_0 = const()[name = string("variance_91_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [16, 1, 1, 1]> variance_91_cast_fp16 = reduce_mean(axes = variance_91_axes_0, keep_dims = variance_91_keep_dims_0, x = inputs_sq_91_cast_fp16)[name = string("variance_91_cast_fp16")];
+            fp16 var_4494_to_fp16 = const()[name = string("op_4494_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [16, 1, 1, 1]> var_4495_cast_fp16 = add(x = variance_91_cast_fp16, y = var_4494_to_fp16)[name = string("op_4495_cast_fp16")];
+            fp32 var_4496_epsilon_0 = const()[name = string("op_4496_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [16, 1, 1, 1]> var_4496_cast_fp16 = rsqrt(epsilon = var_4496_epsilon_0, x = var_4495_cast_fp16)[name = string("op_4496_cast_fp16")];
+            tensor<fp16, [16, 128, 1, 1]> hidden_states_113_cast_fp16 = mul(x = inputs_89_cast_fp16, y = var_4496_cast_fp16)[name = string("hidden_states_113_cast_fp16")];
+            tensor<fp16, [1, 128, 1, 1]> w_91_to_fp16 = const()[name = string("w_91_to_fp16"), val = tensor<fp16, [1, 128, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(177389696)))];
+            tensor<fp16, [16, 128, 1, 1]> query_normed_23_cast_fp16 = mul(x = w_91_to_fp16, y = hidden_states_113_cast_fp16)[name = string("query_normed_23_cast_fp16")];
+            tensor<int32, [4]> var_4504 = const()[name = string("op_4504"), val = tensor<int32, [4]>([8, 128, 1, 1])];
+            tensor<fp16, [8, 128, 1, 1]> inputs_91_cast_fp16 = reshape(shape = var_4504, x = current_key_45_cast_fp16)[name = string("inputs_91_cast_fp16")];
+            tensor<fp16, [8, 128, 1, 1]> inputs_sq_93_cast_fp16 = mul(x = inputs_91_cast_fp16, y = inputs_91_cast_fp16)[name = string("inputs_sq_93_cast_fp16")];
+            tensor<int32, [1]> variance_93_axes_0 = const()[name = string("variance_93_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_93_keep_dims_0 = const()[name = string("variance_93_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [8, 1, 1, 1]> variance_93_cast_fp16 = reduce_mean(axes = variance_93_axes_0, keep_dims = variance_93_keep_dims_0, x = inputs_sq_93_cast_fp16)[name = string("variance_93_cast_fp16")];
+            fp16 var_4510_to_fp16 = const()[name = string("op_4510_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [8, 1, 1, 1]> var_4511_cast_fp16 = add(x = variance_93_cast_fp16, y = var_4510_to_fp16)[name = string("op_4511_cast_fp16")];
+            fp32 var_4512_epsilon_0 = const()[name = string("op_4512_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [8, 1, 1, 1]> var_4512_cast_fp16 = rsqrt(epsilon = var_4512_epsilon_0, x = var_4511_cast_fp16)[name = string("op_4512_cast_fp16")];
+            tensor<fp16, [8, 128, 1, 1]> hidden_states_115_cast_fp16 = mul(x = inputs_91_cast_fp16, y = var_4512_cast_fp16)[name = string("hidden_states_115_cast_fp16")];
+            tensor<fp16, [1, 128, 1, 1]> w_93_to_fp16 = const()[name = string("w_93_to_fp16"), val = tensor<fp16, [1, 128, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(177390016)))];
+            tensor<fp16, [8, 128, 1, 1]> current_key_normed_23_cast_fp16 = mul(x = w_93_to_fp16, y = hidden_states_115_cast_fp16)[name = string("current_key_normed_23_cast_fp16")];
+            tensor<int32, [4]> var_4530 = const()[name = string("op_4530"), val = tensor<int32, [4]>([1, 16, 128, -1])];
+            tensor<fp16, [1, 16, 128, 1]> mh_q_67_cast_fp16 = reshape(shape = var_4530, x = query_normed_23_cast_fp16)[name = string("mh_q_67_cast_fp16")];
+            tensor<int32, [4]> var_4532 = const()[name = string("op_4532"), val = tensor<int32, [4]>([1, 8, 128, -1])];
+            tensor<fp16, [1, 8, 128, 1]> mh_k_45_cast_fp16 = reshape(shape = var_4532, x = current_key_normed_23_cast_fp16)[name = string("mh_k_45_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> var_4536_cast_fp16 = mul(x = mh_q_67_cast_fp16, y = cos_1_cast_fp16)[name = string("op_4536_cast_fp16")];
+            tensor<int32, [4]> var_4541_begin_0 = const()[name = string("op_4541_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_4541_end_0 = const()[name = string("op_4541_end_0"), val = tensor<int32, [4]>([1, 16, 64, 1])];
+            tensor<bool, [4]> var_4541_end_mask_0 = const()[name = string("op_4541_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 16, 64, 1]> var_4541_cast_fp16 = slice_by_index(begin = var_4541_begin_0, end = var_4541_end_0, end_mask = var_4541_end_mask_0, x = mh_q_67_cast_fp16)[name = string("op_4541_cast_fp16")];
+            tensor<int32, [4]> var_4547_begin_0 = const()[name = string("op_4547_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_4547_end_0 = const()[name = string("op_4547_end_0"), val = tensor<int32, [4]>([1, 16, 128, 1])];
+            tensor<bool, [4]> var_4547_end_mask_0 = const()[name = string("op_4547_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 16, 64, 1]> var_4547_cast_fp16 = slice_by_index(begin = var_4547_begin_0, end = var_4547_end_0, end_mask = var_4547_end_mask_0, x = mh_q_67_cast_fp16)[name = string("op_4547_cast_fp16")];
+            fp16 const_270_promoted_to_fp16 = const()[name = string("const_270_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 16, 64, 1]> var_4549_cast_fp16 = mul(x = var_4547_cast_fp16, y = const_270_promoted_to_fp16)[name = string("op_4549_cast_fp16")];
+            bool var_4551_interleave_0 = const()[name = string("op_4551_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 1]> var_4551_cast_fp16 = concat(axis = var_4429, interleave = var_4551_interleave_0, values = (var_4549_cast_fp16, var_4541_cast_fp16))[name = string("op_4551_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> var_4552_cast_fp16 = mul(x = var_4551_cast_fp16, y = sin_1_cast_fp16)[name = string("op_4552_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> mh_q_69_cast_fp16 = add(x = var_4536_cast_fp16, y = var_4552_cast_fp16)[name = string("mh_q_69_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> var_4554_cast_fp16 = mul(x = mh_k_45_cast_fp16, y = cos_1_cast_fp16)[name = string("op_4554_cast_fp16")];
+            tensor<int32, [4]> var_4559_begin_0 = const()[name = string("op_4559_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_4559_end_0 = const()[name = string("op_4559_end_0"), val = tensor<int32, [4]>([1, 8, 64, 1])];
+            tensor<bool, [4]> var_4559_end_mask_0 = const()[name = string("op_4559_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 8, 64, 1]> var_4559_cast_fp16 = slice_by_index(begin = var_4559_begin_0, end = var_4559_end_0, end_mask = var_4559_end_mask_0, x = mh_k_45_cast_fp16)[name = string("op_4559_cast_fp16")];
+            tensor<int32, [4]> var_4565_begin_0 = const()[name = string("op_4565_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_4565_end_0 = const()[name = string("op_4565_end_0"), val = tensor<int32, [4]>([1, 8, 128, 1])];
+            tensor<bool, [4]> var_4565_end_mask_0 = const()[name = string("op_4565_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 8, 64, 1]> var_4565_cast_fp16 = slice_by_index(begin = var_4565_begin_0, end = var_4565_end_0, end_mask = var_4565_end_mask_0, x = mh_k_45_cast_fp16)[name = string("op_4565_cast_fp16")];
+            fp16 const_273_promoted_to_fp16 = const()[name = string("const_273_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 64, 1]> var_4567_cast_fp16 = mul(x = var_4565_cast_fp16, y = const_273_promoted_to_fp16)[name = string("op_4567_cast_fp16")];
+            bool var_4569_interleave_0 = const()[name = string("op_4569_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 128, 1]> var_4569_cast_fp16 = concat(axis = var_4429, interleave = var_4569_interleave_0, values = (var_4567_cast_fp16, var_4559_cast_fp16))[name = string("op_4569_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> var_4570_cast_fp16 = mul(x = var_4569_cast_fp16, y = sin_1_cast_fp16)[name = string("op_4570_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> mh_k_47_cast_fp16 = add(x = var_4554_cast_fp16, y = var_4570_cast_fp16)[name = string("mh_k_47_cast_fp16")];
+            tensor<int32, [4]> var_4574 = const()[name = string("op_4574"), val = tensor<int32, [4]>([1, 1024, 1, 1])];
+            tensor<fp16, [1, 1024, 1, 1]> current_key_47_cast_fp16 = reshape(shape = var_4574, x = mh_k_47_cast_fp16)[name = string("current_key_47_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_4581_cast_fp16 = mul(x = var_101_cast_fp16_11, y = var_323_cast_fp16)[name = string("op_4581_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_4582_cast_fp16 = mul(x = current_key_47_cast_fp16, y = var_321_cast_fp16)[name = string("op_4582_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> key_69_cast_fp16 = add(x = var_4581_cast_fp16, y = var_4582_cast_fp16)[name = string("key_69_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_4585_cast_fp16 = mul(x = var_132_cast_fp16_11, y = var_323_cast_fp16)[name = string("op_4585_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_4586_cast_fp16 = mul(x = current_value_23_cast_fp16, y = var_321_cast_fp16)[name = string("op_4586_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> value_45_cast_fp16 = add(x = var_4585_cast_fp16, y = var_4586_cast_fp16)[name = string("value_45_cast_fp16")];
+            tensor<int32, [4]> var_4590 = const()[name = string("op_4590"), val = tensor<int32, [4]>([1, 8, 128, 256])];
+            tensor<fp16, [1, 8, 128, 256]> key_heads_45_cast_fp16 = reshape(shape = var_4590, x = key_69_cast_fp16)[name = string("key_heads_45_cast_fp16")];
+            tensor<int32, [4]> var_4592 = const()[name = string("op_4592"), val = tensor<int32, [4]>([1, 8, 128, 256])];
+            tensor<fp16, [1, 8, 128, 256]> value_heads_45_cast_fp16 = reshape(shape = var_4592, x = value_45_cast_fp16)[name = string("value_heads_45_cast_fp16")];
+            tensor<int32, [4]> var_4595_begin_0 = const()[name = string("op_4595_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_4595_end_0 = const()[name = string("op_4595_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_4595_end_mask_0 = const()[name = string("op_4595_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_4595_cast_fp16 = slice_by_index(begin = var_4595_begin_0, end = var_4595_end_0, end_mask = var_4595_end_mask_0, x = key_heads_45_cast_fp16)[name = string("op_4595_cast_fp16")];
+            tensor<int32, [4]> var_4599_begin_0 = const()[name = string("op_4599_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_4599_end_0 = const()[name = string("op_4599_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_4599_end_mask_0 = const()[name = string("op_4599_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_4599_cast_fp16 = slice_by_index(begin = var_4599_begin_0, end = var_4599_end_0, end_mask = var_4599_end_mask_0, x = value_heads_45_cast_fp16)[name = string("op_4599_cast_fp16")];
+            tensor<int32, [4]> var_4611_begin_0 = const()[name = string("op_4611_begin_0"), val = tensor<int32, [4]>([0, 1, 0, 0])];
+            tensor<int32, [4]> var_4611_end_0 = const()[name = string("op_4611_end_0"), val = tensor<int32, [4]>([1, 2, 128, 256])];
+            tensor<bool, [4]> var_4611_end_mask_0 = const()[name = string("op_4611_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_4611_cast_fp16 = slice_by_index(begin = var_4611_begin_0, end = var_4611_end_0, end_mask = var_4611_end_mask_0, x = key_heads_45_cast_fp16)[name = string("op_4611_cast_fp16")];
+            tensor<int32, [4]> var_4615_begin_0 = const()[name = string("op_4615_begin_0"), val = tensor<int32, [4]>([0, 1, 0, 0])];
+            tensor<int32, [4]> var_4615_end_0 = const()[name = string("op_4615_end_0"), val = tensor<int32, [4]>([1, 2, 128, 256])];
+            tensor<bool, [4]> var_4615_end_mask_0 = const()[name = string("op_4615_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_4615_cast_fp16 = slice_by_index(begin = var_4615_begin_0, end = var_4615_end_0, end_mask = var_4615_end_mask_0, x = value_heads_45_cast_fp16)[name = string("op_4615_cast_fp16")];
+            tensor<int32, [4]> var_4627_begin_0 = const()[name = string("op_4627_begin_0"), val = tensor<int32, [4]>([0, 2, 0, 0])];
+            tensor<int32, [4]> var_4627_end_0 = const()[name = string("op_4627_end_0"), val = tensor<int32, [4]>([1, 3, 128, 256])];
+            tensor<bool, [4]> var_4627_end_mask_0 = const()[name = string("op_4627_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_4627_cast_fp16 = slice_by_index(begin = var_4627_begin_0, end = var_4627_end_0, end_mask = var_4627_end_mask_0, x = key_heads_45_cast_fp16)[name = string("op_4627_cast_fp16")];
+            tensor<int32, [4]> var_4631_begin_0 = const()[name = string("op_4631_begin_0"), val = tensor<int32, [4]>([0, 2, 0, 0])];
+            tensor<int32, [4]> var_4631_end_0 = const()[name = string("op_4631_end_0"), val = tensor<int32, [4]>([1, 3, 128, 256])];
+            tensor<bool, [4]> var_4631_end_mask_0 = const()[name = string("op_4631_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_4631_cast_fp16 = slice_by_index(begin = var_4631_begin_0, end = var_4631_end_0, end_mask = var_4631_end_mask_0, x = value_heads_45_cast_fp16)[name = string("op_4631_cast_fp16")];
+            tensor<int32, [4]> var_4643_begin_0 = const()[name = string("op_4643_begin_0"), val = tensor<int32, [4]>([0, 3, 0, 0])];
+            tensor<int32, [4]> var_4643_end_0 = const()[name = string("op_4643_end_0"), val = tensor<int32, [4]>([1, 4, 128, 256])];
+            tensor<bool, [4]> var_4643_end_mask_0 = const()[name = string("op_4643_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_4643_cast_fp16 = slice_by_index(begin = var_4643_begin_0, end = var_4643_end_0, end_mask = var_4643_end_mask_0, x = key_heads_45_cast_fp16)[name = string("op_4643_cast_fp16")];
+            tensor<int32, [4]> var_4647_begin_0 = const()[name = string("op_4647_begin_0"), val = tensor<int32, [4]>([0, 3, 0, 0])];
+            tensor<int32, [4]> var_4647_end_0 = const()[name = string("op_4647_end_0"), val = tensor<int32, [4]>([1, 4, 128, 256])];
+            tensor<bool, [4]> var_4647_end_mask_0 = const()[name = string("op_4647_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_4647_cast_fp16 = slice_by_index(begin = var_4647_begin_0, end = var_4647_end_0, end_mask = var_4647_end_mask_0, x = value_heads_45_cast_fp16)[name = string("op_4647_cast_fp16")];
+            tensor<int32, [4]> var_4659_begin_0 = const()[name = string("op_4659_begin_0"), val = tensor<int32, [4]>([0, 4, 0, 0])];
+            tensor<int32, [4]> var_4659_end_0 = const()[name = string("op_4659_end_0"), val = tensor<int32, [4]>([1, 5, 128, 256])];
+            tensor<bool, [4]> var_4659_end_mask_0 = const()[name = string("op_4659_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_4659_cast_fp16 = slice_by_index(begin = var_4659_begin_0, end = var_4659_end_0, end_mask = var_4659_end_mask_0, x = key_heads_45_cast_fp16)[name = string("op_4659_cast_fp16")];
+            tensor<int32, [4]> var_4663_begin_0 = const()[name = string("op_4663_begin_0"), val = tensor<int32, [4]>([0, 4, 0, 0])];
+            tensor<int32, [4]> var_4663_end_0 = const()[name = string("op_4663_end_0"), val = tensor<int32, [4]>([1, 5, 128, 256])];
+            tensor<bool, [4]> var_4663_end_mask_0 = const()[name = string("op_4663_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_4663_cast_fp16 = slice_by_index(begin = var_4663_begin_0, end = var_4663_end_0, end_mask = var_4663_end_mask_0, x = value_heads_45_cast_fp16)[name = string("op_4663_cast_fp16")];
+            tensor<int32, [4]> var_4675_begin_0 = const()[name = string("op_4675_begin_0"), val = tensor<int32, [4]>([0, 5, 0, 0])];
+            tensor<int32, [4]> var_4675_end_0 = const()[name = string("op_4675_end_0"), val = tensor<int32, [4]>([1, 6, 128, 256])];
+            tensor<bool, [4]> var_4675_end_mask_0 = const()[name = string("op_4675_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_4675_cast_fp16 = slice_by_index(begin = var_4675_begin_0, end = var_4675_end_0, end_mask = var_4675_end_mask_0, x = key_heads_45_cast_fp16)[name = string("op_4675_cast_fp16")];
+            tensor<int32, [4]> var_4679_begin_0 = const()[name = string("op_4679_begin_0"), val = tensor<int32, [4]>([0, 5, 0, 0])];
+            tensor<int32, [4]> var_4679_end_0 = const()[name = string("op_4679_end_0"), val = tensor<int32, [4]>([1, 6, 128, 256])];
+            tensor<bool, [4]> var_4679_end_mask_0 = const()[name = string("op_4679_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_4679_cast_fp16 = slice_by_index(begin = var_4679_begin_0, end = var_4679_end_0, end_mask = var_4679_end_mask_0, x = value_heads_45_cast_fp16)[name = string("op_4679_cast_fp16")];
+            tensor<int32, [4]> var_4691_begin_0 = const()[name = string("op_4691_begin_0"), val = tensor<int32, [4]>([0, 6, 0, 0])];
+            tensor<int32, [4]> var_4691_end_0 = const()[name = string("op_4691_end_0"), val = tensor<int32, [4]>([1, 7, 128, 256])];
+            tensor<bool, [4]> var_4691_end_mask_0 = const()[name = string("op_4691_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_4691_cast_fp16 = slice_by_index(begin = var_4691_begin_0, end = var_4691_end_0, end_mask = var_4691_end_mask_0, x = key_heads_45_cast_fp16)[name = string("op_4691_cast_fp16")];
+            tensor<int32, [4]> var_4695_begin_0 = const()[name = string("op_4695_begin_0"), val = tensor<int32, [4]>([0, 6, 0, 0])];
+            tensor<int32, [4]> var_4695_end_0 = const()[name = string("op_4695_end_0"), val = tensor<int32, [4]>([1, 7, 128, 256])];
+            tensor<bool, [4]> var_4695_end_mask_0 = const()[name = string("op_4695_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_4695_cast_fp16 = slice_by_index(begin = var_4695_begin_0, end = var_4695_end_0, end_mask = var_4695_end_mask_0, x = value_heads_45_cast_fp16)[name = string("op_4695_cast_fp16")];
+            tensor<int32, [4]> var_4707_begin_0 = const()[name = string("op_4707_begin_0"), val = tensor<int32, [4]>([0, 7, 0, 0])];
+            tensor<int32, [4]> var_4707_end_0 = const()[name = string("op_4707_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_4707_end_mask_0 = const()[name = string("op_4707_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_4707_cast_fp16 = slice_by_index(begin = var_4707_begin_0, end = var_4707_end_0, end_mask = var_4707_end_mask_0, x = key_heads_45_cast_fp16)[name = string("op_4707_cast_fp16")];
+            tensor<int32, [4]> var_4711_begin_0 = const()[name = string("op_4711_begin_0"), val = tensor<int32, [4]>([0, 7, 0, 0])];
+            tensor<int32, [4]> var_4711_end_0 = const()[name = string("op_4711_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_4711_end_mask_0 = const()[name = string("op_4711_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_4711_cast_fp16 = slice_by_index(begin = var_4711_begin_0, end = var_4711_end_0, end_mask = var_4711_end_mask_0, x = value_heads_45_cast_fp16)[name = string("op_4711_cast_fp16")];
+            bool key_heads_47_interleave_0 = const()[name = string("key_heads_47_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 256]> key_heads_47_cast_fp16 = concat(axis = var_4437, interleave = key_heads_47_interleave_0, values = (var_4595_cast_fp16, var_4595_cast_fp16, var_4611_cast_fp16, var_4611_cast_fp16, var_4627_cast_fp16, var_4627_cast_fp16, var_4643_cast_fp16, var_4643_cast_fp16, var_4659_cast_fp16, var_4659_cast_fp16, var_4675_cast_fp16, var_4675_cast_fp16, var_4691_cast_fp16, var_4691_cast_fp16, var_4707_cast_fp16, var_4707_cast_fp16))[name = string("key_heads_47_cast_fp16")];
+            bool value_heads_47_interleave_0 = const()[name = string("value_heads_47_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 256]> value_heads_47_cast_fp16 = concat(axis = var_4437, interleave = value_heads_47_interleave_0, values = (var_4599_cast_fp16, var_4599_cast_fp16, var_4615_cast_fp16, var_4615_cast_fp16, var_4631_cast_fp16, var_4631_cast_fp16, var_4647_cast_fp16, var_4647_cast_fp16, var_4663_cast_fp16, var_4663_cast_fp16, var_4679_cast_fp16, var_4679_cast_fp16, var_4695_cast_fp16, var_4695_cast_fp16, var_4711_cast_fp16, var_4711_cast_fp16))[name = string("value_heads_47_cast_fp16")];
+            fp16 var_4734_to_fp16 = const()[name = string("op_4734_to_fp16"), val = fp16(0x1.6ap-4)];
+            tensor<fp16, [1, 16, 128, 1]> var_4735_cast_fp16 = mul(x = mh_q_69_cast_fp16, y = var_4734_to_fp16)[name = string("op_4735_cast_fp16")];
+            bool mh_w_45_transpose_x_0 = const()[name = string("mh_w_45_transpose_x_0"), val = bool(true)];
+            bool mh_w_45_transpose_y_0 = const()[name = string("mh_w_45_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 1, 256]> mh_w_45_cast_fp16 = matmul(transpose_x = mh_w_45_transpose_x_0, transpose_y = mh_w_45_transpose_y_0, x = var_4735_cast_fp16, y = key_heads_47_cast_fp16)[name = string("mh_w_45_cast_fp16")];
+            tensor<fp16, [1, 16, 1, 256]> mh_w_47_cast_fp16 = add(x = mh_w_45_cast_fp16, y = var_487_cast_fp16)[name = string("mh_w_47_cast_fp16")];
+            tensor<fp16, [1, 16, 1, 256]> var_4747_cast_fp16 = softmax(axis = var_4419, x = mh_w_47_cast_fp16)[name = string("op_4747_cast_fp16")];
+            bool attn_23_transpose_x_0 = const()[name = string("attn_23_transpose_x_0"), val = bool(false)];
+            bool attn_23_transpose_y_0 = const()[name = string("attn_23_transpose_y_0"), val = bool(true)];
+            tensor<fp16, [1, 16, 128, 1]> attn_23_cast_fp16 = matmul(transpose_x = attn_23_transpose_x_0, transpose_y = attn_23_transpose_y_0, x = value_heads_47_cast_fp16, y = var_4747_cast_fp16)[name = string("attn_23_cast_fp16")];
+            tensor<int32, [4]> var_4752 = const()[name = string("op_4752"), val = tensor<int32, [4]>([1, -1, 1, 1])];
+            tensor<fp16, [1, 2048, 1, 1]> input_89_cast_fp16 = reshape(shape = var_4752, x = attn_23_cast_fp16)[name = string("input_89_cast_fp16")];
+            string obj_99_pad_type_0 = const()[name = string("obj_99_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> obj_99_strides_0 = const()[name = string("obj_99_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> obj_99_pad_0 = const()[name = string("obj_99_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> obj_99_dilations_0 = const()[name = string("obj_99_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 obj_99_groups_0 = const()[name = string("obj_99_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 2048, 1, 1]> layers_11_self_attn_o_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(177390336))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(179487552))))[name = string("layers_11_self_attn_o_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> obj_99_cast_fp16 = conv(bias = layers_0_self_attn_v_proj_bias_to_fp16, dilations = obj_99_dilations_0, groups = obj_99_groups_0, pad = obj_99_pad_0, pad_type = obj_99_pad_type_0, strides = obj_99_strides_0, weight = layers_11_self_attn_o_proj_weight_to_fp16_palettized, x = input_89_cast_fp16)[name = string("obj_99_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> inputs_93_cast_fp16 = add(x = inputs_87_cast_fp16, y = obj_99_cast_fp16)[name = string("inputs_93_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> inputs_sq_95_cast_fp16 = mul(x = inputs_93_cast_fp16, y = inputs_93_cast_fp16)[name = string("inputs_sq_95_cast_fp16")];
+            tensor<int32, [1]> variance_95_axes_0 = const()[name = string("variance_95_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_95_keep_dims_0 = const()[name = string("variance_95_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 1, 1, 1]> variance_95_cast_fp16 = reduce_mean(axes = variance_95_axes_0, keep_dims = variance_95_keep_dims_0, x = inputs_sq_95_cast_fp16)[name = string("variance_95_cast_fp16")];
+            fp16 var_4770_to_fp16 = const()[name = string("op_4770_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 1, 1]> var_4771_cast_fp16 = add(x = variance_95_cast_fp16, y = var_4770_to_fp16)[name = string("op_4771_cast_fp16")];
+            fp32 var_4772_epsilon_0 = const()[name = string("op_4772_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 1, 1, 1]> var_4772_cast_fp16 = rsqrt(epsilon = var_4772_epsilon_0, x = var_4771_cast_fp16)[name = string("op_4772_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> hidden_states_117_cast_fp16 = mul(x = inputs_93_cast_fp16, y = var_4772_cast_fp16)[name = string("hidden_states_117_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> w_95_to_fp16 = const()[name = string("w_95_to_fp16"), val = tensor<fp16, [1, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(179488128)))];
+            tensor<fp16, [1, 1024, 1, 1]> input_91_cast_fp16 = mul(x = w_95_to_fp16, y = hidden_states_117_cast_fp16)[name = string("input_91_cast_fp16")];
+            string input_93_pad_type_0 = const()[name = string("input_93_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> input_93_strides_0 = const()[name = string("input_93_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> input_93_pad_0 = const()[name = string("input_93_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> input_93_dilations_0 = const()[name = string("input_93_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 input_93_groups_0 = const()[name = string("input_93_groups_0"), val = int32(1)];
+            tensor<fp16, [3072, 1024, 1, 1]> layers_11_mlp_gate_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [3072, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(179490240))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(182636032))))[name = string("layers_11_mlp_gate_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 3072, 1, 1]> input_93_cast_fp16 = conv(dilations = input_93_dilations_0, groups = input_93_groups_0, pad = input_93_pad_0, pad_type = input_93_pad_type_0, strides = input_93_strides_0, weight = layers_11_mlp_gate_proj_weight_to_fp16_palettized, x = input_91_cast_fp16)[name = string("input_93_cast_fp16")];
+            tensor<fp16, [1, 3072, 1, 1]> var_4786_cast_fp16 = silu(x = input_93_cast_fp16)[name = string("op_4786_cast_fp16")];
+            string var_4792_pad_type_0 = const()[name = string("op_4792_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_4792_strides_0 = const()[name = string("op_4792_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_4792_pad_0 = const()[name = string("op_4792_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_4792_dilations_0 = const()[name = string("op_4792_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_4792_groups_0 = const()[name = string("op_4792_groups_0"), val = int32(1)];
+            tensor<fp16, [3072, 1024, 1, 1]> layers_11_mlp_up_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [3072, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(182636608))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(185782400))))[name = string("layers_11_mlp_up_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 3072, 1, 1]> var_4792_cast_fp16 = conv(dilations = var_4792_dilations_0, groups = var_4792_groups_0, pad = var_4792_pad_0, pad_type = var_4792_pad_type_0, strides = var_4792_strides_0, weight = layers_11_mlp_up_proj_weight_to_fp16_palettized, x = input_91_cast_fp16)[name = string("op_4792_cast_fp16")];
+            tensor<fp16, [1, 3072, 1, 1]> input_95_cast_fp16 = mul(x = var_4786_cast_fp16, y = var_4792_cast_fp16)[name = string("input_95_cast_fp16")];
+            string hidden_states_119_pad_type_0 = const()[name = string("hidden_states_119_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> hidden_states_119_strides_0 = const()[name = string("hidden_states_119_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> hidden_states_119_pad_0 = const()[name = string("hidden_states_119_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> hidden_states_119_dilations_0 = const()[name = string("hidden_states_119_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 hidden_states_119_groups_0 = const()[name = string("hidden_states_119_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 3072, 1, 1]> layers_11_mlp_down_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 3072, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(185782976))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(188928768))))[name = string("layers_11_mlp_down_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> hidden_states_119_cast_fp16 = conv(dilations = hidden_states_119_dilations_0, groups = hidden_states_119_groups_0, pad = hidden_states_119_pad_0, pad_type = hidden_states_119_pad_type_0, strides = hidden_states_119_strides_0, weight = layers_11_mlp_down_proj_weight_to_fp16_palettized, x = input_95_cast_fp16)[name = string("hidden_states_119_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> inputs_95_cast_fp16 = add(x = inputs_93_cast_fp16, y = hidden_states_119_cast_fp16)[name = string("inputs_95_cast_fp16")];
+            int32 var_4806 = const()[name = string("op_4806"), val = int32(3)];
+            int32 var_4816 = const()[name = string("op_4816"), val = int32(-2)];
+            int32 var_4824 = const()[name = string("op_4824"), val = int32(1)];
+            tensor<fp16, [1, 1024, 1, 1]> inputs_sq_97_cast_fp16 = mul(x = inputs_95_cast_fp16, y = inputs_95_cast_fp16)[name = string("inputs_sq_97_cast_fp16")];
+            tensor<int32, [1]> variance_97_axes_0 = const()[name = string("variance_97_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_97_keep_dims_0 = const()[name = string("variance_97_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 1, 1, 1]> variance_97_cast_fp16 = reduce_mean(axes = variance_97_axes_0, keep_dims = variance_97_keep_dims_0, x = inputs_sq_97_cast_fp16)[name = string("variance_97_cast_fp16")];
+            fp16 var_4836_to_fp16 = const()[name = string("op_4836_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 1, 1]> var_4837_cast_fp16 = add(x = variance_97_cast_fp16, y = var_4836_to_fp16)[name = string("op_4837_cast_fp16")];
+            fp32 var_4838_epsilon_0 = const()[name = string("op_4838_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 1, 1, 1]> var_4838_cast_fp16 = rsqrt(epsilon = var_4838_epsilon_0, x = var_4837_cast_fp16)[name = string("op_4838_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> hidden_states_121_cast_fp16 = mul(x = inputs_95_cast_fp16, y = var_4838_cast_fp16)[name = string("hidden_states_121_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> w_97_to_fp16 = const()[name = string("w_97_to_fp16"), val = tensor<fp16, [1, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(188929344)))];
+            tensor<fp16, [1, 1024, 1, 1]> obj_101_cast_fp16 = mul(x = w_97_to_fp16, y = hidden_states_121_cast_fp16)[name = string("obj_101_cast_fp16")];
+            string query_73_pad_type_0 = const()[name = string("query_73_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> query_73_strides_0 = const()[name = string("query_73_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> query_73_pad_0 = const()[name = string("query_73_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> query_73_dilations_0 = const()[name = string("query_73_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 query_73_groups_0 = const()[name = string("query_73_groups_0"), val = int32(1)];
+            tensor<fp16, [2048, 1024, 1, 1]> layers_12_self_attn_q_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [2048, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(188931456))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(191028672))))[name = string("layers_12_self_attn_q_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1, 1]> query_73_cast_fp16 = conv(bias = layers_0_self_attn_q_proj_bias_to_fp16, dilations = query_73_dilations_0, groups = query_73_groups_0, pad = query_73_pad_0, pad_type = query_73_pad_type_0, strides = query_73_strides_0, weight = layers_12_self_attn_q_proj_weight_to_fp16_palettized, x = obj_101_cast_fp16)[name = string("query_73_cast_fp16")];
+            string current_key_49_pad_type_0 = const()[name = string("current_key_49_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> current_key_49_strides_0 = const()[name = string("current_key_49_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> current_key_49_pad_0 = const()[name = string("current_key_49_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> current_key_49_dilations_0 = const()[name = string("current_key_49_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 current_key_49_groups_0 = const()[name = string("current_key_49_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_12_self_attn_k_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(191029248))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(192077888))))[name = string("layers_12_self_attn_k_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> current_key_49_cast_fp16 = conv(dilations = current_key_49_dilations_0, groups = current_key_49_groups_0, pad = current_key_49_pad_0, pad_type = current_key_49_pad_type_0, strides = current_key_49_strides_0, weight = layers_12_self_attn_k_proj_weight_to_fp16_palettized, x = obj_101_cast_fp16)[name = string("current_key_49_cast_fp16")];
+            string current_value_25_pad_type_0 = const()[name = string("current_value_25_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> current_value_25_strides_0 = const()[name = string("current_value_25_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> current_value_25_pad_0 = const()[name = string("current_value_25_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> current_value_25_dilations_0 = const()[name = string("current_value_25_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 current_value_25_groups_0 = const()[name = string("current_value_25_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_12_self_attn_v_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(192078464))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(193127104))))[name = string("layers_12_self_attn_v_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> current_value_25_cast_fp16 = conv(bias = layers_0_self_attn_v_proj_bias_to_fp16, dilations = current_value_25_dilations_0, groups = current_value_25_groups_0, pad = current_value_25_pad_0, pad_type = current_value_25_pad_type_0, strides = current_value_25_strides_0, weight = layers_12_self_attn_v_proj_weight_to_fp16_palettized, x = obj_101_cast_fp16)[name = string("current_value_25_cast_fp16")];
+            tensor<int32, [4]> var_4875 = const()[name = string("op_4875"), val = tensor<int32, [4]>([16, 128, 1, 1])];
+            tensor<fp16, [16, 128, 1, 1]> inputs_97_cast_fp16 = reshape(shape = var_4875, x = query_73_cast_fp16)[name = string("inputs_97_cast_fp16")];
+            tensor<fp16, [16, 128, 1, 1]> inputs_sq_99_cast_fp16 = mul(x = inputs_97_cast_fp16, y = inputs_97_cast_fp16)[name = string("inputs_sq_99_cast_fp16")];
+            tensor<int32, [1]> variance_99_axes_0 = const()[name = string("variance_99_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_99_keep_dims_0 = const()[name = string("variance_99_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [16, 1, 1, 1]> variance_99_cast_fp16 = reduce_mean(axes = variance_99_axes_0, keep_dims = variance_99_keep_dims_0, x = inputs_sq_99_cast_fp16)[name = string("variance_99_cast_fp16")];
+            fp16 var_4881_to_fp16 = const()[name = string("op_4881_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [16, 1, 1, 1]> var_4882_cast_fp16 = add(x = variance_99_cast_fp16, y = var_4881_to_fp16)[name = string("op_4882_cast_fp16")];
+            fp32 var_4883_epsilon_0 = const()[name = string("op_4883_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [16, 1, 1, 1]> var_4883_cast_fp16 = rsqrt(epsilon = var_4883_epsilon_0, x = var_4882_cast_fp16)[name = string("op_4883_cast_fp16")];
+            tensor<fp16, [16, 128, 1, 1]> hidden_states_123_cast_fp16 = mul(x = inputs_97_cast_fp16, y = var_4883_cast_fp16)[name = string("hidden_states_123_cast_fp16")];
+            tensor<fp16, [1, 128, 1, 1]> w_99_to_fp16 = const()[name = string("w_99_to_fp16"), val = tensor<fp16, [1, 128, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(193127680)))];
+            tensor<fp16, [16, 128, 1, 1]> query_normed_25_cast_fp16 = mul(x = w_99_to_fp16, y = hidden_states_123_cast_fp16)[name = string("query_normed_25_cast_fp16")];
+            tensor<int32, [4]> var_4891 = const()[name = string("op_4891"), val = tensor<int32, [4]>([8, 128, 1, 1])];
+            tensor<fp16, [8, 128, 1, 1]> inputs_99_cast_fp16 = reshape(shape = var_4891, x = current_key_49_cast_fp16)[name = string("inputs_99_cast_fp16")];
+            tensor<fp16, [8, 128, 1, 1]> inputs_sq_101_cast_fp16 = mul(x = inputs_99_cast_fp16, y = inputs_99_cast_fp16)[name = string("inputs_sq_101_cast_fp16")];
+            tensor<int32, [1]> variance_101_axes_0 = const()[name = string("variance_101_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_101_keep_dims_0 = const()[name = string("variance_101_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [8, 1, 1, 1]> variance_101_cast_fp16 = reduce_mean(axes = variance_101_axes_0, keep_dims = variance_101_keep_dims_0, x = inputs_sq_101_cast_fp16)[name = string("variance_101_cast_fp16")];
+            fp16 var_4897_to_fp16 = const()[name = string("op_4897_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [8, 1, 1, 1]> var_4898_cast_fp16 = add(x = variance_101_cast_fp16, y = var_4897_to_fp16)[name = string("op_4898_cast_fp16")];
+            fp32 var_4899_epsilon_0 = const()[name = string("op_4899_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [8, 1, 1, 1]> var_4899_cast_fp16 = rsqrt(epsilon = var_4899_epsilon_0, x = var_4898_cast_fp16)[name = string("op_4899_cast_fp16")];
+            tensor<fp16, [8, 128, 1, 1]> hidden_states_125_cast_fp16 = mul(x = inputs_99_cast_fp16, y = var_4899_cast_fp16)[name = string("hidden_states_125_cast_fp16")];
+            tensor<fp16, [1, 128, 1, 1]> w_101_to_fp16 = const()[name = string("w_101_to_fp16"), val = tensor<fp16, [1, 128, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(193128000)))];
+            tensor<fp16, [8, 128, 1, 1]> current_key_normed_25_cast_fp16 = mul(x = w_101_to_fp16, y = hidden_states_125_cast_fp16)[name = string("current_key_normed_25_cast_fp16")];
+            tensor<int32, [4]> var_4917 = const()[name = string("op_4917"), val = tensor<int32, [4]>([1, 16, 128, -1])];
+            tensor<fp16, [1, 16, 128, 1]> mh_q_73_cast_fp16 = reshape(shape = var_4917, x = query_normed_25_cast_fp16)[name = string("mh_q_73_cast_fp16")];
+            tensor<int32, [4]> var_4919 = const()[name = string("op_4919"), val = tensor<int32, [4]>([1, 8, 128, -1])];
+            tensor<fp16, [1, 8, 128, 1]> mh_k_49_cast_fp16 = reshape(shape = var_4919, x = current_key_normed_25_cast_fp16)[name = string("mh_k_49_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> var_4923_cast_fp16 = mul(x = mh_q_73_cast_fp16, y = cos_1_cast_fp16)[name = string("op_4923_cast_fp16")];
+            tensor<int32, [4]> var_4928_begin_0 = const()[name = string("op_4928_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_4928_end_0 = const()[name = string("op_4928_end_0"), val = tensor<int32, [4]>([1, 16, 64, 1])];
+            tensor<bool, [4]> var_4928_end_mask_0 = const()[name = string("op_4928_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 16, 64, 1]> var_4928_cast_fp16 = slice_by_index(begin = var_4928_begin_0, end = var_4928_end_0, end_mask = var_4928_end_mask_0, x = mh_q_73_cast_fp16)[name = string("op_4928_cast_fp16")];
+            tensor<int32, [4]> var_4934_begin_0 = const()[name = string("op_4934_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_4934_end_0 = const()[name = string("op_4934_end_0"), val = tensor<int32, [4]>([1, 16, 128, 1])];
+            tensor<bool, [4]> var_4934_end_mask_0 = const()[name = string("op_4934_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 16, 64, 1]> var_4934_cast_fp16 = slice_by_index(begin = var_4934_begin_0, end = var_4934_end_0, end_mask = var_4934_end_mask_0, x = mh_q_73_cast_fp16)[name = string("op_4934_cast_fp16")];
+            fp16 const_293_promoted_to_fp16 = const()[name = string("const_293_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 16, 64, 1]> var_4936_cast_fp16 = mul(x = var_4934_cast_fp16, y = const_293_promoted_to_fp16)[name = string("op_4936_cast_fp16")];
+            bool var_4938_interleave_0 = const()[name = string("op_4938_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 1]> var_4938_cast_fp16 = concat(axis = var_4816, interleave = var_4938_interleave_0, values = (var_4936_cast_fp16, var_4928_cast_fp16))[name = string("op_4938_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> var_4939_cast_fp16 = mul(x = var_4938_cast_fp16, y = sin_1_cast_fp16)[name = string("op_4939_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> mh_q_75_cast_fp16 = add(x = var_4923_cast_fp16, y = var_4939_cast_fp16)[name = string("mh_q_75_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> var_4941_cast_fp16 = mul(x = mh_k_49_cast_fp16, y = cos_1_cast_fp16)[name = string("op_4941_cast_fp16")];
+            tensor<int32, [4]> var_4946_begin_0 = const()[name = string("op_4946_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_4946_end_0 = const()[name = string("op_4946_end_0"), val = tensor<int32, [4]>([1, 8, 64, 1])];
+            tensor<bool, [4]> var_4946_end_mask_0 = const()[name = string("op_4946_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 8, 64, 1]> var_4946_cast_fp16 = slice_by_index(begin = var_4946_begin_0, end = var_4946_end_0, end_mask = var_4946_end_mask_0, x = mh_k_49_cast_fp16)[name = string("op_4946_cast_fp16")];
+            tensor<int32, [4]> var_4952_begin_0 = const()[name = string("op_4952_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_4952_end_0 = const()[name = string("op_4952_end_0"), val = tensor<int32, [4]>([1, 8, 128, 1])];
+            tensor<bool, [4]> var_4952_end_mask_0 = const()[name = string("op_4952_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 8, 64, 1]> var_4952_cast_fp16 = slice_by_index(begin = var_4952_begin_0, end = var_4952_end_0, end_mask = var_4952_end_mask_0, x = mh_k_49_cast_fp16)[name = string("op_4952_cast_fp16")];
+            fp16 const_296_promoted_to_fp16 = const()[name = string("const_296_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 64, 1]> var_4954_cast_fp16 = mul(x = var_4952_cast_fp16, y = const_296_promoted_to_fp16)[name = string("op_4954_cast_fp16")];
+            bool var_4956_interleave_0 = const()[name = string("op_4956_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 128, 1]> var_4956_cast_fp16 = concat(axis = var_4816, interleave = var_4956_interleave_0, values = (var_4954_cast_fp16, var_4946_cast_fp16))[name = string("op_4956_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> var_4957_cast_fp16 = mul(x = var_4956_cast_fp16, y = sin_1_cast_fp16)[name = string("op_4957_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> mh_k_51_cast_fp16 = add(x = var_4941_cast_fp16, y = var_4957_cast_fp16)[name = string("mh_k_51_cast_fp16")];
+            tensor<int32, [4]> var_4961 = const()[name = string("op_4961"), val = tensor<int32, [4]>([1, 1024, 1, 1])];
+            tensor<fp16, [1, 1024, 1, 1]> current_key_51_cast_fp16 = reshape(shape = var_4961, x = mh_k_51_cast_fp16)[name = string("current_key_51_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_4968_cast_fp16 = mul(x = var_101_cast_fp16_12, y = var_323_cast_fp16)[name = string("op_4968_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_4969_cast_fp16 = mul(x = current_key_51_cast_fp16, y = var_321_cast_fp16)[name = string("op_4969_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> key_75_cast_fp16 = add(x = var_4968_cast_fp16, y = var_4969_cast_fp16)[name = string("key_75_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_4972_cast_fp16 = mul(x = var_132_cast_fp16_12, y = var_323_cast_fp16)[name = string("op_4972_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_4973_cast_fp16 = mul(x = current_value_25_cast_fp16, y = var_321_cast_fp16)[name = string("op_4973_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> value_49_cast_fp16 = add(x = var_4972_cast_fp16, y = var_4973_cast_fp16)[name = string("value_49_cast_fp16")];
+            tensor<int32, [4]> var_4977 = const()[name = string("op_4977"), val = tensor<int32, [4]>([1, 8, 128, 256])];
+            tensor<fp16, [1, 8, 128, 256]> key_heads_49_cast_fp16 = reshape(shape = var_4977, x = key_75_cast_fp16)[name = string("key_heads_49_cast_fp16")];
+            tensor<int32, [4]> var_4979 = const()[name = string("op_4979"), val = tensor<int32, [4]>([1, 8, 128, 256])];
+            tensor<fp16, [1, 8, 128, 256]> value_heads_49_cast_fp16 = reshape(shape = var_4979, x = value_49_cast_fp16)[name = string("value_heads_49_cast_fp16")];
+            tensor<int32, [4]> var_4982_begin_0 = const()[name = string("op_4982_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_4982_end_0 = const()[name = string("op_4982_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_4982_end_mask_0 = const()[name = string("op_4982_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_4982_cast_fp16 = slice_by_index(begin = var_4982_begin_0, end = var_4982_end_0, end_mask = var_4982_end_mask_0, x = key_heads_49_cast_fp16)[name = string("op_4982_cast_fp16")];
+            tensor<int32, [4]> var_4986_begin_0 = const()[name = string("op_4986_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_4986_end_0 = const()[name = string("op_4986_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_4986_end_mask_0 = const()[name = string("op_4986_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_4986_cast_fp16 = slice_by_index(begin = var_4986_begin_0, end = var_4986_end_0, end_mask = var_4986_end_mask_0, x = value_heads_49_cast_fp16)[name = string("op_4986_cast_fp16")];
+            tensor<int32, [4]> var_4998_begin_0 = const()[name = string("op_4998_begin_0"), val = tensor<int32, [4]>([0, 1, 0, 0])];
+            tensor<int32, [4]> var_4998_end_0 = const()[name = string("op_4998_end_0"), val = tensor<int32, [4]>([1, 2, 128, 256])];
+            tensor<bool, [4]> var_4998_end_mask_0 = const()[name = string("op_4998_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_4998_cast_fp16 = slice_by_index(begin = var_4998_begin_0, end = var_4998_end_0, end_mask = var_4998_end_mask_0, x = key_heads_49_cast_fp16)[name = string("op_4998_cast_fp16")];
+            tensor<int32, [4]> var_5002_begin_0 = const()[name = string("op_5002_begin_0"), val = tensor<int32, [4]>([0, 1, 0, 0])];
+            tensor<int32, [4]> var_5002_end_0 = const()[name = string("op_5002_end_0"), val = tensor<int32, [4]>([1, 2, 128, 256])];
+            tensor<bool, [4]> var_5002_end_mask_0 = const()[name = string("op_5002_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_5002_cast_fp16 = slice_by_index(begin = var_5002_begin_0, end = var_5002_end_0, end_mask = var_5002_end_mask_0, x = value_heads_49_cast_fp16)[name = string("op_5002_cast_fp16")];
+            tensor<int32, [4]> var_5014_begin_0 = const()[name = string("op_5014_begin_0"), val = tensor<int32, [4]>([0, 2, 0, 0])];
+            tensor<int32, [4]> var_5014_end_0 = const()[name = string("op_5014_end_0"), val = tensor<int32, [4]>([1, 3, 128, 256])];
+            tensor<bool, [4]> var_5014_end_mask_0 = const()[name = string("op_5014_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_5014_cast_fp16 = slice_by_index(begin = var_5014_begin_0, end = var_5014_end_0, end_mask = var_5014_end_mask_0, x = key_heads_49_cast_fp16)[name = string("op_5014_cast_fp16")];
+            tensor<int32, [4]> var_5018_begin_0 = const()[name = string("op_5018_begin_0"), val = tensor<int32, [4]>([0, 2, 0, 0])];
+            tensor<int32, [4]> var_5018_end_0 = const()[name = string("op_5018_end_0"), val = tensor<int32, [4]>([1, 3, 128, 256])];
+            tensor<bool, [4]> var_5018_end_mask_0 = const()[name = string("op_5018_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_5018_cast_fp16 = slice_by_index(begin = var_5018_begin_0, end = var_5018_end_0, end_mask = var_5018_end_mask_0, x = value_heads_49_cast_fp16)[name = string("op_5018_cast_fp16")];
+            tensor<int32, [4]> var_5030_begin_0 = const()[name = string("op_5030_begin_0"), val = tensor<int32, [4]>([0, 3, 0, 0])];
+            tensor<int32, [4]> var_5030_end_0 = const()[name = string("op_5030_end_0"), val = tensor<int32, [4]>([1, 4, 128, 256])];
+            tensor<bool, [4]> var_5030_end_mask_0 = const()[name = string("op_5030_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_5030_cast_fp16 = slice_by_index(begin = var_5030_begin_0, end = var_5030_end_0, end_mask = var_5030_end_mask_0, x = key_heads_49_cast_fp16)[name = string("op_5030_cast_fp16")];
+            tensor<int32, [4]> var_5034_begin_0 = const()[name = string("op_5034_begin_0"), val = tensor<int32, [4]>([0, 3, 0, 0])];
+            tensor<int32, [4]> var_5034_end_0 = const()[name = string("op_5034_end_0"), val = tensor<int32, [4]>([1, 4, 128, 256])];
+            tensor<bool, [4]> var_5034_end_mask_0 = const()[name = string("op_5034_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_5034_cast_fp16 = slice_by_index(begin = var_5034_begin_0, end = var_5034_end_0, end_mask = var_5034_end_mask_0, x = value_heads_49_cast_fp16)[name = string("op_5034_cast_fp16")];
+            tensor<int32, [4]> var_5046_begin_0 = const()[name = string("op_5046_begin_0"), val = tensor<int32, [4]>([0, 4, 0, 0])];
+            tensor<int32, [4]> var_5046_end_0 = const()[name = string("op_5046_end_0"), val = tensor<int32, [4]>([1, 5, 128, 256])];
+            tensor<bool, [4]> var_5046_end_mask_0 = const()[name = string("op_5046_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_5046_cast_fp16 = slice_by_index(begin = var_5046_begin_0, end = var_5046_end_0, end_mask = var_5046_end_mask_0, x = key_heads_49_cast_fp16)[name = string("op_5046_cast_fp16")];
+            tensor<int32, [4]> var_5050_begin_0 = const()[name = string("op_5050_begin_0"), val = tensor<int32, [4]>([0, 4, 0, 0])];
+            tensor<int32, [4]> var_5050_end_0 = const()[name = string("op_5050_end_0"), val = tensor<int32, [4]>([1, 5, 128, 256])];
+            tensor<bool, [4]> var_5050_end_mask_0 = const()[name = string("op_5050_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_5050_cast_fp16 = slice_by_index(begin = var_5050_begin_0, end = var_5050_end_0, end_mask = var_5050_end_mask_0, x = value_heads_49_cast_fp16)[name = string("op_5050_cast_fp16")];
+            tensor<int32, [4]> var_5062_begin_0 = const()[name = string("op_5062_begin_0"), val = tensor<int32, [4]>([0, 5, 0, 0])];
+            tensor<int32, [4]> var_5062_end_0 = const()[name = string("op_5062_end_0"), val = tensor<int32, [4]>([1, 6, 128, 256])];
+            tensor<bool, [4]> var_5062_end_mask_0 = const()[name = string("op_5062_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_5062_cast_fp16 = slice_by_index(begin = var_5062_begin_0, end = var_5062_end_0, end_mask = var_5062_end_mask_0, x = key_heads_49_cast_fp16)[name = string("op_5062_cast_fp16")];
+            tensor<int32, [4]> var_5066_begin_0 = const()[name = string("op_5066_begin_0"), val = tensor<int32, [4]>([0, 5, 0, 0])];
+            tensor<int32, [4]> var_5066_end_0 = const()[name = string("op_5066_end_0"), val = tensor<int32, [4]>([1, 6, 128, 256])];
+            tensor<bool, [4]> var_5066_end_mask_0 = const()[name = string("op_5066_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_5066_cast_fp16 = slice_by_index(begin = var_5066_begin_0, end = var_5066_end_0, end_mask = var_5066_end_mask_0, x = value_heads_49_cast_fp16)[name = string("op_5066_cast_fp16")];
+            tensor<int32, [4]> var_5078_begin_0 = const()[name = string("op_5078_begin_0"), val = tensor<int32, [4]>([0, 6, 0, 0])];
+            tensor<int32, [4]> var_5078_end_0 = const()[name = string("op_5078_end_0"), val = tensor<int32, [4]>([1, 7, 128, 256])];
+            tensor<bool, [4]> var_5078_end_mask_0 = const()[name = string("op_5078_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_5078_cast_fp16 = slice_by_index(begin = var_5078_begin_0, end = var_5078_end_0, end_mask = var_5078_end_mask_0, x = key_heads_49_cast_fp16)[name = string("op_5078_cast_fp16")];
+            tensor<int32, [4]> var_5082_begin_0 = const()[name = string("op_5082_begin_0"), val = tensor<int32, [4]>([0, 6, 0, 0])];
+            tensor<int32, [4]> var_5082_end_0 = const()[name = string("op_5082_end_0"), val = tensor<int32, [4]>([1, 7, 128, 256])];
+            tensor<bool, [4]> var_5082_end_mask_0 = const()[name = string("op_5082_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_5082_cast_fp16 = slice_by_index(begin = var_5082_begin_0, end = var_5082_end_0, end_mask = var_5082_end_mask_0, x = value_heads_49_cast_fp16)[name = string("op_5082_cast_fp16")];
+            tensor<int32, [4]> var_5094_begin_0 = const()[name = string("op_5094_begin_0"), val = tensor<int32, [4]>([0, 7, 0, 0])];
+            tensor<int32, [4]> var_5094_end_0 = const()[name = string("op_5094_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_5094_end_mask_0 = const()[name = string("op_5094_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_5094_cast_fp16 = slice_by_index(begin = var_5094_begin_0, end = var_5094_end_0, end_mask = var_5094_end_mask_0, x = key_heads_49_cast_fp16)[name = string("op_5094_cast_fp16")];
+            tensor<int32, [4]> var_5098_begin_0 = const()[name = string("op_5098_begin_0"), val = tensor<int32, [4]>([0, 7, 0, 0])];
+            tensor<int32, [4]> var_5098_end_0 = const()[name = string("op_5098_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_5098_end_mask_0 = const()[name = string("op_5098_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_5098_cast_fp16 = slice_by_index(begin = var_5098_begin_0, end = var_5098_end_0, end_mask = var_5098_end_mask_0, x = value_heads_49_cast_fp16)[name = string("op_5098_cast_fp16")];
+            bool key_heads_51_interleave_0 = const()[name = string("key_heads_51_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 256]> key_heads_51_cast_fp16 = concat(axis = var_4824, interleave = key_heads_51_interleave_0, values = (var_4982_cast_fp16, var_4982_cast_fp16, var_4998_cast_fp16, var_4998_cast_fp16, var_5014_cast_fp16, var_5014_cast_fp16, var_5030_cast_fp16, var_5030_cast_fp16, var_5046_cast_fp16, var_5046_cast_fp16, var_5062_cast_fp16, var_5062_cast_fp16, var_5078_cast_fp16, var_5078_cast_fp16, var_5094_cast_fp16, var_5094_cast_fp16))[name = string("key_heads_51_cast_fp16")];
+            bool value_heads_51_interleave_0 = const()[name = string("value_heads_51_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 256]> value_heads_51_cast_fp16 = concat(axis = var_4824, interleave = value_heads_51_interleave_0, values = (var_4986_cast_fp16, var_4986_cast_fp16, var_5002_cast_fp16, var_5002_cast_fp16, var_5018_cast_fp16, var_5018_cast_fp16, var_5034_cast_fp16, var_5034_cast_fp16, var_5050_cast_fp16, var_5050_cast_fp16, var_5066_cast_fp16, var_5066_cast_fp16, var_5082_cast_fp16, var_5082_cast_fp16, var_5098_cast_fp16, var_5098_cast_fp16))[name = string("value_heads_51_cast_fp16")];
+            fp16 var_5121_to_fp16 = const()[name = string("op_5121_to_fp16"), val = fp16(0x1.6ap-4)];
+            tensor<fp16, [1, 16, 128, 1]> var_5122_cast_fp16 = mul(x = mh_q_75_cast_fp16, y = var_5121_to_fp16)[name = string("op_5122_cast_fp16")];
+            bool mh_w_49_transpose_x_0 = const()[name = string("mh_w_49_transpose_x_0"), val = bool(true)];
+            bool mh_w_49_transpose_y_0 = const()[name = string("mh_w_49_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 1, 256]> mh_w_49_cast_fp16 = matmul(transpose_x = mh_w_49_transpose_x_0, transpose_y = mh_w_49_transpose_y_0, x = var_5122_cast_fp16, y = key_heads_51_cast_fp16)[name = string("mh_w_49_cast_fp16")];
+            tensor<fp16, [1, 16, 1, 256]> mh_w_51_cast_fp16 = add(x = mh_w_49_cast_fp16, y = var_487_cast_fp16)[name = string("mh_w_51_cast_fp16")];
+            tensor<fp16, [1, 16, 1, 256]> var_5134_cast_fp16 = softmax(axis = var_4806, x = mh_w_51_cast_fp16)[name = string("op_5134_cast_fp16")];
+            bool attn_25_transpose_x_0 = const()[name = string("attn_25_transpose_x_0"), val = bool(false)];
+            bool attn_25_transpose_y_0 = const()[name = string("attn_25_transpose_y_0"), val = bool(true)];
+            tensor<fp16, [1, 16, 128, 1]> attn_25_cast_fp16 = matmul(transpose_x = attn_25_transpose_x_0, transpose_y = attn_25_transpose_y_0, x = value_heads_51_cast_fp16, y = var_5134_cast_fp16)[name = string("attn_25_cast_fp16")];
+            tensor<int32, [4]> var_5139 = const()[name = string("op_5139"), val = tensor<int32, [4]>([1, -1, 1, 1])];
+            tensor<fp16, [1, 2048, 1, 1]> input_97_cast_fp16 = reshape(shape = var_5139, x = attn_25_cast_fp16)[name = string("input_97_cast_fp16")];
+            string obj_107_pad_type_0 = const()[name = string("obj_107_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> obj_107_strides_0 = const()[name = string("obj_107_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> obj_107_pad_0 = const()[name = string("obj_107_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> obj_107_dilations_0 = const()[name = string("obj_107_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 obj_107_groups_0 = const()[name = string("obj_107_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 2048, 1, 1]> layers_12_self_attn_o_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(193128320))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(195225536))))[name = string("layers_12_self_attn_o_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> obj_107_cast_fp16 = conv(bias = layers_0_self_attn_v_proj_bias_to_fp16, dilations = obj_107_dilations_0, groups = obj_107_groups_0, pad = obj_107_pad_0, pad_type = obj_107_pad_type_0, strides = obj_107_strides_0, weight = layers_12_self_attn_o_proj_weight_to_fp16_palettized, x = input_97_cast_fp16)[name = string("obj_107_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> inputs_101_cast_fp16 = add(x = inputs_95_cast_fp16, y = obj_107_cast_fp16)[name = string("inputs_101_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> inputs_sq_103_cast_fp16 = mul(x = inputs_101_cast_fp16, y = inputs_101_cast_fp16)[name = string("inputs_sq_103_cast_fp16")];
+            tensor<int32, [1]> variance_103_axes_0 = const()[name = string("variance_103_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_103_keep_dims_0 = const()[name = string("variance_103_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 1, 1, 1]> variance_103_cast_fp16 = reduce_mean(axes = variance_103_axes_0, keep_dims = variance_103_keep_dims_0, x = inputs_sq_103_cast_fp16)[name = string("variance_103_cast_fp16")];
+            fp16 var_5157_to_fp16 = const()[name = string("op_5157_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 1, 1]> var_5158_cast_fp16 = add(x = variance_103_cast_fp16, y = var_5157_to_fp16)[name = string("op_5158_cast_fp16")];
+            fp32 var_5159_epsilon_0 = const()[name = string("op_5159_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 1, 1, 1]> var_5159_cast_fp16 = rsqrt(epsilon = var_5159_epsilon_0, x = var_5158_cast_fp16)[name = string("op_5159_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> hidden_states_127_cast_fp16 = mul(x = inputs_101_cast_fp16, y = var_5159_cast_fp16)[name = string("hidden_states_127_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> w_103_to_fp16 = const()[name = string("w_103_to_fp16"), val = tensor<fp16, [1, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(195226112)))];
+            tensor<fp16, [1, 1024, 1, 1]> input_99_cast_fp16 = mul(x = w_103_to_fp16, y = hidden_states_127_cast_fp16)[name = string("input_99_cast_fp16")];
+            string input_101_pad_type_0 = const()[name = string("input_101_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> input_101_strides_0 = const()[name = string("input_101_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> input_101_pad_0 = const()[name = string("input_101_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> input_101_dilations_0 = const()[name = string("input_101_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 input_101_groups_0 = const()[name = string("input_101_groups_0"), val = int32(1)];
+            tensor<fp16, [3072, 1024, 1, 1]> layers_12_mlp_gate_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [3072, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(195228224))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(198374016))))[name = string("layers_12_mlp_gate_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 3072, 1, 1]> input_101_cast_fp16 = conv(dilations = input_101_dilations_0, groups = input_101_groups_0, pad = input_101_pad_0, pad_type = input_101_pad_type_0, strides = input_101_strides_0, weight = layers_12_mlp_gate_proj_weight_to_fp16_palettized, x = input_99_cast_fp16)[name = string("input_101_cast_fp16")];
+            tensor<fp16, [1, 3072, 1, 1]> var_5173_cast_fp16 = silu(x = input_101_cast_fp16)[name = string("op_5173_cast_fp16")];
+            string var_5179_pad_type_0 = const()[name = string("op_5179_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_5179_strides_0 = const()[name = string("op_5179_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_5179_pad_0 = const()[name = string("op_5179_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_5179_dilations_0 = const()[name = string("op_5179_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_5179_groups_0 = const()[name = string("op_5179_groups_0"), val = int32(1)];
+            tensor<fp16, [3072, 1024, 1, 1]> layers_12_mlp_up_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [3072, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(198374592))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(201520384))))[name = string("layers_12_mlp_up_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 3072, 1, 1]> var_5179_cast_fp16 = conv(dilations = var_5179_dilations_0, groups = var_5179_groups_0, pad = var_5179_pad_0, pad_type = var_5179_pad_type_0, strides = var_5179_strides_0, weight = layers_12_mlp_up_proj_weight_to_fp16_palettized, x = input_99_cast_fp16)[name = string("op_5179_cast_fp16")];
+            tensor<fp16, [1, 3072, 1, 1]> input_103_cast_fp16 = mul(x = var_5173_cast_fp16, y = var_5179_cast_fp16)[name = string("input_103_cast_fp16")];
+            string hidden_states_129_pad_type_0 = const()[name = string("hidden_states_129_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> hidden_states_129_strides_0 = const()[name = string("hidden_states_129_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> hidden_states_129_pad_0 = const()[name = string("hidden_states_129_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> hidden_states_129_dilations_0 = const()[name = string("hidden_states_129_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 hidden_states_129_groups_0 = const()[name = string("hidden_states_129_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 3072, 1, 1]> layers_12_mlp_down_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 3072, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(201520960))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(204666752))))[name = string("layers_12_mlp_down_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> hidden_states_129_cast_fp16 = conv(dilations = hidden_states_129_dilations_0, groups = hidden_states_129_groups_0, pad = hidden_states_129_pad_0, pad_type = hidden_states_129_pad_type_0, strides = hidden_states_129_strides_0, weight = layers_12_mlp_down_proj_weight_to_fp16_palettized, x = input_103_cast_fp16)[name = string("hidden_states_129_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> inputs_103_cast_fp16 = add(x = inputs_101_cast_fp16, y = hidden_states_129_cast_fp16)[name = string("inputs_103_cast_fp16")];
+            int32 var_5193 = const()[name = string("op_5193"), val = int32(3)];
+            int32 var_5203 = const()[name = string("op_5203"), val = int32(-2)];
+            int32 var_5211 = const()[name = string("op_5211"), val = int32(1)];
+            tensor<fp16, [1, 1024, 1, 1]> inputs_sq_105_cast_fp16 = mul(x = inputs_103_cast_fp16, y = inputs_103_cast_fp16)[name = string("inputs_sq_105_cast_fp16")];
+            tensor<int32, [1]> variance_105_axes_0 = const()[name = string("variance_105_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_105_keep_dims_0 = const()[name = string("variance_105_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 1, 1, 1]> variance_105_cast_fp16 = reduce_mean(axes = variance_105_axes_0, keep_dims = variance_105_keep_dims_0, x = inputs_sq_105_cast_fp16)[name = string("variance_105_cast_fp16")];
+            fp16 var_5223_to_fp16 = const()[name = string("op_5223_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 1, 1]> var_5224_cast_fp16 = add(x = variance_105_cast_fp16, y = var_5223_to_fp16)[name = string("op_5224_cast_fp16")];
+            fp32 var_5225_epsilon_0 = const()[name = string("op_5225_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 1, 1, 1]> var_5225_cast_fp16 = rsqrt(epsilon = var_5225_epsilon_0, x = var_5224_cast_fp16)[name = string("op_5225_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> hidden_states_131_cast_fp16 = mul(x = inputs_103_cast_fp16, y = var_5225_cast_fp16)[name = string("hidden_states_131_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> w_105_to_fp16 = const()[name = string("w_105_to_fp16"), val = tensor<fp16, [1, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(204667328)))];
+            tensor<fp16, [1, 1024, 1, 1]> obj_109_cast_fp16 = mul(x = w_105_to_fp16, y = hidden_states_131_cast_fp16)[name = string("obj_109_cast_fp16")];
+            string query_79_pad_type_0 = const()[name = string("query_79_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> query_79_strides_0 = const()[name = string("query_79_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> query_79_pad_0 = const()[name = string("query_79_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> query_79_dilations_0 = const()[name = string("query_79_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 query_79_groups_0 = const()[name = string("query_79_groups_0"), val = int32(1)];
+            tensor<fp16, [2048, 1024, 1, 1]> layers_13_self_attn_q_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [2048, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(204669440))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(206766656))))[name = string("layers_13_self_attn_q_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1, 1]> query_79_cast_fp16 = conv(bias = layers_0_self_attn_q_proj_bias_to_fp16, dilations = query_79_dilations_0, groups = query_79_groups_0, pad = query_79_pad_0, pad_type = query_79_pad_type_0, strides = query_79_strides_0, weight = layers_13_self_attn_q_proj_weight_to_fp16_palettized, x = obj_109_cast_fp16)[name = string("query_79_cast_fp16")];
+            string current_key_53_pad_type_0 = const()[name = string("current_key_53_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> current_key_53_strides_0 = const()[name = string("current_key_53_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> current_key_53_pad_0 = const()[name = string("current_key_53_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> current_key_53_dilations_0 = const()[name = string("current_key_53_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 current_key_53_groups_0 = const()[name = string("current_key_53_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_13_self_attn_k_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(206767232))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(207815872))))[name = string("layers_13_self_attn_k_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> current_key_53_cast_fp16 = conv(dilations = current_key_53_dilations_0, groups = current_key_53_groups_0, pad = current_key_53_pad_0, pad_type = current_key_53_pad_type_0, strides = current_key_53_strides_0, weight = layers_13_self_attn_k_proj_weight_to_fp16_palettized, x = obj_109_cast_fp16)[name = string("current_key_53_cast_fp16")];
+            string current_value_27_pad_type_0 = const()[name = string("current_value_27_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> current_value_27_strides_0 = const()[name = string("current_value_27_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> current_value_27_pad_0 = const()[name = string("current_value_27_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> current_value_27_dilations_0 = const()[name = string("current_value_27_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 current_value_27_groups_0 = const()[name = string("current_value_27_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_13_self_attn_v_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(207816448))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(208865088))))[name = string("layers_13_self_attn_v_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> current_value_27_cast_fp16 = conv(bias = layers_0_self_attn_v_proj_bias_to_fp16, dilations = current_value_27_dilations_0, groups = current_value_27_groups_0, pad = current_value_27_pad_0, pad_type = current_value_27_pad_type_0, strides = current_value_27_strides_0, weight = layers_13_self_attn_v_proj_weight_to_fp16_palettized, x = obj_109_cast_fp16)[name = string("current_value_27_cast_fp16")];
+            tensor<int32, [4]> var_5262 = const()[name = string("op_5262"), val = tensor<int32, [4]>([16, 128, 1, 1])];
+            tensor<fp16, [16, 128, 1, 1]> inputs_105_cast_fp16 = reshape(shape = var_5262, x = query_79_cast_fp16)[name = string("inputs_105_cast_fp16")];
+            tensor<fp16, [16, 128, 1, 1]> inputs_sq_107_cast_fp16 = mul(x = inputs_105_cast_fp16, y = inputs_105_cast_fp16)[name = string("inputs_sq_107_cast_fp16")];
+            tensor<int32, [1]> variance_107_axes_0 = const()[name = string("variance_107_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_107_keep_dims_0 = const()[name = string("variance_107_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [16, 1, 1, 1]> variance_107_cast_fp16 = reduce_mean(axes = variance_107_axes_0, keep_dims = variance_107_keep_dims_0, x = inputs_sq_107_cast_fp16)[name = string("variance_107_cast_fp16")];
+            fp16 var_5268_to_fp16 = const()[name = string("op_5268_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [16, 1, 1, 1]> var_5269_cast_fp16 = add(x = variance_107_cast_fp16, y = var_5268_to_fp16)[name = string("op_5269_cast_fp16")];
+            fp32 var_5270_epsilon_0 = const()[name = string("op_5270_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [16, 1, 1, 1]> var_5270_cast_fp16 = rsqrt(epsilon = var_5270_epsilon_0, x = var_5269_cast_fp16)[name = string("op_5270_cast_fp16")];
+            tensor<fp16, [16, 128, 1, 1]> hidden_states_133_cast_fp16 = mul(x = inputs_105_cast_fp16, y = var_5270_cast_fp16)[name = string("hidden_states_133_cast_fp16")];
+            tensor<fp16, [1, 128, 1, 1]> w_107_to_fp16 = const()[name = string("w_107_to_fp16"), val = tensor<fp16, [1, 128, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(208865664)))];
+            tensor<fp16, [16, 128, 1, 1]> query_normed_27_cast_fp16 = mul(x = w_107_to_fp16, y = hidden_states_133_cast_fp16)[name = string("query_normed_27_cast_fp16")];
+            tensor<int32, [4]> var_5278 = const()[name = string("op_5278"), val = tensor<int32, [4]>([8, 128, 1, 1])];
+            tensor<fp16, [8, 128, 1, 1]> inputs_107_cast_fp16 = reshape(shape = var_5278, x = current_key_53_cast_fp16)[name = string("inputs_107_cast_fp16")];
+            tensor<fp16, [8, 128, 1, 1]> inputs_sq_109_cast_fp16 = mul(x = inputs_107_cast_fp16, y = inputs_107_cast_fp16)[name = string("inputs_sq_109_cast_fp16")];
+            tensor<int32, [1]> variance_109_axes_0 = const()[name = string("variance_109_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_109_keep_dims_0 = const()[name = string("variance_109_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [8, 1, 1, 1]> variance_109_cast_fp16 = reduce_mean(axes = variance_109_axes_0, keep_dims = variance_109_keep_dims_0, x = inputs_sq_109_cast_fp16)[name = string("variance_109_cast_fp16")];
+            fp16 var_5284_to_fp16 = const()[name = string("op_5284_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [8, 1, 1, 1]> var_5285_cast_fp16 = add(x = variance_109_cast_fp16, y = var_5284_to_fp16)[name = string("op_5285_cast_fp16")];
+            fp32 var_5286_epsilon_0 = const()[name = string("op_5286_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [8, 1, 1, 1]> var_5286_cast_fp16 = rsqrt(epsilon = var_5286_epsilon_0, x = var_5285_cast_fp16)[name = string("op_5286_cast_fp16")];
+            tensor<fp16, [8, 128, 1, 1]> hidden_states_135_cast_fp16 = mul(x = inputs_107_cast_fp16, y = var_5286_cast_fp16)[name = string("hidden_states_135_cast_fp16")];
+            tensor<fp16, [1, 128, 1, 1]> w_109_to_fp16 = const()[name = string("w_109_to_fp16"), val = tensor<fp16, [1, 128, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(208865984)))];
+            tensor<fp16, [8, 128, 1, 1]> current_key_normed_27_cast_fp16 = mul(x = w_109_to_fp16, y = hidden_states_135_cast_fp16)[name = string("current_key_normed_27_cast_fp16")];
+            tensor<int32, [4]> var_5304 = const()[name = string("op_5304"), val = tensor<int32, [4]>([1, 16, 128, -1])];
+            tensor<fp16, [1, 16, 128, 1]> mh_q_79_cast_fp16 = reshape(shape = var_5304, x = query_normed_27_cast_fp16)[name = string("mh_q_79_cast_fp16")];
+            tensor<int32, [4]> var_5306 = const()[name = string("op_5306"), val = tensor<int32, [4]>([1, 8, 128, -1])];
+            tensor<fp16, [1, 8, 128, 1]> mh_k_53_cast_fp16 = reshape(shape = var_5306, x = current_key_normed_27_cast_fp16)[name = string("mh_k_53_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> var_5310_cast_fp16 = mul(x = mh_q_79_cast_fp16, y = cos_1_cast_fp16)[name = string("op_5310_cast_fp16")];
+            tensor<int32, [4]> var_5315_begin_0 = const()[name = string("op_5315_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_5315_end_0 = const()[name = string("op_5315_end_0"), val = tensor<int32, [4]>([1, 16, 64, 1])];
+            tensor<bool, [4]> var_5315_end_mask_0 = const()[name = string("op_5315_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 16, 64, 1]> var_5315_cast_fp16 = slice_by_index(begin = var_5315_begin_0, end = var_5315_end_0, end_mask = var_5315_end_mask_0, x = mh_q_79_cast_fp16)[name = string("op_5315_cast_fp16")];
+            tensor<int32, [4]> var_5321_begin_0 = const()[name = string("op_5321_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_5321_end_0 = const()[name = string("op_5321_end_0"), val = tensor<int32, [4]>([1, 16, 128, 1])];
+            tensor<bool, [4]> var_5321_end_mask_0 = const()[name = string("op_5321_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 16, 64, 1]> var_5321_cast_fp16 = slice_by_index(begin = var_5321_begin_0, end = var_5321_end_0, end_mask = var_5321_end_mask_0, x = mh_q_79_cast_fp16)[name = string("op_5321_cast_fp16")];
+            fp16 const_316_promoted_to_fp16 = const()[name = string("const_316_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 16, 64, 1]> var_5323_cast_fp16 = mul(x = var_5321_cast_fp16, y = const_316_promoted_to_fp16)[name = string("op_5323_cast_fp16")];
+            bool var_5325_interleave_0 = const()[name = string("op_5325_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 1]> var_5325_cast_fp16 = concat(axis = var_5203, interleave = var_5325_interleave_0, values = (var_5323_cast_fp16, var_5315_cast_fp16))[name = string("op_5325_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> var_5326_cast_fp16 = mul(x = var_5325_cast_fp16, y = sin_1_cast_fp16)[name = string("op_5326_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> mh_q_81_cast_fp16 = add(x = var_5310_cast_fp16, y = var_5326_cast_fp16)[name = string("mh_q_81_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> var_5328_cast_fp16 = mul(x = mh_k_53_cast_fp16, y = cos_1_cast_fp16)[name = string("op_5328_cast_fp16")];
+            tensor<int32, [4]> var_5333_begin_0 = const()[name = string("op_5333_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_5333_end_0 = const()[name = string("op_5333_end_0"), val = tensor<int32, [4]>([1, 8, 64, 1])];
+            tensor<bool, [4]> var_5333_end_mask_0 = const()[name = string("op_5333_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 8, 64, 1]> var_5333_cast_fp16 = slice_by_index(begin = var_5333_begin_0, end = var_5333_end_0, end_mask = var_5333_end_mask_0, x = mh_k_53_cast_fp16)[name = string("op_5333_cast_fp16")];
+            tensor<int32, [4]> var_5339_begin_0 = const()[name = string("op_5339_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_5339_end_0 = const()[name = string("op_5339_end_0"), val = tensor<int32, [4]>([1, 8, 128, 1])];
+            tensor<bool, [4]> var_5339_end_mask_0 = const()[name = string("op_5339_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 8, 64, 1]> var_5339_cast_fp16 = slice_by_index(begin = var_5339_begin_0, end = var_5339_end_0, end_mask = var_5339_end_mask_0, x = mh_k_53_cast_fp16)[name = string("op_5339_cast_fp16")];
+            fp16 const_319_promoted_to_fp16 = const()[name = string("const_319_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 64, 1]> var_5341_cast_fp16 = mul(x = var_5339_cast_fp16, y = const_319_promoted_to_fp16)[name = string("op_5341_cast_fp16")];
+            bool var_5343_interleave_0 = const()[name = string("op_5343_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 128, 1]> var_5343_cast_fp16 = concat(axis = var_5203, interleave = var_5343_interleave_0, values = (var_5341_cast_fp16, var_5333_cast_fp16))[name = string("op_5343_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> var_5344_cast_fp16 = mul(x = var_5343_cast_fp16, y = sin_1_cast_fp16)[name = string("op_5344_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> mh_k_55_cast_fp16 = add(x = var_5328_cast_fp16, y = var_5344_cast_fp16)[name = string("mh_k_55_cast_fp16")];
+            tensor<int32, [4]> var_5348 = const()[name = string("op_5348"), val = tensor<int32, [4]>([1, 1024, 1, 1])];
+            tensor<fp16, [1, 1024, 1, 1]> current_key_55_cast_fp16 = reshape(shape = var_5348, x = mh_k_55_cast_fp16)[name = string("current_key_55_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_5355_cast_fp16 = mul(x = var_101_cast_fp16_13, y = var_323_cast_fp16)[name = string("op_5355_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_5356_cast_fp16 = mul(x = current_key_55_cast_fp16, y = var_321_cast_fp16)[name = string("op_5356_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> key_81_cast_fp16 = add(x = var_5355_cast_fp16, y = var_5356_cast_fp16)[name = string("key_81_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_5359_cast_fp16 = mul(x = var_132_cast_fp16_13, y = var_323_cast_fp16)[name = string("op_5359_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_5360_cast_fp16 = mul(x = current_value_27_cast_fp16, y = var_321_cast_fp16)[name = string("op_5360_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> value_53_cast_fp16 = add(x = var_5359_cast_fp16, y = var_5360_cast_fp16)[name = string("value_53_cast_fp16")];
+            tensor<int32, [4]> var_5364 = const()[name = string("op_5364"), val = tensor<int32, [4]>([1, 8, 128, 256])];
+            tensor<fp16, [1, 8, 128, 256]> key_heads_53_cast_fp16 = reshape(shape = var_5364, x = key_81_cast_fp16)[name = string("key_heads_53_cast_fp16")];
+            tensor<int32, [4]> var_5366 = const()[name = string("op_5366"), val = tensor<int32, [4]>([1, 8, 128, 256])];
+            tensor<fp16, [1, 8, 128, 256]> value_heads_53_cast_fp16 = reshape(shape = var_5366, x = value_53_cast_fp16)[name = string("value_heads_53_cast_fp16")];
+            tensor<int32, [4]> var_5369_begin_0 = const()[name = string("op_5369_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_5369_end_0 = const()[name = string("op_5369_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_5369_end_mask_0 = const()[name = string("op_5369_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_5369_cast_fp16 = slice_by_index(begin = var_5369_begin_0, end = var_5369_end_0, end_mask = var_5369_end_mask_0, x = key_heads_53_cast_fp16)[name = string("op_5369_cast_fp16")];
+            tensor<int32, [4]> var_5373_begin_0 = const()[name = string("op_5373_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_5373_end_0 = const()[name = string("op_5373_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_5373_end_mask_0 = const()[name = string("op_5373_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_5373_cast_fp16 = slice_by_index(begin = var_5373_begin_0, end = var_5373_end_0, end_mask = var_5373_end_mask_0, x = value_heads_53_cast_fp16)[name = string("op_5373_cast_fp16")];
+            tensor<int32, [4]> var_5385_begin_0 = const()[name = string("op_5385_begin_0"), val = tensor<int32, [4]>([0, 1, 0, 0])];
+            tensor<int32, [4]> var_5385_end_0 = const()[name = string("op_5385_end_0"), val = tensor<int32, [4]>([1, 2, 128, 256])];
+            tensor<bool, [4]> var_5385_end_mask_0 = const()[name = string("op_5385_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_5385_cast_fp16 = slice_by_index(begin = var_5385_begin_0, end = var_5385_end_0, end_mask = var_5385_end_mask_0, x = key_heads_53_cast_fp16)[name = string("op_5385_cast_fp16")];
+            tensor<int32, [4]> var_5389_begin_0 = const()[name = string("op_5389_begin_0"), val = tensor<int32, [4]>([0, 1, 0, 0])];
+            tensor<int32, [4]> var_5389_end_0 = const()[name = string("op_5389_end_0"), val = tensor<int32, [4]>([1, 2, 128, 256])];
+            tensor<bool, [4]> var_5389_end_mask_0 = const()[name = string("op_5389_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_5389_cast_fp16 = slice_by_index(begin = var_5389_begin_0, end = var_5389_end_0, end_mask = var_5389_end_mask_0, x = value_heads_53_cast_fp16)[name = string("op_5389_cast_fp16")];
+            tensor<int32, [4]> var_5401_begin_0 = const()[name = string("op_5401_begin_0"), val = tensor<int32, [4]>([0, 2, 0, 0])];
+            tensor<int32, [4]> var_5401_end_0 = const()[name = string("op_5401_end_0"), val = tensor<int32, [4]>([1, 3, 128, 256])];
+            tensor<bool, [4]> var_5401_end_mask_0 = const()[name = string("op_5401_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_5401_cast_fp16 = slice_by_index(begin = var_5401_begin_0, end = var_5401_end_0, end_mask = var_5401_end_mask_0, x = key_heads_53_cast_fp16)[name = string("op_5401_cast_fp16")];
+            tensor<int32, [4]> var_5405_begin_0 = const()[name = string("op_5405_begin_0"), val = tensor<int32, [4]>([0, 2, 0, 0])];
+            tensor<int32, [4]> var_5405_end_0 = const()[name = string("op_5405_end_0"), val = tensor<int32, [4]>([1, 3, 128, 256])];
+            tensor<bool, [4]> var_5405_end_mask_0 = const()[name = string("op_5405_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_5405_cast_fp16 = slice_by_index(begin = var_5405_begin_0, end = var_5405_end_0, end_mask = var_5405_end_mask_0, x = value_heads_53_cast_fp16)[name = string("op_5405_cast_fp16")];
+            tensor<int32, [4]> var_5417_begin_0 = const()[name = string("op_5417_begin_0"), val = tensor<int32, [4]>([0, 3, 0, 0])];
+            tensor<int32, [4]> var_5417_end_0 = const()[name = string("op_5417_end_0"), val = tensor<int32, [4]>([1, 4, 128, 256])];
+            tensor<bool, [4]> var_5417_end_mask_0 = const()[name = string("op_5417_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_5417_cast_fp16 = slice_by_index(begin = var_5417_begin_0, end = var_5417_end_0, end_mask = var_5417_end_mask_0, x = key_heads_53_cast_fp16)[name = string("op_5417_cast_fp16")];
+            tensor<int32, [4]> var_5421_begin_0 = const()[name = string("op_5421_begin_0"), val = tensor<int32, [4]>([0, 3, 0, 0])];
+            tensor<int32, [4]> var_5421_end_0 = const()[name = string("op_5421_end_0"), val = tensor<int32, [4]>([1, 4, 128, 256])];
+            tensor<bool, [4]> var_5421_end_mask_0 = const()[name = string("op_5421_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_5421_cast_fp16 = slice_by_index(begin = var_5421_begin_0, end = var_5421_end_0, end_mask = var_5421_end_mask_0, x = value_heads_53_cast_fp16)[name = string("op_5421_cast_fp16")];
+            tensor<int32, [4]> var_5433_begin_0 = const()[name = string("op_5433_begin_0"), val = tensor<int32, [4]>([0, 4, 0, 0])];
+            tensor<int32, [4]> var_5433_end_0 = const()[name = string("op_5433_end_0"), val = tensor<int32, [4]>([1, 5, 128, 256])];
+            tensor<bool, [4]> var_5433_end_mask_0 = const()[name = string("op_5433_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_5433_cast_fp16 = slice_by_index(begin = var_5433_begin_0, end = var_5433_end_0, end_mask = var_5433_end_mask_0, x = key_heads_53_cast_fp16)[name = string("op_5433_cast_fp16")];
+            tensor<int32, [4]> var_5437_begin_0 = const()[name = string("op_5437_begin_0"), val = tensor<int32, [4]>([0, 4, 0, 0])];
+            tensor<int32, [4]> var_5437_end_0 = const()[name = string("op_5437_end_0"), val = tensor<int32, [4]>([1, 5, 128, 256])];
+            tensor<bool, [4]> var_5437_end_mask_0 = const()[name = string("op_5437_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_5437_cast_fp16 = slice_by_index(begin = var_5437_begin_0, end = var_5437_end_0, end_mask = var_5437_end_mask_0, x = value_heads_53_cast_fp16)[name = string("op_5437_cast_fp16")];
+            tensor<int32, [4]> var_5449_begin_0 = const()[name = string("op_5449_begin_0"), val = tensor<int32, [4]>([0, 5, 0, 0])];
+            tensor<int32, [4]> var_5449_end_0 = const()[name = string("op_5449_end_0"), val = tensor<int32, [4]>([1, 6, 128, 256])];
+            tensor<bool, [4]> var_5449_end_mask_0 = const()[name = string("op_5449_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_5449_cast_fp16 = slice_by_index(begin = var_5449_begin_0, end = var_5449_end_0, end_mask = var_5449_end_mask_0, x = key_heads_53_cast_fp16)[name = string("op_5449_cast_fp16")];
+            tensor<int32, [4]> var_5453_begin_0 = const()[name = string("op_5453_begin_0"), val = tensor<int32, [4]>([0, 5, 0, 0])];
+            tensor<int32, [4]> var_5453_end_0 = const()[name = string("op_5453_end_0"), val = tensor<int32, [4]>([1, 6, 128, 256])];
+            tensor<bool, [4]> var_5453_end_mask_0 = const()[name = string("op_5453_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_5453_cast_fp16 = slice_by_index(begin = var_5453_begin_0, end = var_5453_end_0, end_mask = var_5453_end_mask_0, x = value_heads_53_cast_fp16)[name = string("op_5453_cast_fp16")];
+            tensor<int32, [4]> var_5465_begin_0 = const()[name = string("op_5465_begin_0"), val = tensor<int32, [4]>([0, 6, 0, 0])];
+            tensor<int32, [4]> var_5465_end_0 = const()[name = string("op_5465_end_0"), val = tensor<int32, [4]>([1, 7, 128, 256])];
+            tensor<bool, [4]> var_5465_end_mask_0 = const()[name = string("op_5465_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_5465_cast_fp16 = slice_by_index(begin = var_5465_begin_0, end = var_5465_end_0, end_mask = var_5465_end_mask_0, x = key_heads_53_cast_fp16)[name = string("op_5465_cast_fp16")];
+            tensor<int32, [4]> var_5469_begin_0 = const()[name = string("op_5469_begin_0"), val = tensor<int32, [4]>([0, 6, 0, 0])];
+            tensor<int32, [4]> var_5469_end_0 = const()[name = string("op_5469_end_0"), val = tensor<int32, [4]>([1, 7, 128, 256])];
+            tensor<bool, [4]> var_5469_end_mask_0 = const()[name = string("op_5469_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_5469_cast_fp16 = slice_by_index(begin = var_5469_begin_0, end = var_5469_end_0, end_mask = var_5469_end_mask_0, x = value_heads_53_cast_fp16)[name = string("op_5469_cast_fp16")];
+            tensor<int32, [4]> var_5481_begin_0 = const()[name = string("op_5481_begin_0"), val = tensor<int32, [4]>([0, 7, 0, 0])];
+            tensor<int32, [4]> var_5481_end_0 = const()[name = string("op_5481_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_5481_end_mask_0 = const()[name = string("op_5481_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_5481_cast_fp16 = slice_by_index(begin = var_5481_begin_0, end = var_5481_end_0, end_mask = var_5481_end_mask_0, x = key_heads_53_cast_fp16)[name = string("op_5481_cast_fp16")];
+            tensor<int32, [4]> var_5485_begin_0 = const()[name = string("op_5485_begin_0"), val = tensor<int32, [4]>([0, 7, 0, 0])];
+            tensor<int32, [4]> var_5485_end_0 = const()[name = string("op_5485_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_5485_end_mask_0 = const()[name = string("op_5485_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_5485_cast_fp16 = slice_by_index(begin = var_5485_begin_0, end = var_5485_end_0, end_mask = var_5485_end_mask_0, x = value_heads_53_cast_fp16)[name = string("op_5485_cast_fp16")];
+            bool key_heads_55_interleave_0 = const()[name = string("key_heads_55_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 256]> key_heads_55_cast_fp16 = concat(axis = var_5211, interleave = key_heads_55_interleave_0, values = (var_5369_cast_fp16, var_5369_cast_fp16, var_5385_cast_fp16, var_5385_cast_fp16, var_5401_cast_fp16, var_5401_cast_fp16, var_5417_cast_fp16, var_5417_cast_fp16, var_5433_cast_fp16, var_5433_cast_fp16, var_5449_cast_fp16, var_5449_cast_fp16, var_5465_cast_fp16, var_5465_cast_fp16, var_5481_cast_fp16, var_5481_cast_fp16))[name = string("key_heads_55_cast_fp16")];
+            bool value_heads_55_interleave_0 = const()[name = string("value_heads_55_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 256]> value_heads_55_cast_fp16 = concat(axis = var_5211, interleave = value_heads_55_interleave_0, values = (var_5373_cast_fp16, var_5373_cast_fp16, var_5389_cast_fp16, var_5389_cast_fp16, var_5405_cast_fp16, var_5405_cast_fp16, var_5421_cast_fp16, var_5421_cast_fp16, var_5437_cast_fp16, var_5437_cast_fp16, var_5453_cast_fp16, var_5453_cast_fp16, var_5469_cast_fp16, var_5469_cast_fp16, var_5485_cast_fp16, var_5485_cast_fp16))[name = string("value_heads_55_cast_fp16")];
+            fp16 var_5508_to_fp16 = const()[name = string("op_5508_to_fp16"), val = fp16(0x1.6ap-4)];
+            tensor<fp16, [1, 16, 128, 1]> var_5509_cast_fp16 = mul(x = mh_q_81_cast_fp16, y = var_5508_to_fp16)[name = string("op_5509_cast_fp16")];
+            bool mh_w_53_transpose_x_0 = const()[name = string("mh_w_53_transpose_x_0"), val = bool(true)];
+            bool mh_w_53_transpose_y_0 = const()[name = string("mh_w_53_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 1, 256]> mh_w_53_cast_fp16 = matmul(transpose_x = mh_w_53_transpose_x_0, transpose_y = mh_w_53_transpose_y_0, x = var_5509_cast_fp16, y = key_heads_55_cast_fp16)[name = string("mh_w_53_cast_fp16")];
+            tensor<fp16, [1, 16, 1, 256]> mh_w_55_cast_fp16 = add(x = mh_w_53_cast_fp16, y = var_487_cast_fp16)[name = string("mh_w_55_cast_fp16")];
+            tensor<fp16, [1, 16, 1, 256]> var_5521_cast_fp16 = softmax(axis = var_5193, x = mh_w_55_cast_fp16)[name = string("op_5521_cast_fp16")];
+            bool attn_27_transpose_x_0 = const()[name = string("attn_27_transpose_x_0"), val = bool(false)];
+            bool attn_27_transpose_y_0 = const()[name = string("attn_27_transpose_y_0"), val = bool(true)];
+            tensor<fp16, [1, 16, 128, 1]> attn_27_cast_fp16 = matmul(transpose_x = attn_27_transpose_x_0, transpose_y = attn_27_transpose_y_0, x = value_heads_55_cast_fp16, y = var_5521_cast_fp16)[name = string("attn_27_cast_fp16")];
+            tensor<int32, [4]> var_5526 = const()[name = string("op_5526"), val = tensor<int32, [4]>([1, -1, 1, 1])];
+            tensor<fp16, [1, 2048, 1, 1]> input_105_cast_fp16 = reshape(shape = var_5526, x = attn_27_cast_fp16)[name = string("input_105_cast_fp16")];
+            string obj_115_pad_type_0 = const()[name = string("obj_115_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> obj_115_strides_0 = const()[name = string("obj_115_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> obj_115_pad_0 = const()[name = string("obj_115_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> obj_115_dilations_0 = const()[name = string("obj_115_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 obj_115_groups_0 = const()[name = string("obj_115_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 2048, 1, 1]> layers_13_self_attn_o_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(208866304))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(210963520))))[name = string("layers_13_self_attn_o_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> obj_115_cast_fp16 = conv(bias = layers_0_self_attn_v_proj_bias_to_fp16, dilations = obj_115_dilations_0, groups = obj_115_groups_0, pad = obj_115_pad_0, pad_type = obj_115_pad_type_0, strides = obj_115_strides_0, weight = layers_13_self_attn_o_proj_weight_to_fp16_palettized, x = input_105_cast_fp16)[name = string("obj_115_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> inputs_109_cast_fp16 = add(x = inputs_103_cast_fp16, y = obj_115_cast_fp16)[name = string("inputs_109_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> inputs_sq_111_cast_fp16 = mul(x = inputs_109_cast_fp16, y = inputs_109_cast_fp16)[name = string("inputs_sq_111_cast_fp16")];
+            tensor<int32, [1]> variance_111_axes_0 = const()[name = string("variance_111_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_111_keep_dims_0 = const()[name = string("variance_111_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 1, 1, 1]> variance_111_cast_fp16 = reduce_mean(axes = variance_111_axes_0, keep_dims = variance_111_keep_dims_0, x = inputs_sq_111_cast_fp16)[name = string("variance_111_cast_fp16")];
+            fp16 var_5544_to_fp16 = const()[name = string("op_5544_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 1, 1]> var_5545_cast_fp16 = add(x = variance_111_cast_fp16, y = var_5544_to_fp16)[name = string("op_5545_cast_fp16")];
+            fp32 var_5546_epsilon_0 = const()[name = string("op_5546_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 1, 1, 1]> var_5546_cast_fp16 = rsqrt(epsilon = var_5546_epsilon_0, x = var_5545_cast_fp16)[name = string("op_5546_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> hidden_states_137_cast_fp16 = mul(x = inputs_109_cast_fp16, y = var_5546_cast_fp16)[name = string("hidden_states_137_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> w_111_to_fp16 = const()[name = string("w_111_to_fp16"), val = tensor<fp16, [1, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(210964096)))];
+            tensor<fp16, [1, 1024, 1, 1]> input_107_cast_fp16 = mul(x = w_111_to_fp16, y = hidden_states_137_cast_fp16)[name = string("input_107_cast_fp16")];
+            string input_109_pad_type_0 = const()[name = string("input_109_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> input_109_strides_0 = const()[name = string("input_109_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> input_109_pad_0 = const()[name = string("input_109_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> input_109_dilations_0 = const()[name = string("input_109_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 input_109_groups_0 = const()[name = string("input_109_groups_0"), val = int32(1)];
+            tensor<fp16, [3072, 1024, 1, 1]> layers_13_mlp_gate_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [3072, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(210966208))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(214112000))))[name = string("layers_13_mlp_gate_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 3072, 1, 1]> input_109_cast_fp16 = conv(dilations = input_109_dilations_0, groups = input_109_groups_0, pad = input_109_pad_0, pad_type = input_109_pad_type_0, strides = input_109_strides_0, weight = layers_13_mlp_gate_proj_weight_to_fp16_palettized, x = input_107_cast_fp16)[name = string("input_109_cast_fp16")];
+            tensor<fp16, [1, 3072, 1, 1]> var_5560_cast_fp16 = silu(x = input_109_cast_fp16)[name = string("op_5560_cast_fp16")];
+            string var_5566_pad_type_0 = const()[name = string("op_5566_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_5566_strides_0 = const()[name = string("op_5566_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_5566_pad_0 = const()[name = string("op_5566_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_5566_dilations_0 = const()[name = string("op_5566_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_5566_groups_0 = const()[name = string("op_5566_groups_0"), val = int32(1)];
+            tensor<fp16, [3072, 1024, 1, 1]> layers_13_mlp_up_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [3072, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(214112576))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(217258368))))[name = string("layers_13_mlp_up_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 3072, 1, 1]> var_5566_cast_fp16 = conv(dilations = var_5566_dilations_0, groups = var_5566_groups_0, pad = var_5566_pad_0, pad_type = var_5566_pad_type_0, strides = var_5566_strides_0, weight = layers_13_mlp_up_proj_weight_to_fp16_palettized, x = input_107_cast_fp16)[name = string("op_5566_cast_fp16")];
+            tensor<fp16, [1, 3072, 1, 1]> input_111_cast_fp16 = mul(x = var_5560_cast_fp16, y = var_5566_cast_fp16)[name = string("input_111_cast_fp16")];
+            string hidden_states_139_pad_type_0 = const()[name = string("hidden_states_139_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> hidden_states_139_strides_0 = const()[name = string("hidden_states_139_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> hidden_states_139_pad_0 = const()[name = string("hidden_states_139_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> hidden_states_139_dilations_0 = const()[name = string("hidden_states_139_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 hidden_states_139_groups_0 = const()[name = string("hidden_states_139_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 3072, 1, 1]> layers_13_mlp_down_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 3072, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(217258944))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(220404736))))[name = string("layers_13_mlp_down_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> hidden_states_139_cast_fp16 = conv(dilations = hidden_states_139_dilations_0, groups = hidden_states_139_groups_0, pad = hidden_states_139_pad_0, pad_type = hidden_states_139_pad_type_0, strides = hidden_states_139_strides_0, weight = layers_13_mlp_down_proj_weight_to_fp16_palettized, x = input_111_cast_fp16)[name = string("hidden_states_139_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> inputs_111_cast_fp16 = add(x = inputs_109_cast_fp16, y = hidden_states_139_cast_fp16)[name = string("inputs_111_cast_fp16")];
+            int32 var_5580 = const()[name = string("op_5580"), val = int32(3)];
+            int32 var_5590 = const()[name = string("op_5590"), val = int32(-2)];
+            int32 var_5598 = const()[name = string("op_5598"), val = int32(1)];
+            tensor<fp16, [1, 1024, 1, 1]> inputs_sq_113_cast_fp16 = mul(x = inputs_111_cast_fp16, y = inputs_111_cast_fp16)[name = string("inputs_sq_113_cast_fp16")];
+            tensor<int32, [1]> variance_113_axes_0 = const()[name = string("variance_113_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_113_keep_dims_0 = const()[name = string("variance_113_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 1, 1, 1]> variance_113_cast_fp16 = reduce_mean(axes = variance_113_axes_0, keep_dims = variance_113_keep_dims_0, x = inputs_sq_113_cast_fp16)[name = string("variance_113_cast_fp16")];
+            fp16 var_5610_to_fp16 = const()[name = string("op_5610_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 1, 1]> var_5611_cast_fp16 = add(x = variance_113_cast_fp16, y = var_5610_to_fp16)[name = string("op_5611_cast_fp16")];
+            fp32 var_5612_epsilon_0 = const()[name = string("op_5612_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 1, 1, 1]> var_5612_cast_fp16 = rsqrt(epsilon = var_5612_epsilon_0, x = var_5611_cast_fp16)[name = string("op_5612_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> hidden_states_141_cast_fp16 = mul(x = inputs_111_cast_fp16, y = var_5612_cast_fp16)[name = string("hidden_states_141_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> w_113_to_fp16 = const()[name = string("w_113_to_fp16"), val = tensor<fp16, [1, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(220405312)))];
+            tensor<fp16, [1, 1024, 1, 1]> obj_117_cast_fp16 = mul(x = w_113_to_fp16, y = hidden_states_141_cast_fp16)[name = string("obj_117_cast_fp16")];
+            string query_85_pad_type_0 = const()[name = string("query_85_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> query_85_strides_0 = const()[name = string("query_85_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> query_85_pad_0 = const()[name = string("query_85_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> query_85_dilations_0 = const()[name = string("query_85_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 query_85_groups_0 = const()[name = string("query_85_groups_0"), val = int32(1)];
+            tensor<fp16, [2048, 1024, 1, 1]> layers_14_self_attn_q_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [2048, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(220407424))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(222504640))))[name = string("layers_14_self_attn_q_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1, 1]> query_85_cast_fp16 = conv(bias = layers_0_self_attn_q_proj_bias_to_fp16, dilations = query_85_dilations_0, groups = query_85_groups_0, pad = query_85_pad_0, pad_type = query_85_pad_type_0, strides = query_85_strides_0, weight = layers_14_self_attn_q_proj_weight_to_fp16_palettized, x = obj_117_cast_fp16)[name = string("query_85_cast_fp16")];
+            string current_key_57_pad_type_0 = const()[name = string("current_key_57_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> current_key_57_strides_0 = const()[name = string("current_key_57_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> current_key_57_pad_0 = const()[name = string("current_key_57_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> current_key_57_dilations_0 = const()[name = string("current_key_57_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 current_key_57_groups_0 = const()[name = string("current_key_57_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_14_self_attn_k_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(222505216))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(223553856))))[name = string("layers_14_self_attn_k_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> current_key_57_cast_fp16 = conv(dilations = current_key_57_dilations_0, groups = current_key_57_groups_0, pad = current_key_57_pad_0, pad_type = current_key_57_pad_type_0, strides = current_key_57_strides_0, weight = layers_14_self_attn_k_proj_weight_to_fp16_palettized, x = obj_117_cast_fp16)[name = string("current_key_57_cast_fp16")];
+            string current_value_29_pad_type_0 = const()[name = string("current_value_29_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> current_value_29_strides_0 = const()[name = string("current_value_29_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> current_value_29_pad_0 = const()[name = string("current_value_29_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> current_value_29_dilations_0 = const()[name = string("current_value_29_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 current_value_29_groups_0 = const()[name = string("current_value_29_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_14_self_attn_v_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(223554432))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(224603072))))[name = string("layers_14_self_attn_v_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> current_value_29_cast_fp16 = conv(bias = layers_0_self_attn_v_proj_bias_to_fp16, dilations = current_value_29_dilations_0, groups = current_value_29_groups_0, pad = current_value_29_pad_0, pad_type = current_value_29_pad_type_0, strides = current_value_29_strides_0, weight = layers_14_self_attn_v_proj_weight_to_fp16_palettized, x = obj_117_cast_fp16)[name = string("current_value_29_cast_fp16")];
+            tensor<int32, [4]> var_5649 = const()[name = string("op_5649"), val = tensor<int32, [4]>([16, 128, 1, 1])];
+            tensor<fp16, [16, 128, 1, 1]> inputs_113_cast_fp16 = reshape(shape = var_5649, x = query_85_cast_fp16)[name = string("inputs_113_cast_fp16")];
+            tensor<fp16, [16, 128, 1, 1]> inputs_sq_115_cast_fp16 = mul(x = inputs_113_cast_fp16, y = inputs_113_cast_fp16)[name = string("inputs_sq_115_cast_fp16")];
+            tensor<int32, [1]> variance_115_axes_0 = const()[name = string("variance_115_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_115_keep_dims_0 = const()[name = string("variance_115_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [16, 1, 1, 1]> variance_115_cast_fp16 = reduce_mean(axes = variance_115_axes_0, keep_dims = variance_115_keep_dims_0, x = inputs_sq_115_cast_fp16)[name = string("variance_115_cast_fp16")];
+            fp16 var_5655_to_fp16 = const()[name = string("op_5655_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [16, 1, 1, 1]> var_5656_cast_fp16 = add(x = variance_115_cast_fp16, y = var_5655_to_fp16)[name = string("op_5656_cast_fp16")];
+            fp32 var_5657_epsilon_0 = const()[name = string("op_5657_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [16, 1, 1, 1]> var_5657_cast_fp16 = rsqrt(epsilon = var_5657_epsilon_0, x = var_5656_cast_fp16)[name = string("op_5657_cast_fp16")];
+            tensor<fp16, [16, 128, 1, 1]> hidden_states_143_cast_fp16 = mul(x = inputs_113_cast_fp16, y = var_5657_cast_fp16)[name = string("hidden_states_143_cast_fp16")];
+            tensor<fp16, [1, 128, 1, 1]> w_115_to_fp16 = const()[name = string("w_115_to_fp16"), val = tensor<fp16, [1, 128, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(224603648)))];
+            tensor<fp16, [16, 128, 1, 1]> query_normed_29_cast_fp16 = mul(x = w_115_to_fp16, y = hidden_states_143_cast_fp16)[name = string("query_normed_29_cast_fp16")];
+            tensor<int32, [4]> var_5665 = const()[name = string("op_5665"), val = tensor<int32, [4]>([8, 128, 1, 1])];
+            tensor<fp16, [8, 128, 1, 1]> inputs_115_cast_fp16 = reshape(shape = var_5665, x = current_key_57_cast_fp16)[name = string("inputs_115_cast_fp16")];
+            tensor<fp16, [8, 128, 1, 1]> inputs_sq_117_cast_fp16 = mul(x = inputs_115_cast_fp16, y = inputs_115_cast_fp16)[name = string("inputs_sq_117_cast_fp16")];
+            tensor<int32, [1]> variance_117_axes_0 = const()[name = string("variance_117_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_117_keep_dims_0 = const()[name = string("variance_117_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [8, 1, 1, 1]> variance_117_cast_fp16 = reduce_mean(axes = variance_117_axes_0, keep_dims = variance_117_keep_dims_0, x = inputs_sq_117_cast_fp16)[name = string("variance_117_cast_fp16")];
+            fp16 var_5671_to_fp16 = const()[name = string("op_5671_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [8, 1, 1, 1]> var_5672_cast_fp16 = add(x = variance_117_cast_fp16, y = var_5671_to_fp16)[name = string("op_5672_cast_fp16")];
+            fp32 var_5673_epsilon_0 = const()[name = string("op_5673_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [8, 1, 1, 1]> var_5673_cast_fp16 = rsqrt(epsilon = var_5673_epsilon_0, x = var_5672_cast_fp16)[name = string("op_5673_cast_fp16")];
+            tensor<fp16, [8, 128, 1, 1]> hidden_states_145_cast_fp16 = mul(x = inputs_115_cast_fp16, y = var_5673_cast_fp16)[name = string("hidden_states_145_cast_fp16")];
+            tensor<fp16, [1, 128, 1, 1]> w_117_to_fp16 = const()[name = string("w_117_to_fp16"), val = tensor<fp16, [1, 128, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(224603968)))];
+            tensor<fp16, [8, 128, 1, 1]> current_key_normed_29_cast_fp16 = mul(x = w_117_to_fp16, y = hidden_states_145_cast_fp16)[name = string("current_key_normed_29_cast_fp16")];
+            tensor<int32, [4]> var_5691 = const()[name = string("op_5691"), val = tensor<int32, [4]>([1, 16, 128, -1])];
+            tensor<fp16, [1, 16, 128, 1]> mh_q_85_cast_fp16 = reshape(shape = var_5691, x = query_normed_29_cast_fp16)[name = string("mh_q_85_cast_fp16")];
+            tensor<int32, [4]> var_5693 = const()[name = string("op_5693"), val = tensor<int32, [4]>([1, 8, 128, -1])];
+            tensor<fp16, [1, 8, 128, 1]> mh_k_57_cast_fp16 = reshape(shape = var_5693, x = current_key_normed_29_cast_fp16)[name = string("mh_k_57_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> var_5697_cast_fp16 = mul(x = mh_q_85_cast_fp16, y = cos_1_cast_fp16)[name = string("op_5697_cast_fp16")];
+            tensor<int32, [4]> var_5702_begin_0 = const()[name = string("op_5702_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_5702_end_0 = const()[name = string("op_5702_end_0"), val = tensor<int32, [4]>([1, 16, 64, 1])];
+            tensor<bool, [4]> var_5702_end_mask_0 = const()[name = string("op_5702_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 16, 64, 1]> var_5702_cast_fp16 = slice_by_index(begin = var_5702_begin_0, end = var_5702_end_0, end_mask = var_5702_end_mask_0, x = mh_q_85_cast_fp16)[name = string("op_5702_cast_fp16")];
+            tensor<int32, [4]> var_5708_begin_0 = const()[name = string("op_5708_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_5708_end_0 = const()[name = string("op_5708_end_0"), val = tensor<int32, [4]>([1, 16, 128, 1])];
+            tensor<bool, [4]> var_5708_end_mask_0 = const()[name = string("op_5708_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 16, 64, 1]> var_5708_cast_fp16 = slice_by_index(begin = var_5708_begin_0, end = var_5708_end_0, end_mask = var_5708_end_mask_0, x = mh_q_85_cast_fp16)[name = string("op_5708_cast_fp16")];
+            fp16 const_339_promoted_to_fp16 = const()[name = string("const_339_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 16, 64, 1]> var_5710_cast_fp16 = mul(x = var_5708_cast_fp16, y = const_339_promoted_to_fp16)[name = string("op_5710_cast_fp16")];
+            bool var_5712_interleave_0 = const()[name = string("op_5712_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 1]> var_5712_cast_fp16 = concat(axis = var_5590, interleave = var_5712_interleave_0, values = (var_5710_cast_fp16, var_5702_cast_fp16))[name = string("op_5712_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> var_5713_cast_fp16 = mul(x = var_5712_cast_fp16, y = sin_1_cast_fp16)[name = string("op_5713_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> mh_q_87_cast_fp16 = add(x = var_5697_cast_fp16, y = var_5713_cast_fp16)[name = string("mh_q_87_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> var_5715_cast_fp16 = mul(x = mh_k_57_cast_fp16, y = cos_1_cast_fp16)[name = string("op_5715_cast_fp16")];
+            tensor<int32, [4]> var_5720_begin_0 = const()[name = string("op_5720_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_5720_end_0 = const()[name = string("op_5720_end_0"), val = tensor<int32, [4]>([1, 8, 64, 1])];
+            tensor<bool, [4]> var_5720_end_mask_0 = const()[name = string("op_5720_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 8, 64, 1]> var_5720_cast_fp16 = slice_by_index(begin = var_5720_begin_0, end = var_5720_end_0, end_mask = var_5720_end_mask_0, x = mh_k_57_cast_fp16)[name = string("op_5720_cast_fp16")];
+            tensor<int32, [4]> var_5726_begin_0 = const()[name = string("op_5726_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_5726_end_0 = const()[name = string("op_5726_end_0"), val = tensor<int32, [4]>([1, 8, 128, 1])];
+            tensor<bool, [4]> var_5726_end_mask_0 = const()[name = string("op_5726_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 8, 64, 1]> var_5726_cast_fp16 = slice_by_index(begin = var_5726_begin_0, end = var_5726_end_0, end_mask = var_5726_end_mask_0, x = mh_k_57_cast_fp16)[name = string("op_5726_cast_fp16")];
+            fp16 const_342_promoted_to_fp16 = const()[name = string("const_342_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 64, 1]> var_5728_cast_fp16 = mul(x = var_5726_cast_fp16, y = const_342_promoted_to_fp16)[name = string("op_5728_cast_fp16")];
+            bool var_5730_interleave_0 = const()[name = string("op_5730_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 128, 1]> var_5730_cast_fp16 = concat(axis = var_5590, interleave = var_5730_interleave_0, values = (var_5728_cast_fp16, var_5720_cast_fp16))[name = string("op_5730_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> var_5731_cast_fp16 = mul(x = var_5730_cast_fp16, y = sin_1_cast_fp16)[name = string("op_5731_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> mh_k_59_cast_fp16 = add(x = var_5715_cast_fp16, y = var_5731_cast_fp16)[name = string("mh_k_59_cast_fp16")];
+            tensor<int32, [4]> var_5735 = const()[name = string("op_5735"), val = tensor<int32, [4]>([1, 1024, 1, 1])];
+            tensor<fp16, [1, 1024, 1, 1]> current_key_59_cast_fp16 = reshape(shape = var_5735, x = mh_k_59_cast_fp16)[name = string("current_key_59_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_5742_cast_fp16 = mul(x = var_101_cast_fp16_14, y = var_323_cast_fp16)[name = string("op_5742_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_5743_cast_fp16 = mul(x = current_key_59_cast_fp16, y = var_321_cast_fp16)[name = string("op_5743_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> key_87_cast_fp16 = add(x = var_5742_cast_fp16, y = var_5743_cast_fp16)[name = string("key_87_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_5746_cast_fp16 = mul(x = var_132_cast_fp16_14, y = var_323_cast_fp16)[name = string("op_5746_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_5747_cast_fp16 = mul(x = current_value_29_cast_fp16, y = var_321_cast_fp16)[name = string("op_5747_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> value_57_cast_fp16 = add(x = var_5746_cast_fp16, y = var_5747_cast_fp16)[name = string("value_57_cast_fp16")];
+            tensor<int32, [4]> var_5751 = const()[name = string("op_5751"), val = tensor<int32, [4]>([1, 8, 128, 256])];
+            tensor<fp16, [1, 8, 128, 256]> key_heads_57_cast_fp16 = reshape(shape = var_5751, x = key_87_cast_fp16)[name = string("key_heads_57_cast_fp16")];
+            tensor<int32, [4]> var_5753 = const()[name = string("op_5753"), val = tensor<int32, [4]>([1, 8, 128, 256])];
+            tensor<fp16, [1, 8, 128, 256]> value_heads_57_cast_fp16 = reshape(shape = var_5753, x = value_57_cast_fp16)[name = string("value_heads_57_cast_fp16")];
+            tensor<int32, [4]> var_5756_begin_0 = const()[name = string("op_5756_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_5756_end_0 = const()[name = string("op_5756_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_5756_end_mask_0 = const()[name = string("op_5756_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_5756_cast_fp16 = slice_by_index(begin = var_5756_begin_0, end = var_5756_end_0, end_mask = var_5756_end_mask_0, x = key_heads_57_cast_fp16)[name = string("op_5756_cast_fp16")];
+            tensor<int32, [4]> var_5760_begin_0 = const()[name = string("op_5760_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_5760_end_0 = const()[name = string("op_5760_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_5760_end_mask_0 = const()[name = string("op_5760_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_5760_cast_fp16 = slice_by_index(begin = var_5760_begin_0, end = var_5760_end_0, end_mask = var_5760_end_mask_0, x = value_heads_57_cast_fp16)[name = string("op_5760_cast_fp16")];
+            tensor<int32, [4]> var_5772_begin_0 = const()[name = string("op_5772_begin_0"), val = tensor<int32, [4]>([0, 1, 0, 0])];
+            tensor<int32, [4]> var_5772_end_0 = const()[name = string("op_5772_end_0"), val = tensor<int32, [4]>([1, 2, 128, 256])];
+            tensor<bool, [4]> var_5772_end_mask_0 = const()[name = string("op_5772_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_5772_cast_fp16 = slice_by_index(begin = var_5772_begin_0, end = var_5772_end_0, end_mask = var_5772_end_mask_0, x = key_heads_57_cast_fp16)[name = string("op_5772_cast_fp16")];
+            tensor<int32, [4]> var_5776_begin_0 = const()[name = string("op_5776_begin_0"), val = tensor<int32, [4]>([0, 1, 0, 0])];
+            tensor<int32, [4]> var_5776_end_0 = const()[name = string("op_5776_end_0"), val = tensor<int32, [4]>([1, 2, 128, 256])];
+            tensor<bool, [4]> var_5776_end_mask_0 = const()[name = string("op_5776_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_5776_cast_fp16 = slice_by_index(begin = var_5776_begin_0, end = var_5776_end_0, end_mask = var_5776_end_mask_0, x = value_heads_57_cast_fp16)[name = string("op_5776_cast_fp16")];
+            tensor<int32, [4]> var_5788_begin_0 = const()[name = string("op_5788_begin_0"), val = tensor<int32, [4]>([0, 2, 0, 0])];
+            tensor<int32, [4]> var_5788_end_0 = const()[name = string("op_5788_end_0"), val = tensor<int32, [4]>([1, 3, 128, 256])];
+            tensor<bool, [4]> var_5788_end_mask_0 = const()[name = string("op_5788_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_5788_cast_fp16 = slice_by_index(begin = var_5788_begin_0, end = var_5788_end_0, end_mask = var_5788_end_mask_0, x = key_heads_57_cast_fp16)[name = string("op_5788_cast_fp16")];
+            tensor<int32, [4]> var_5792_begin_0 = const()[name = string("op_5792_begin_0"), val = tensor<int32, [4]>([0, 2, 0, 0])];
+            tensor<int32, [4]> var_5792_end_0 = const()[name = string("op_5792_end_0"), val = tensor<int32, [4]>([1, 3, 128, 256])];
+            tensor<bool, [4]> var_5792_end_mask_0 = const()[name = string("op_5792_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_5792_cast_fp16 = slice_by_index(begin = var_5792_begin_0, end = var_5792_end_0, end_mask = var_5792_end_mask_0, x = value_heads_57_cast_fp16)[name = string("op_5792_cast_fp16")];
+            tensor<int32, [4]> var_5804_begin_0 = const()[name = string("op_5804_begin_0"), val = tensor<int32, [4]>([0, 3, 0, 0])];
+            tensor<int32, [4]> var_5804_end_0 = const()[name = string("op_5804_end_0"), val = tensor<int32, [4]>([1, 4, 128, 256])];
+            tensor<bool, [4]> var_5804_end_mask_0 = const()[name = string("op_5804_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_5804_cast_fp16 = slice_by_index(begin = var_5804_begin_0, end = var_5804_end_0, end_mask = var_5804_end_mask_0, x = key_heads_57_cast_fp16)[name = string("op_5804_cast_fp16")];
+            tensor<int32, [4]> var_5808_begin_0 = const()[name = string("op_5808_begin_0"), val = tensor<int32, [4]>([0, 3, 0, 0])];
+            tensor<int32, [4]> var_5808_end_0 = const()[name = string("op_5808_end_0"), val = tensor<int32, [4]>([1, 4, 128, 256])];
+            tensor<bool, [4]> var_5808_end_mask_0 = const()[name = string("op_5808_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_5808_cast_fp16 = slice_by_index(begin = var_5808_begin_0, end = var_5808_end_0, end_mask = var_5808_end_mask_0, x = value_heads_57_cast_fp16)[name = string("op_5808_cast_fp16")];
+            tensor<int32, [4]> var_5820_begin_0 = const()[name = string("op_5820_begin_0"), val = tensor<int32, [4]>([0, 4, 0, 0])];
+            tensor<int32, [4]> var_5820_end_0 = const()[name = string("op_5820_end_0"), val = tensor<int32, [4]>([1, 5, 128, 256])];
+            tensor<bool, [4]> var_5820_end_mask_0 = const()[name = string("op_5820_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_5820_cast_fp16 = slice_by_index(begin = var_5820_begin_0, end = var_5820_end_0, end_mask = var_5820_end_mask_0, x = key_heads_57_cast_fp16)[name = string("op_5820_cast_fp16")];
+            tensor<int32, [4]> var_5824_begin_0 = const()[name = string("op_5824_begin_0"), val = tensor<int32, [4]>([0, 4, 0, 0])];
+            tensor<int32, [4]> var_5824_end_0 = const()[name = string("op_5824_end_0"), val = tensor<int32, [4]>([1, 5, 128, 256])];
+            tensor<bool, [4]> var_5824_end_mask_0 = const()[name = string("op_5824_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_5824_cast_fp16 = slice_by_index(begin = var_5824_begin_0, end = var_5824_end_0, end_mask = var_5824_end_mask_0, x = value_heads_57_cast_fp16)[name = string("op_5824_cast_fp16")];
+            tensor<int32, [4]> var_5836_begin_0 = const()[name = string("op_5836_begin_0"), val = tensor<int32, [4]>([0, 5, 0, 0])];
+            tensor<int32, [4]> var_5836_end_0 = const()[name = string("op_5836_end_0"), val = tensor<int32, [4]>([1, 6, 128, 256])];
+            tensor<bool, [4]> var_5836_end_mask_0 = const()[name = string("op_5836_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_5836_cast_fp16 = slice_by_index(begin = var_5836_begin_0, end = var_5836_end_0, end_mask = var_5836_end_mask_0, x = key_heads_57_cast_fp16)[name = string("op_5836_cast_fp16")];
+            tensor<int32, [4]> var_5840_begin_0 = const()[name = string("op_5840_begin_0"), val = tensor<int32, [4]>([0, 5, 0, 0])];
+            tensor<int32, [4]> var_5840_end_0 = const()[name = string("op_5840_end_0"), val = tensor<int32, [4]>([1, 6, 128, 256])];
+            tensor<bool, [4]> var_5840_end_mask_0 = const()[name = string("op_5840_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_5840_cast_fp16 = slice_by_index(begin = var_5840_begin_0, end = var_5840_end_0, end_mask = var_5840_end_mask_0, x = value_heads_57_cast_fp16)[name = string("op_5840_cast_fp16")];
+            tensor<int32, [4]> var_5852_begin_0 = const()[name = string("op_5852_begin_0"), val = tensor<int32, [4]>([0, 6, 0, 0])];
+            tensor<int32, [4]> var_5852_end_0 = const()[name = string("op_5852_end_0"), val = tensor<int32, [4]>([1, 7, 128, 256])];
+            tensor<bool, [4]> var_5852_end_mask_0 = const()[name = string("op_5852_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_5852_cast_fp16 = slice_by_index(begin = var_5852_begin_0, end = var_5852_end_0, end_mask = var_5852_end_mask_0, x = key_heads_57_cast_fp16)[name = string("op_5852_cast_fp16")];
+            tensor<int32, [4]> var_5856_begin_0 = const()[name = string("op_5856_begin_0"), val = tensor<int32, [4]>([0, 6, 0, 0])];
+            tensor<int32, [4]> var_5856_end_0 = const()[name = string("op_5856_end_0"), val = tensor<int32, [4]>([1, 7, 128, 256])];
+            tensor<bool, [4]> var_5856_end_mask_0 = const()[name = string("op_5856_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_5856_cast_fp16 = slice_by_index(begin = var_5856_begin_0, end = var_5856_end_0, end_mask = var_5856_end_mask_0, x = value_heads_57_cast_fp16)[name = string("op_5856_cast_fp16")];
+            tensor<int32, [4]> var_5868_begin_0 = const()[name = string("op_5868_begin_0"), val = tensor<int32, [4]>([0, 7, 0, 0])];
+            tensor<int32, [4]> var_5868_end_0 = const()[name = string("op_5868_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_5868_end_mask_0 = const()[name = string("op_5868_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_5868_cast_fp16 = slice_by_index(begin = var_5868_begin_0, end = var_5868_end_0, end_mask = var_5868_end_mask_0, x = key_heads_57_cast_fp16)[name = string("op_5868_cast_fp16")];
+            tensor<int32, [4]> var_5872_begin_0 = const()[name = string("op_5872_begin_0"), val = tensor<int32, [4]>([0, 7, 0, 0])];
+            tensor<int32, [4]> var_5872_end_0 = const()[name = string("op_5872_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_5872_end_mask_0 = const()[name = string("op_5872_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_5872_cast_fp16 = slice_by_index(begin = var_5872_begin_0, end = var_5872_end_0, end_mask = var_5872_end_mask_0, x = value_heads_57_cast_fp16)[name = string("op_5872_cast_fp16")];
+            bool key_heads_59_interleave_0 = const()[name = string("key_heads_59_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 256]> key_heads_59_cast_fp16 = concat(axis = var_5598, interleave = key_heads_59_interleave_0, values = (var_5756_cast_fp16, var_5756_cast_fp16, var_5772_cast_fp16, var_5772_cast_fp16, var_5788_cast_fp16, var_5788_cast_fp16, var_5804_cast_fp16, var_5804_cast_fp16, var_5820_cast_fp16, var_5820_cast_fp16, var_5836_cast_fp16, var_5836_cast_fp16, var_5852_cast_fp16, var_5852_cast_fp16, var_5868_cast_fp16, var_5868_cast_fp16))[name = string("key_heads_59_cast_fp16")];
+            bool value_heads_59_interleave_0 = const()[name = string("value_heads_59_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 256]> value_heads_59_cast_fp16 = concat(axis = var_5598, interleave = value_heads_59_interleave_0, values = (var_5760_cast_fp16, var_5760_cast_fp16, var_5776_cast_fp16, var_5776_cast_fp16, var_5792_cast_fp16, var_5792_cast_fp16, var_5808_cast_fp16, var_5808_cast_fp16, var_5824_cast_fp16, var_5824_cast_fp16, var_5840_cast_fp16, var_5840_cast_fp16, var_5856_cast_fp16, var_5856_cast_fp16, var_5872_cast_fp16, var_5872_cast_fp16))[name = string("value_heads_59_cast_fp16")];
+            fp16 var_5895_to_fp16 = const()[name = string("op_5895_to_fp16"), val = fp16(0x1.6ap-4)];
+            tensor<fp16, [1, 16, 128, 1]> var_5896_cast_fp16 = mul(x = mh_q_87_cast_fp16, y = var_5895_to_fp16)[name = string("op_5896_cast_fp16")];
+            bool mh_w_57_transpose_x_0 = const()[name = string("mh_w_57_transpose_x_0"), val = bool(true)];
+            bool mh_w_57_transpose_y_0 = const()[name = string("mh_w_57_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 1, 256]> mh_w_57_cast_fp16 = matmul(transpose_x = mh_w_57_transpose_x_0, transpose_y = mh_w_57_transpose_y_0, x = var_5896_cast_fp16, y = key_heads_59_cast_fp16)[name = string("mh_w_57_cast_fp16")];
+            tensor<fp16, [1, 16, 1, 256]> mh_w_59_cast_fp16 = add(x = mh_w_57_cast_fp16, y = var_487_cast_fp16)[name = string("mh_w_59_cast_fp16")];
+            tensor<fp16, [1, 16, 1, 256]> var_5908_cast_fp16 = softmax(axis = var_5580, x = mh_w_59_cast_fp16)[name = string("op_5908_cast_fp16")];
+            bool attn_29_transpose_x_0 = const()[name = string("attn_29_transpose_x_0"), val = bool(false)];
+            bool attn_29_transpose_y_0 = const()[name = string("attn_29_transpose_y_0"), val = bool(true)];
+            tensor<fp16, [1, 16, 128, 1]> attn_29_cast_fp16 = matmul(transpose_x = attn_29_transpose_x_0, transpose_y = attn_29_transpose_y_0, x = value_heads_59_cast_fp16, y = var_5908_cast_fp16)[name = string("attn_29_cast_fp16")];
+            tensor<int32, [4]> var_5913 = const()[name = string("op_5913"), val = tensor<int32, [4]>([1, -1, 1, 1])];
+            tensor<fp16, [1, 2048, 1, 1]> input_113_cast_fp16 = reshape(shape = var_5913, x = attn_29_cast_fp16)[name = string("input_113_cast_fp16")];
+            string obj_123_pad_type_0 = const()[name = string("obj_123_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> obj_123_strides_0 = const()[name = string("obj_123_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> obj_123_pad_0 = const()[name = string("obj_123_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> obj_123_dilations_0 = const()[name = string("obj_123_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 obj_123_groups_0 = const()[name = string("obj_123_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 2048, 1, 1]> layers_14_self_attn_o_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(224604288))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(226701504))))[name = string("layers_14_self_attn_o_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> obj_123_cast_fp16 = conv(bias = layers_0_self_attn_v_proj_bias_to_fp16, dilations = obj_123_dilations_0, groups = obj_123_groups_0, pad = obj_123_pad_0, pad_type = obj_123_pad_type_0, strides = obj_123_strides_0, weight = layers_14_self_attn_o_proj_weight_to_fp16_palettized, x = input_113_cast_fp16)[name = string("obj_123_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> inputs_117_cast_fp16 = add(x = inputs_111_cast_fp16, y = obj_123_cast_fp16)[name = string("inputs_117_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> inputs_sq_119_cast_fp16 = mul(x = inputs_117_cast_fp16, y = inputs_117_cast_fp16)[name = string("inputs_sq_119_cast_fp16")];
+            tensor<int32, [1]> variance_119_axes_0 = const()[name = string("variance_119_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_119_keep_dims_0 = const()[name = string("variance_119_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 1, 1, 1]> variance_119_cast_fp16 = reduce_mean(axes = variance_119_axes_0, keep_dims = variance_119_keep_dims_0, x = inputs_sq_119_cast_fp16)[name = string("variance_119_cast_fp16")];
+            fp16 var_5931_to_fp16 = const()[name = string("op_5931_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 1, 1]> var_5932_cast_fp16 = add(x = variance_119_cast_fp16, y = var_5931_to_fp16)[name = string("op_5932_cast_fp16")];
+            fp32 var_5933_epsilon_0 = const()[name = string("op_5933_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 1, 1, 1]> var_5933_cast_fp16 = rsqrt(epsilon = var_5933_epsilon_0, x = var_5932_cast_fp16)[name = string("op_5933_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> hidden_states_147_cast_fp16 = mul(x = inputs_117_cast_fp16, y = var_5933_cast_fp16)[name = string("hidden_states_147_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> w_119_to_fp16 = const()[name = string("w_119_to_fp16"), val = tensor<fp16, [1, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(226702080)))];
+            tensor<fp16, [1, 1024, 1, 1]> input_115_cast_fp16 = mul(x = w_119_to_fp16, y = hidden_states_147_cast_fp16)[name = string("input_115_cast_fp16")];
+            string input_117_pad_type_0 = const()[name = string("input_117_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> input_117_strides_0 = const()[name = string("input_117_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> input_117_pad_0 = const()[name = string("input_117_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> input_117_dilations_0 = const()[name = string("input_117_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 input_117_groups_0 = const()[name = string("input_117_groups_0"), val = int32(1)];
+            tensor<fp16, [3072, 1024, 1, 1]> layers_14_mlp_gate_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [3072, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(226704192))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(229849984))))[name = string("layers_14_mlp_gate_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 3072, 1, 1]> input_117_cast_fp16 = conv(dilations = input_117_dilations_0, groups = input_117_groups_0, pad = input_117_pad_0, pad_type = input_117_pad_type_0, strides = input_117_strides_0, weight = layers_14_mlp_gate_proj_weight_to_fp16_palettized, x = input_115_cast_fp16)[name = string("input_117_cast_fp16")];
+            tensor<fp16, [1, 3072, 1, 1]> var_5947_cast_fp16 = silu(x = input_117_cast_fp16)[name = string("op_5947_cast_fp16")];
+            string var_5953_pad_type_0 = const()[name = string("op_5953_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_5953_strides_0 = const()[name = string("op_5953_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_5953_pad_0 = const()[name = string("op_5953_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_5953_dilations_0 = const()[name = string("op_5953_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_5953_groups_0 = const()[name = string("op_5953_groups_0"), val = int32(1)];
+            tensor<fp16, [3072, 1024, 1, 1]> layers_14_mlp_up_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [3072, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(229850560))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(232996352))))[name = string("layers_14_mlp_up_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 3072, 1, 1]> var_5953_cast_fp16 = conv(dilations = var_5953_dilations_0, groups = var_5953_groups_0, pad = var_5953_pad_0, pad_type = var_5953_pad_type_0, strides = var_5953_strides_0, weight = layers_14_mlp_up_proj_weight_to_fp16_palettized, x = input_115_cast_fp16)[name = string("op_5953_cast_fp16")];
+            tensor<fp16, [1, 3072, 1, 1]> input_119_cast_fp16 = mul(x = var_5947_cast_fp16, y = var_5953_cast_fp16)[name = string("input_119_cast_fp16")];
+            string hidden_states_149_pad_type_0 = const()[name = string("hidden_states_149_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> hidden_states_149_strides_0 = const()[name = string("hidden_states_149_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> hidden_states_149_pad_0 = const()[name = string("hidden_states_149_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> hidden_states_149_dilations_0 = const()[name = string("hidden_states_149_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 hidden_states_149_groups_0 = const()[name = string("hidden_states_149_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 3072, 1, 1]> layers_14_mlp_down_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 3072, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(232996928))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(236142720))))[name = string("layers_14_mlp_down_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> hidden_states_149_cast_fp16 = conv(dilations = hidden_states_149_dilations_0, groups = hidden_states_149_groups_0, pad = hidden_states_149_pad_0, pad_type = hidden_states_149_pad_type_0, strides = hidden_states_149_strides_0, weight = layers_14_mlp_down_proj_weight_to_fp16_palettized, x = input_119_cast_fp16)[name = string("hidden_states_149_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> inputs_119_cast_fp16 = add(x = inputs_117_cast_fp16, y = hidden_states_149_cast_fp16)[name = string("inputs_119_cast_fp16")];
+            int32 var_5967 = const()[name = string("op_5967"), val = int32(3)];
+            int32 var_5977 = const()[name = string("op_5977"), val = int32(-2)];
+            int32 var_5985 = const()[name = string("op_5985"), val = int32(1)];
+            tensor<fp16, [1, 1024, 1, 1]> inputs_sq_121_cast_fp16 = mul(x = inputs_119_cast_fp16, y = inputs_119_cast_fp16)[name = string("inputs_sq_121_cast_fp16")];
+            tensor<int32, [1]> variance_121_axes_0 = const()[name = string("variance_121_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_121_keep_dims_0 = const()[name = string("variance_121_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 1, 1, 1]> variance_121_cast_fp16 = reduce_mean(axes = variance_121_axes_0, keep_dims = variance_121_keep_dims_0, x = inputs_sq_121_cast_fp16)[name = string("variance_121_cast_fp16")];
+            fp16 var_5997_to_fp16 = const()[name = string("op_5997_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 1, 1]> var_5998_cast_fp16 = add(x = variance_121_cast_fp16, y = var_5997_to_fp16)[name = string("op_5998_cast_fp16")];
+            fp32 var_5999_epsilon_0 = const()[name = string("op_5999_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 1, 1, 1]> var_5999_cast_fp16 = rsqrt(epsilon = var_5999_epsilon_0, x = var_5998_cast_fp16)[name = string("op_5999_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> hidden_states_151_cast_fp16 = mul(x = inputs_119_cast_fp16, y = var_5999_cast_fp16)[name = string("hidden_states_151_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> w_121_to_fp16 = const()[name = string("w_121_to_fp16"), val = tensor<fp16, [1, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(236143296)))];
+            tensor<fp16, [1, 1024, 1, 1]> obj_125_cast_fp16 = mul(x = w_121_to_fp16, y = hidden_states_151_cast_fp16)[name = string("obj_125_cast_fp16")];
+            string query_91_pad_type_0 = const()[name = string("query_91_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> query_91_strides_0 = const()[name = string("query_91_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> query_91_pad_0 = const()[name = string("query_91_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> query_91_dilations_0 = const()[name = string("query_91_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 query_91_groups_0 = const()[name = string("query_91_groups_0"), val = int32(1)];
+            tensor<fp16, [2048, 1024, 1, 1]> layers_15_self_attn_q_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [2048, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(236145408))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(238242624))))[name = string("layers_15_self_attn_q_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1, 1]> query_91_cast_fp16 = conv(bias = layers_0_self_attn_q_proj_bias_to_fp16, dilations = query_91_dilations_0, groups = query_91_groups_0, pad = query_91_pad_0, pad_type = query_91_pad_type_0, strides = query_91_strides_0, weight = layers_15_self_attn_q_proj_weight_to_fp16_palettized, x = obj_125_cast_fp16)[name = string("query_91_cast_fp16")];
+            string current_key_61_pad_type_0 = const()[name = string("current_key_61_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> current_key_61_strides_0 = const()[name = string("current_key_61_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> current_key_61_pad_0 = const()[name = string("current_key_61_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> current_key_61_dilations_0 = const()[name = string("current_key_61_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 current_key_61_groups_0 = const()[name = string("current_key_61_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_15_self_attn_k_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(238243200))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(239291840))))[name = string("layers_15_self_attn_k_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> current_key_61_cast_fp16 = conv(dilations = current_key_61_dilations_0, groups = current_key_61_groups_0, pad = current_key_61_pad_0, pad_type = current_key_61_pad_type_0, strides = current_key_61_strides_0, weight = layers_15_self_attn_k_proj_weight_to_fp16_palettized, x = obj_125_cast_fp16)[name = string("current_key_61_cast_fp16")];
+            string current_value_31_pad_type_0 = const()[name = string("current_value_31_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> current_value_31_strides_0 = const()[name = string("current_value_31_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> current_value_31_pad_0 = const()[name = string("current_value_31_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> current_value_31_dilations_0 = const()[name = string("current_value_31_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 current_value_31_groups_0 = const()[name = string("current_value_31_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_15_self_attn_v_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(239292416))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(240341056))))[name = string("layers_15_self_attn_v_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> current_value_31_cast_fp16 = conv(bias = layers_0_self_attn_v_proj_bias_to_fp16, dilations = current_value_31_dilations_0, groups = current_value_31_groups_0, pad = current_value_31_pad_0, pad_type = current_value_31_pad_type_0, strides = current_value_31_strides_0, weight = layers_15_self_attn_v_proj_weight_to_fp16_palettized, x = obj_125_cast_fp16)[name = string("current_value_31_cast_fp16")];
+            tensor<int32, [4]> var_6036 = const()[name = string("op_6036"), val = tensor<int32, [4]>([16, 128, 1, 1])];
+            tensor<fp16, [16, 128, 1, 1]> inputs_121_cast_fp16 = reshape(shape = var_6036, x = query_91_cast_fp16)[name = string("inputs_121_cast_fp16")];
+            tensor<fp16, [16, 128, 1, 1]> inputs_sq_123_cast_fp16 = mul(x = inputs_121_cast_fp16, y = inputs_121_cast_fp16)[name = string("inputs_sq_123_cast_fp16")];
+            tensor<int32, [1]> variance_123_axes_0 = const()[name = string("variance_123_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_123_keep_dims_0 = const()[name = string("variance_123_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [16, 1, 1, 1]> variance_123_cast_fp16 = reduce_mean(axes = variance_123_axes_0, keep_dims = variance_123_keep_dims_0, x = inputs_sq_123_cast_fp16)[name = string("variance_123_cast_fp16")];
+            fp16 var_6042_to_fp16 = const()[name = string("op_6042_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [16, 1, 1, 1]> var_6043_cast_fp16 = add(x = variance_123_cast_fp16, y = var_6042_to_fp16)[name = string("op_6043_cast_fp16")];
+            fp32 var_6044_epsilon_0 = const()[name = string("op_6044_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [16, 1, 1, 1]> var_6044_cast_fp16 = rsqrt(epsilon = var_6044_epsilon_0, x = var_6043_cast_fp16)[name = string("op_6044_cast_fp16")];
+            tensor<fp16, [16, 128, 1, 1]> hidden_states_153_cast_fp16 = mul(x = inputs_121_cast_fp16, y = var_6044_cast_fp16)[name = string("hidden_states_153_cast_fp16")];
+            tensor<fp16, [1, 128, 1, 1]> w_123_to_fp16 = const()[name = string("w_123_to_fp16"), val = tensor<fp16, [1, 128, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(240341632)))];
+            tensor<fp16, [16, 128, 1, 1]> query_normed_31_cast_fp16 = mul(x = w_123_to_fp16, y = hidden_states_153_cast_fp16)[name = string("query_normed_31_cast_fp16")];
+            tensor<int32, [4]> var_6052 = const()[name = string("op_6052"), val = tensor<int32, [4]>([8, 128, 1, 1])];
+            tensor<fp16, [8, 128, 1, 1]> inputs_123_cast_fp16 = reshape(shape = var_6052, x = current_key_61_cast_fp16)[name = string("inputs_123_cast_fp16")];
+            tensor<fp16, [8, 128, 1, 1]> inputs_sq_125_cast_fp16 = mul(x = inputs_123_cast_fp16, y = inputs_123_cast_fp16)[name = string("inputs_sq_125_cast_fp16")];
+            tensor<int32, [1]> variance_125_axes_0 = const()[name = string("variance_125_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_125_keep_dims_0 = const()[name = string("variance_125_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [8, 1, 1, 1]> variance_125_cast_fp16 = reduce_mean(axes = variance_125_axes_0, keep_dims = variance_125_keep_dims_0, x = inputs_sq_125_cast_fp16)[name = string("variance_125_cast_fp16")];
+            fp16 var_6058_to_fp16 = const()[name = string("op_6058_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [8, 1, 1, 1]> var_6059_cast_fp16 = add(x = variance_125_cast_fp16, y = var_6058_to_fp16)[name = string("op_6059_cast_fp16")];
+            fp32 var_6060_epsilon_0 = const()[name = string("op_6060_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [8, 1, 1, 1]> var_6060_cast_fp16 = rsqrt(epsilon = var_6060_epsilon_0, x = var_6059_cast_fp16)[name = string("op_6060_cast_fp16")];
+            tensor<fp16, [8, 128, 1, 1]> hidden_states_155_cast_fp16 = mul(x = inputs_123_cast_fp16, y = var_6060_cast_fp16)[name = string("hidden_states_155_cast_fp16")];
+            tensor<fp16, [1, 128, 1, 1]> w_125_to_fp16 = const()[name = string("w_125_to_fp16"), val = tensor<fp16, [1, 128, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(240341952)))];
+            tensor<fp16, [8, 128, 1, 1]> current_key_normed_31_cast_fp16 = mul(x = w_125_to_fp16, y = hidden_states_155_cast_fp16)[name = string("current_key_normed_31_cast_fp16")];
+            tensor<int32, [4]> var_6078 = const()[name = string("op_6078"), val = tensor<int32, [4]>([1, 16, 128, -1])];
+            tensor<fp16, [1, 16, 128, 1]> mh_q_91_cast_fp16 = reshape(shape = var_6078, x = query_normed_31_cast_fp16)[name = string("mh_q_91_cast_fp16")];
+            tensor<int32, [4]> var_6080 = const()[name = string("op_6080"), val = tensor<int32, [4]>([1, 8, 128, -1])];
+            tensor<fp16, [1, 8, 128, 1]> mh_k_61_cast_fp16 = reshape(shape = var_6080, x = current_key_normed_31_cast_fp16)[name = string("mh_k_61_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> var_6084_cast_fp16 = mul(x = mh_q_91_cast_fp16, y = cos_1_cast_fp16)[name = string("op_6084_cast_fp16")];
+            tensor<int32, [4]> var_6089_begin_0 = const()[name = string("op_6089_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_6089_end_0 = const()[name = string("op_6089_end_0"), val = tensor<int32, [4]>([1, 16, 64, 1])];
+            tensor<bool, [4]> var_6089_end_mask_0 = const()[name = string("op_6089_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 16, 64, 1]> var_6089_cast_fp16 = slice_by_index(begin = var_6089_begin_0, end = var_6089_end_0, end_mask = var_6089_end_mask_0, x = mh_q_91_cast_fp16)[name = string("op_6089_cast_fp16")];
+            tensor<int32, [4]> var_6095_begin_0 = const()[name = string("op_6095_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_6095_end_0 = const()[name = string("op_6095_end_0"), val = tensor<int32, [4]>([1, 16, 128, 1])];
+            tensor<bool, [4]> var_6095_end_mask_0 = const()[name = string("op_6095_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 16, 64, 1]> var_6095_cast_fp16 = slice_by_index(begin = var_6095_begin_0, end = var_6095_end_0, end_mask = var_6095_end_mask_0, x = mh_q_91_cast_fp16)[name = string("op_6095_cast_fp16")];
+            fp16 const_362_promoted_to_fp16 = const()[name = string("const_362_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 16, 64, 1]> var_6097_cast_fp16 = mul(x = var_6095_cast_fp16, y = const_362_promoted_to_fp16)[name = string("op_6097_cast_fp16")];
+            bool var_6099_interleave_0 = const()[name = string("op_6099_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 1]> var_6099_cast_fp16 = concat(axis = var_5977, interleave = var_6099_interleave_0, values = (var_6097_cast_fp16, var_6089_cast_fp16))[name = string("op_6099_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> var_6100_cast_fp16 = mul(x = var_6099_cast_fp16, y = sin_1_cast_fp16)[name = string("op_6100_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> mh_q_93_cast_fp16 = add(x = var_6084_cast_fp16, y = var_6100_cast_fp16)[name = string("mh_q_93_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> var_6102_cast_fp16 = mul(x = mh_k_61_cast_fp16, y = cos_1_cast_fp16)[name = string("op_6102_cast_fp16")];
+            tensor<int32, [4]> var_6107_begin_0 = const()[name = string("op_6107_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_6107_end_0 = const()[name = string("op_6107_end_0"), val = tensor<int32, [4]>([1, 8, 64, 1])];
+            tensor<bool, [4]> var_6107_end_mask_0 = const()[name = string("op_6107_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 8, 64, 1]> var_6107_cast_fp16 = slice_by_index(begin = var_6107_begin_0, end = var_6107_end_0, end_mask = var_6107_end_mask_0, x = mh_k_61_cast_fp16)[name = string("op_6107_cast_fp16")];
+            tensor<int32, [4]> var_6113_begin_0 = const()[name = string("op_6113_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_6113_end_0 = const()[name = string("op_6113_end_0"), val = tensor<int32, [4]>([1, 8, 128, 1])];
+            tensor<bool, [4]> var_6113_end_mask_0 = const()[name = string("op_6113_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 8, 64, 1]> var_6113_cast_fp16 = slice_by_index(begin = var_6113_begin_0, end = var_6113_end_0, end_mask = var_6113_end_mask_0, x = mh_k_61_cast_fp16)[name = string("op_6113_cast_fp16")];
+            fp16 const_365_promoted_to_fp16 = const()[name = string("const_365_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 64, 1]> var_6115_cast_fp16 = mul(x = var_6113_cast_fp16, y = const_365_promoted_to_fp16)[name = string("op_6115_cast_fp16")];
+            bool var_6117_interleave_0 = const()[name = string("op_6117_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 128, 1]> var_6117_cast_fp16 = concat(axis = var_5977, interleave = var_6117_interleave_0, values = (var_6115_cast_fp16, var_6107_cast_fp16))[name = string("op_6117_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> var_6118_cast_fp16 = mul(x = var_6117_cast_fp16, y = sin_1_cast_fp16)[name = string("op_6118_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> mh_k_63_cast_fp16 = add(x = var_6102_cast_fp16, y = var_6118_cast_fp16)[name = string("mh_k_63_cast_fp16")];
+            tensor<int32, [4]> var_6122 = const()[name = string("op_6122"), val = tensor<int32, [4]>([1, 1024, 1, 1])];
+            tensor<fp16, [1, 1024, 1, 1]> current_key_63_cast_fp16 = reshape(shape = var_6122, x = mh_k_63_cast_fp16)[name = string("current_key_63_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_6129_cast_fp16 = mul(x = var_101_cast_fp16_15, y = var_323_cast_fp16)[name = string("op_6129_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_6130_cast_fp16 = mul(x = current_key_63_cast_fp16, y = var_321_cast_fp16)[name = string("op_6130_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> key_93_cast_fp16 = add(x = var_6129_cast_fp16, y = var_6130_cast_fp16)[name = string("key_93_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_6133_cast_fp16 = mul(x = var_132_cast_fp16_15, y = var_323_cast_fp16)[name = string("op_6133_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_6134_cast_fp16 = mul(x = current_value_31_cast_fp16, y = var_321_cast_fp16)[name = string("op_6134_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> value_61_cast_fp16 = add(x = var_6133_cast_fp16, y = var_6134_cast_fp16)[name = string("value_61_cast_fp16")];
+            tensor<int32, [4]> var_6138 = const()[name = string("op_6138"), val = tensor<int32, [4]>([1, 8, 128, 256])];
+            tensor<fp16, [1, 8, 128, 256]> key_heads_61_cast_fp16 = reshape(shape = var_6138, x = key_93_cast_fp16)[name = string("key_heads_61_cast_fp16")];
+            tensor<int32, [4]> var_6140 = const()[name = string("op_6140"), val = tensor<int32, [4]>([1, 8, 128, 256])];
+            tensor<fp16, [1, 8, 128, 256]> value_heads_61_cast_fp16 = reshape(shape = var_6140, x = value_61_cast_fp16)[name = string("value_heads_61_cast_fp16")];
+            tensor<int32, [4]> var_6143_begin_0 = const()[name = string("op_6143_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_6143_end_0 = const()[name = string("op_6143_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_6143_end_mask_0 = const()[name = string("op_6143_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_6143_cast_fp16 = slice_by_index(begin = var_6143_begin_0, end = var_6143_end_0, end_mask = var_6143_end_mask_0, x = key_heads_61_cast_fp16)[name = string("op_6143_cast_fp16")];
+            tensor<int32, [4]> var_6147_begin_0 = const()[name = string("op_6147_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_6147_end_0 = const()[name = string("op_6147_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_6147_end_mask_0 = const()[name = string("op_6147_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_6147_cast_fp16 = slice_by_index(begin = var_6147_begin_0, end = var_6147_end_0, end_mask = var_6147_end_mask_0, x = value_heads_61_cast_fp16)[name = string("op_6147_cast_fp16")];
+            tensor<int32, [4]> var_6159_begin_0 = const()[name = string("op_6159_begin_0"), val = tensor<int32, [4]>([0, 1, 0, 0])];
+            tensor<int32, [4]> var_6159_end_0 = const()[name = string("op_6159_end_0"), val = tensor<int32, [4]>([1, 2, 128, 256])];
+            tensor<bool, [4]> var_6159_end_mask_0 = const()[name = string("op_6159_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_6159_cast_fp16 = slice_by_index(begin = var_6159_begin_0, end = var_6159_end_0, end_mask = var_6159_end_mask_0, x = key_heads_61_cast_fp16)[name = string("op_6159_cast_fp16")];
+            tensor<int32, [4]> var_6163_begin_0 = const()[name = string("op_6163_begin_0"), val = tensor<int32, [4]>([0, 1, 0, 0])];
+            tensor<int32, [4]> var_6163_end_0 = const()[name = string("op_6163_end_0"), val = tensor<int32, [4]>([1, 2, 128, 256])];
+            tensor<bool, [4]> var_6163_end_mask_0 = const()[name = string("op_6163_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_6163_cast_fp16 = slice_by_index(begin = var_6163_begin_0, end = var_6163_end_0, end_mask = var_6163_end_mask_0, x = value_heads_61_cast_fp16)[name = string("op_6163_cast_fp16")];
+            tensor<int32, [4]> var_6175_begin_0 = const()[name = string("op_6175_begin_0"), val = tensor<int32, [4]>([0, 2, 0, 0])];
+            tensor<int32, [4]> var_6175_end_0 = const()[name = string("op_6175_end_0"), val = tensor<int32, [4]>([1, 3, 128, 256])];
+            tensor<bool, [4]> var_6175_end_mask_0 = const()[name = string("op_6175_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_6175_cast_fp16 = slice_by_index(begin = var_6175_begin_0, end = var_6175_end_0, end_mask = var_6175_end_mask_0, x = key_heads_61_cast_fp16)[name = string("op_6175_cast_fp16")];
+            tensor<int32, [4]> var_6179_begin_0 = const()[name = string("op_6179_begin_0"), val = tensor<int32, [4]>([0, 2, 0, 0])];
+            tensor<int32, [4]> var_6179_end_0 = const()[name = string("op_6179_end_0"), val = tensor<int32, [4]>([1, 3, 128, 256])];
+            tensor<bool, [4]> var_6179_end_mask_0 = const()[name = string("op_6179_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_6179_cast_fp16 = slice_by_index(begin = var_6179_begin_0, end = var_6179_end_0, end_mask = var_6179_end_mask_0, x = value_heads_61_cast_fp16)[name = string("op_6179_cast_fp16")];
+            tensor<int32, [4]> var_6191_begin_0 = const()[name = string("op_6191_begin_0"), val = tensor<int32, [4]>([0, 3, 0, 0])];
+            tensor<int32, [4]> var_6191_end_0 = const()[name = string("op_6191_end_0"), val = tensor<int32, [4]>([1, 4, 128, 256])];
+            tensor<bool, [4]> var_6191_end_mask_0 = const()[name = string("op_6191_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_6191_cast_fp16 = slice_by_index(begin = var_6191_begin_0, end = var_6191_end_0, end_mask = var_6191_end_mask_0, x = key_heads_61_cast_fp16)[name = string("op_6191_cast_fp16")];
+            tensor<int32, [4]> var_6195_begin_0 = const()[name = string("op_6195_begin_0"), val = tensor<int32, [4]>([0, 3, 0, 0])];
+            tensor<int32, [4]> var_6195_end_0 = const()[name = string("op_6195_end_0"), val = tensor<int32, [4]>([1, 4, 128, 256])];
+            tensor<bool, [4]> var_6195_end_mask_0 = const()[name = string("op_6195_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_6195_cast_fp16 = slice_by_index(begin = var_6195_begin_0, end = var_6195_end_0, end_mask = var_6195_end_mask_0, x = value_heads_61_cast_fp16)[name = string("op_6195_cast_fp16")];
+            tensor<int32, [4]> var_6207_begin_0 = const()[name = string("op_6207_begin_0"), val = tensor<int32, [4]>([0, 4, 0, 0])];
+            tensor<int32, [4]> var_6207_end_0 = const()[name = string("op_6207_end_0"), val = tensor<int32, [4]>([1, 5, 128, 256])];
+            tensor<bool, [4]> var_6207_end_mask_0 = const()[name = string("op_6207_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_6207_cast_fp16 = slice_by_index(begin = var_6207_begin_0, end = var_6207_end_0, end_mask = var_6207_end_mask_0, x = key_heads_61_cast_fp16)[name = string("op_6207_cast_fp16")];
+            tensor<int32, [4]> var_6211_begin_0 = const()[name = string("op_6211_begin_0"), val = tensor<int32, [4]>([0, 4, 0, 0])];
+            tensor<int32, [4]> var_6211_end_0 = const()[name = string("op_6211_end_0"), val = tensor<int32, [4]>([1, 5, 128, 256])];
+            tensor<bool, [4]> var_6211_end_mask_0 = const()[name = string("op_6211_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_6211_cast_fp16 = slice_by_index(begin = var_6211_begin_0, end = var_6211_end_0, end_mask = var_6211_end_mask_0, x = value_heads_61_cast_fp16)[name = string("op_6211_cast_fp16")];
+            tensor<int32, [4]> var_6223_begin_0 = const()[name = string("op_6223_begin_0"), val = tensor<int32, [4]>([0, 5, 0, 0])];
+            tensor<int32, [4]> var_6223_end_0 = const()[name = string("op_6223_end_0"), val = tensor<int32, [4]>([1, 6, 128, 256])];
+            tensor<bool, [4]> var_6223_end_mask_0 = const()[name = string("op_6223_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_6223_cast_fp16 = slice_by_index(begin = var_6223_begin_0, end = var_6223_end_0, end_mask = var_6223_end_mask_0, x = key_heads_61_cast_fp16)[name = string("op_6223_cast_fp16")];
+            tensor<int32, [4]> var_6227_begin_0 = const()[name = string("op_6227_begin_0"), val = tensor<int32, [4]>([0, 5, 0, 0])];
+            tensor<int32, [4]> var_6227_end_0 = const()[name = string("op_6227_end_0"), val = tensor<int32, [4]>([1, 6, 128, 256])];
+            tensor<bool, [4]> var_6227_end_mask_0 = const()[name = string("op_6227_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_6227_cast_fp16 = slice_by_index(begin = var_6227_begin_0, end = var_6227_end_0, end_mask = var_6227_end_mask_0, x = value_heads_61_cast_fp16)[name = string("op_6227_cast_fp16")];
+            tensor<int32, [4]> var_6239_begin_0 = const()[name = string("op_6239_begin_0"), val = tensor<int32, [4]>([0, 6, 0, 0])];
+            tensor<int32, [4]> var_6239_end_0 = const()[name = string("op_6239_end_0"), val = tensor<int32, [4]>([1, 7, 128, 256])];
+            tensor<bool, [4]> var_6239_end_mask_0 = const()[name = string("op_6239_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_6239_cast_fp16 = slice_by_index(begin = var_6239_begin_0, end = var_6239_end_0, end_mask = var_6239_end_mask_0, x = key_heads_61_cast_fp16)[name = string("op_6239_cast_fp16")];
+            tensor<int32, [4]> var_6243_begin_0 = const()[name = string("op_6243_begin_0"), val = tensor<int32, [4]>([0, 6, 0, 0])];
+            tensor<int32, [4]> var_6243_end_0 = const()[name = string("op_6243_end_0"), val = tensor<int32, [4]>([1, 7, 128, 256])];
+            tensor<bool, [4]> var_6243_end_mask_0 = const()[name = string("op_6243_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_6243_cast_fp16 = slice_by_index(begin = var_6243_begin_0, end = var_6243_end_0, end_mask = var_6243_end_mask_0, x = value_heads_61_cast_fp16)[name = string("op_6243_cast_fp16")];
+            tensor<int32, [4]> var_6255_begin_0 = const()[name = string("op_6255_begin_0"), val = tensor<int32, [4]>([0, 7, 0, 0])];
+            tensor<int32, [4]> var_6255_end_0 = const()[name = string("op_6255_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_6255_end_mask_0 = const()[name = string("op_6255_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_6255_cast_fp16 = slice_by_index(begin = var_6255_begin_0, end = var_6255_end_0, end_mask = var_6255_end_mask_0, x = key_heads_61_cast_fp16)[name = string("op_6255_cast_fp16")];
+            tensor<int32, [4]> var_6259_begin_0 = const()[name = string("op_6259_begin_0"), val = tensor<int32, [4]>([0, 7, 0, 0])];
+            tensor<int32, [4]> var_6259_end_0 = const()[name = string("op_6259_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_6259_end_mask_0 = const()[name = string("op_6259_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_6259_cast_fp16 = slice_by_index(begin = var_6259_begin_0, end = var_6259_end_0, end_mask = var_6259_end_mask_0, x = value_heads_61_cast_fp16)[name = string("op_6259_cast_fp16")];
+            bool key_heads_63_interleave_0 = const()[name = string("key_heads_63_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 256]> key_heads_63_cast_fp16 = concat(axis = var_5985, interleave = key_heads_63_interleave_0, values = (var_6143_cast_fp16, var_6143_cast_fp16, var_6159_cast_fp16, var_6159_cast_fp16, var_6175_cast_fp16, var_6175_cast_fp16, var_6191_cast_fp16, var_6191_cast_fp16, var_6207_cast_fp16, var_6207_cast_fp16, var_6223_cast_fp16, var_6223_cast_fp16, var_6239_cast_fp16, var_6239_cast_fp16, var_6255_cast_fp16, var_6255_cast_fp16))[name = string("key_heads_63_cast_fp16")];
+            bool value_heads_63_interleave_0 = const()[name = string("value_heads_63_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 256]> value_heads_63_cast_fp16 = concat(axis = var_5985, interleave = value_heads_63_interleave_0, values = (var_6147_cast_fp16, var_6147_cast_fp16, var_6163_cast_fp16, var_6163_cast_fp16, var_6179_cast_fp16, var_6179_cast_fp16, var_6195_cast_fp16, var_6195_cast_fp16, var_6211_cast_fp16, var_6211_cast_fp16, var_6227_cast_fp16, var_6227_cast_fp16, var_6243_cast_fp16, var_6243_cast_fp16, var_6259_cast_fp16, var_6259_cast_fp16))[name = string("value_heads_63_cast_fp16")];
+            fp16 var_6282_to_fp16 = const()[name = string("op_6282_to_fp16"), val = fp16(0x1.6ap-4)];
+            tensor<fp16, [1, 16, 128, 1]> var_6283_cast_fp16 = mul(x = mh_q_93_cast_fp16, y = var_6282_to_fp16)[name = string("op_6283_cast_fp16")];
+            bool mh_w_61_transpose_x_0 = const()[name = string("mh_w_61_transpose_x_0"), val = bool(true)];
+            bool mh_w_61_transpose_y_0 = const()[name = string("mh_w_61_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 1, 256]> mh_w_61_cast_fp16 = matmul(transpose_x = mh_w_61_transpose_x_0, transpose_y = mh_w_61_transpose_y_0, x = var_6283_cast_fp16, y = key_heads_63_cast_fp16)[name = string("mh_w_61_cast_fp16")];
+            tensor<fp16, [1, 16, 1, 256]> mh_w_63_cast_fp16 = add(x = mh_w_61_cast_fp16, y = var_487_cast_fp16)[name = string("mh_w_63_cast_fp16")];
+            tensor<fp16, [1, 16, 1, 256]> var_6295_cast_fp16 = softmax(axis = var_5967, x = mh_w_63_cast_fp16)[name = string("op_6295_cast_fp16")];
+            bool attn_31_transpose_x_0 = const()[name = string("attn_31_transpose_x_0"), val = bool(false)];
+            bool attn_31_transpose_y_0 = const()[name = string("attn_31_transpose_y_0"), val = bool(true)];
+            tensor<fp16, [1, 16, 128, 1]> attn_31_cast_fp16 = matmul(transpose_x = attn_31_transpose_x_0, transpose_y = attn_31_transpose_y_0, x = value_heads_63_cast_fp16, y = var_6295_cast_fp16)[name = string("attn_31_cast_fp16")];
+            tensor<int32, [4]> var_6300 = const()[name = string("op_6300"), val = tensor<int32, [4]>([1, -1, 1, 1])];
+            tensor<fp16, [1, 2048, 1, 1]> input_121_cast_fp16 = reshape(shape = var_6300, x = attn_31_cast_fp16)[name = string("input_121_cast_fp16")];
+            string obj_131_pad_type_0 = const()[name = string("obj_131_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> obj_131_strides_0 = const()[name = string("obj_131_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> obj_131_pad_0 = const()[name = string("obj_131_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> obj_131_dilations_0 = const()[name = string("obj_131_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 obj_131_groups_0 = const()[name = string("obj_131_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 2048, 1, 1]> layers_15_self_attn_o_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(240342272))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(242439488))))[name = string("layers_15_self_attn_o_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> obj_131_cast_fp16 = conv(bias = layers_0_self_attn_v_proj_bias_to_fp16, dilations = obj_131_dilations_0, groups = obj_131_groups_0, pad = obj_131_pad_0, pad_type = obj_131_pad_type_0, strides = obj_131_strides_0, weight = layers_15_self_attn_o_proj_weight_to_fp16_palettized, x = input_121_cast_fp16)[name = string("obj_131_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> inputs_125_cast_fp16 = add(x = inputs_119_cast_fp16, y = obj_131_cast_fp16)[name = string("inputs_125_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> inputs_sq_127_cast_fp16 = mul(x = inputs_125_cast_fp16, y = inputs_125_cast_fp16)[name = string("inputs_sq_127_cast_fp16")];
+            tensor<int32, [1]> variance_127_axes_0 = const()[name = string("variance_127_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_127_keep_dims_0 = const()[name = string("variance_127_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 1, 1, 1]> variance_127_cast_fp16 = reduce_mean(axes = variance_127_axes_0, keep_dims = variance_127_keep_dims_0, x = inputs_sq_127_cast_fp16)[name = string("variance_127_cast_fp16")];
+            fp16 var_6318_to_fp16 = const()[name = string("op_6318_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 1, 1]> var_6319_cast_fp16 = add(x = variance_127_cast_fp16, y = var_6318_to_fp16)[name = string("op_6319_cast_fp16")];
+            fp32 var_6320_epsilon_0 = const()[name = string("op_6320_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 1, 1, 1]> var_6320_cast_fp16 = rsqrt(epsilon = var_6320_epsilon_0, x = var_6319_cast_fp16)[name = string("op_6320_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> hidden_states_157_cast_fp16 = mul(x = inputs_125_cast_fp16, y = var_6320_cast_fp16)[name = string("hidden_states_157_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> w_127_to_fp16 = const()[name = string("w_127_to_fp16"), val = tensor<fp16, [1, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(242440064)))];
+            tensor<fp16, [1, 1024, 1, 1]> input_123_cast_fp16 = mul(x = w_127_to_fp16, y = hidden_states_157_cast_fp16)[name = string("input_123_cast_fp16")];
+            string input_125_pad_type_0 = const()[name = string("input_125_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> input_125_strides_0 = const()[name = string("input_125_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> input_125_pad_0 = const()[name = string("input_125_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> input_125_dilations_0 = const()[name = string("input_125_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 input_125_groups_0 = const()[name = string("input_125_groups_0"), val = int32(1)];
+            tensor<fp16, [3072, 1024, 1, 1]> layers_15_mlp_gate_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [3072, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(242442176))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(245587968))))[name = string("layers_15_mlp_gate_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 3072, 1, 1]> input_125_cast_fp16 = conv(dilations = input_125_dilations_0, groups = input_125_groups_0, pad = input_125_pad_0, pad_type = input_125_pad_type_0, strides = input_125_strides_0, weight = layers_15_mlp_gate_proj_weight_to_fp16_palettized, x = input_123_cast_fp16)[name = string("input_125_cast_fp16")];
+            tensor<fp16, [1, 3072, 1, 1]> var_6334_cast_fp16 = silu(x = input_125_cast_fp16)[name = string("op_6334_cast_fp16")];
+            string var_6340_pad_type_0 = const()[name = string("op_6340_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_6340_strides_0 = const()[name = string("op_6340_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_6340_pad_0 = const()[name = string("op_6340_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_6340_dilations_0 = const()[name = string("op_6340_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_6340_groups_0 = const()[name = string("op_6340_groups_0"), val = int32(1)];
+            tensor<fp16, [3072, 1024, 1, 1]> layers_15_mlp_up_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [3072, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(245588544))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(248734336))))[name = string("layers_15_mlp_up_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 3072, 1, 1]> var_6340_cast_fp16 = conv(dilations = var_6340_dilations_0, groups = var_6340_groups_0, pad = var_6340_pad_0, pad_type = var_6340_pad_type_0, strides = var_6340_strides_0, weight = layers_15_mlp_up_proj_weight_to_fp16_palettized, x = input_123_cast_fp16)[name = string("op_6340_cast_fp16")];
+            tensor<fp16, [1, 3072, 1, 1]> input_127_cast_fp16 = mul(x = var_6334_cast_fp16, y = var_6340_cast_fp16)[name = string("input_127_cast_fp16")];
+            string hidden_states_159_pad_type_0 = const()[name = string("hidden_states_159_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> hidden_states_159_strides_0 = const()[name = string("hidden_states_159_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> hidden_states_159_pad_0 = const()[name = string("hidden_states_159_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> hidden_states_159_dilations_0 = const()[name = string("hidden_states_159_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 hidden_states_159_groups_0 = const()[name = string("hidden_states_159_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 3072, 1, 1]> layers_15_mlp_down_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 3072, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(248734912))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(251880704))))[name = string("layers_15_mlp_down_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> hidden_states_159_cast_fp16 = conv(dilations = hidden_states_159_dilations_0, groups = hidden_states_159_groups_0, pad = hidden_states_159_pad_0, pad_type = hidden_states_159_pad_type_0, strides = hidden_states_159_strides_0, weight = layers_15_mlp_down_proj_weight_to_fp16_palettized, x = input_127_cast_fp16)[name = string("hidden_states_159_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> inputs_127_cast_fp16 = add(x = inputs_125_cast_fp16, y = hidden_states_159_cast_fp16)[name = string("inputs_127_cast_fp16")];
+            int32 var_6354 = const()[name = string("op_6354"), val = int32(3)];
+            int32 var_6364 = const()[name = string("op_6364"), val = int32(-2)];
+            int32 var_6372 = const()[name = string("op_6372"), val = int32(1)];
+            tensor<fp16, [1, 1024, 1, 1]> inputs_sq_129_cast_fp16 = mul(x = inputs_127_cast_fp16, y = inputs_127_cast_fp16)[name = string("inputs_sq_129_cast_fp16")];
+            tensor<int32, [1]> variance_129_axes_0 = const()[name = string("variance_129_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_129_keep_dims_0 = const()[name = string("variance_129_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 1, 1, 1]> variance_129_cast_fp16 = reduce_mean(axes = variance_129_axes_0, keep_dims = variance_129_keep_dims_0, x = inputs_sq_129_cast_fp16)[name = string("variance_129_cast_fp16")];
+            fp16 var_6384_to_fp16 = const()[name = string("op_6384_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 1, 1]> var_6385_cast_fp16 = add(x = variance_129_cast_fp16, y = var_6384_to_fp16)[name = string("op_6385_cast_fp16")];
+            fp32 var_6386_epsilon_0 = const()[name = string("op_6386_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 1, 1, 1]> var_6386_cast_fp16 = rsqrt(epsilon = var_6386_epsilon_0, x = var_6385_cast_fp16)[name = string("op_6386_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> hidden_states_161_cast_fp16 = mul(x = inputs_127_cast_fp16, y = var_6386_cast_fp16)[name = string("hidden_states_161_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> w_129_to_fp16 = const()[name = string("w_129_to_fp16"), val = tensor<fp16, [1, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(251881280)))];
+            tensor<fp16, [1, 1024, 1, 1]> obj_133_cast_fp16 = mul(x = w_129_to_fp16, y = hidden_states_161_cast_fp16)[name = string("obj_133_cast_fp16")];
+            string query_97_pad_type_0 = const()[name = string("query_97_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> query_97_strides_0 = const()[name = string("query_97_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> query_97_pad_0 = const()[name = string("query_97_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> query_97_dilations_0 = const()[name = string("query_97_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 query_97_groups_0 = const()[name = string("query_97_groups_0"), val = int32(1)];
+            tensor<fp16, [2048, 1024, 1, 1]> layers_16_self_attn_q_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [2048, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(251883392))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(253980608))))[name = string("layers_16_self_attn_q_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1, 1]> query_97_cast_fp16 = conv(bias = layers_0_self_attn_q_proj_bias_to_fp16, dilations = query_97_dilations_0, groups = query_97_groups_0, pad = query_97_pad_0, pad_type = query_97_pad_type_0, strides = query_97_strides_0, weight = layers_16_self_attn_q_proj_weight_to_fp16_palettized, x = obj_133_cast_fp16)[name = string("query_97_cast_fp16")];
+            string current_key_65_pad_type_0 = const()[name = string("current_key_65_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> current_key_65_strides_0 = const()[name = string("current_key_65_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> current_key_65_pad_0 = const()[name = string("current_key_65_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> current_key_65_dilations_0 = const()[name = string("current_key_65_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 current_key_65_groups_0 = const()[name = string("current_key_65_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_16_self_attn_k_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(253981184))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(255029824))))[name = string("layers_16_self_attn_k_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> current_key_65_cast_fp16 = conv(dilations = current_key_65_dilations_0, groups = current_key_65_groups_0, pad = current_key_65_pad_0, pad_type = current_key_65_pad_type_0, strides = current_key_65_strides_0, weight = layers_16_self_attn_k_proj_weight_to_fp16_palettized, x = obj_133_cast_fp16)[name = string("current_key_65_cast_fp16")];
+            string current_value_33_pad_type_0 = const()[name = string("current_value_33_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> current_value_33_strides_0 = const()[name = string("current_value_33_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> current_value_33_pad_0 = const()[name = string("current_value_33_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> current_value_33_dilations_0 = const()[name = string("current_value_33_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 current_value_33_groups_0 = const()[name = string("current_value_33_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_16_self_attn_v_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(255030400))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(256079040))))[name = string("layers_16_self_attn_v_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> current_value_33_cast_fp16 = conv(bias = layers_0_self_attn_v_proj_bias_to_fp16, dilations = current_value_33_dilations_0, groups = current_value_33_groups_0, pad = current_value_33_pad_0, pad_type = current_value_33_pad_type_0, strides = current_value_33_strides_0, weight = layers_16_self_attn_v_proj_weight_to_fp16_palettized, x = obj_133_cast_fp16)[name = string("current_value_33_cast_fp16")];
+            tensor<int32, [4]> var_6423 = const()[name = string("op_6423"), val = tensor<int32, [4]>([16, 128, 1, 1])];
+            tensor<fp16, [16, 128, 1, 1]> inputs_129_cast_fp16 = reshape(shape = var_6423, x = query_97_cast_fp16)[name = string("inputs_129_cast_fp16")];
+            tensor<fp16, [16, 128, 1, 1]> inputs_sq_131_cast_fp16 = mul(x = inputs_129_cast_fp16, y = inputs_129_cast_fp16)[name = string("inputs_sq_131_cast_fp16")];
+            tensor<int32, [1]> variance_131_axes_0 = const()[name = string("variance_131_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_131_keep_dims_0 = const()[name = string("variance_131_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [16, 1, 1, 1]> variance_131_cast_fp16 = reduce_mean(axes = variance_131_axes_0, keep_dims = variance_131_keep_dims_0, x = inputs_sq_131_cast_fp16)[name = string("variance_131_cast_fp16")];
+            fp16 var_6429_to_fp16 = const()[name = string("op_6429_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [16, 1, 1, 1]> var_6430_cast_fp16 = add(x = variance_131_cast_fp16, y = var_6429_to_fp16)[name = string("op_6430_cast_fp16")];
+            fp32 var_6431_epsilon_0 = const()[name = string("op_6431_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [16, 1, 1, 1]> var_6431_cast_fp16 = rsqrt(epsilon = var_6431_epsilon_0, x = var_6430_cast_fp16)[name = string("op_6431_cast_fp16")];
+            tensor<fp16, [16, 128, 1, 1]> hidden_states_163_cast_fp16 = mul(x = inputs_129_cast_fp16, y = var_6431_cast_fp16)[name = string("hidden_states_163_cast_fp16")];
+            tensor<fp16, [1, 128, 1, 1]> w_131_to_fp16 = const()[name = string("w_131_to_fp16"), val = tensor<fp16, [1, 128, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(256079616)))];
+            tensor<fp16, [16, 128, 1, 1]> query_normed_33_cast_fp16 = mul(x = w_131_to_fp16, y = hidden_states_163_cast_fp16)[name = string("query_normed_33_cast_fp16")];
+            tensor<int32, [4]> var_6439 = const()[name = string("op_6439"), val = tensor<int32, [4]>([8, 128, 1, 1])];
+            tensor<fp16, [8, 128, 1, 1]> inputs_131_cast_fp16 = reshape(shape = var_6439, x = current_key_65_cast_fp16)[name = string("inputs_131_cast_fp16")];
+            tensor<fp16, [8, 128, 1, 1]> inputs_sq_133_cast_fp16 = mul(x = inputs_131_cast_fp16, y = inputs_131_cast_fp16)[name = string("inputs_sq_133_cast_fp16")];
+            tensor<int32, [1]> variance_133_axes_0 = const()[name = string("variance_133_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_133_keep_dims_0 = const()[name = string("variance_133_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [8, 1, 1, 1]> variance_133_cast_fp16 = reduce_mean(axes = variance_133_axes_0, keep_dims = variance_133_keep_dims_0, x = inputs_sq_133_cast_fp16)[name = string("variance_133_cast_fp16")];
+            fp16 var_6445_to_fp16 = const()[name = string("op_6445_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [8, 1, 1, 1]> var_6446_cast_fp16 = add(x = variance_133_cast_fp16, y = var_6445_to_fp16)[name = string("op_6446_cast_fp16")];
+            fp32 var_6447_epsilon_0 = const()[name = string("op_6447_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [8, 1, 1, 1]> var_6447_cast_fp16 = rsqrt(epsilon = var_6447_epsilon_0, x = var_6446_cast_fp16)[name = string("op_6447_cast_fp16")];
+            tensor<fp16, [8, 128, 1, 1]> hidden_states_165_cast_fp16 = mul(x = inputs_131_cast_fp16, y = var_6447_cast_fp16)[name = string("hidden_states_165_cast_fp16")];
+            tensor<fp16, [1, 128, 1, 1]> w_133_to_fp16 = const()[name = string("w_133_to_fp16"), val = tensor<fp16, [1, 128, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(256079936)))];
+            tensor<fp16, [8, 128, 1, 1]> current_key_normed_33_cast_fp16 = mul(x = w_133_to_fp16, y = hidden_states_165_cast_fp16)[name = string("current_key_normed_33_cast_fp16")];
+            tensor<int32, [4]> var_6465 = const()[name = string("op_6465"), val = tensor<int32, [4]>([1, 16, 128, -1])];
+            tensor<fp16, [1, 16, 128, 1]> mh_q_97_cast_fp16 = reshape(shape = var_6465, x = query_normed_33_cast_fp16)[name = string("mh_q_97_cast_fp16")];
+            tensor<int32, [4]> var_6467 = const()[name = string("op_6467"), val = tensor<int32, [4]>([1, 8, 128, -1])];
+            tensor<fp16, [1, 8, 128, 1]> mh_k_65_cast_fp16 = reshape(shape = var_6467, x = current_key_normed_33_cast_fp16)[name = string("mh_k_65_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> var_6471_cast_fp16 = mul(x = mh_q_97_cast_fp16, y = cos_1_cast_fp16)[name = string("op_6471_cast_fp16")];
+            tensor<int32, [4]> var_6476_begin_0 = const()[name = string("op_6476_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_6476_end_0 = const()[name = string("op_6476_end_0"), val = tensor<int32, [4]>([1, 16, 64, 1])];
+            tensor<bool, [4]> var_6476_end_mask_0 = const()[name = string("op_6476_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 16, 64, 1]> var_6476_cast_fp16 = slice_by_index(begin = var_6476_begin_0, end = var_6476_end_0, end_mask = var_6476_end_mask_0, x = mh_q_97_cast_fp16)[name = string("op_6476_cast_fp16")];
+            tensor<int32, [4]> var_6482_begin_0 = const()[name = string("op_6482_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_6482_end_0 = const()[name = string("op_6482_end_0"), val = tensor<int32, [4]>([1, 16, 128, 1])];
+            tensor<bool, [4]> var_6482_end_mask_0 = const()[name = string("op_6482_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 16, 64, 1]> var_6482_cast_fp16 = slice_by_index(begin = var_6482_begin_0, end = var_6482_end_0, end_mask = var_6482_end_mask_0, x = mh_q_97_cast_fp16)[name = string("op_6482_cast_fp16")];
+            fp16 const_385_promoted_to_fp16 = const()[name = string("const_385_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 16, 64, 1]> var_6484_cast_fp16 = mul(x = var_6482_cast_fp16, y = const_385_promoted_to_fp16)[name = string("op_6484_cast_fp16")];
+            bool var_6486_interleave_0 = const()[name = string("op_6486_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 1]> var_6486_cast_fp16 = concat(axis = var_6364, interleave = var_6486_interleave_0, values = (var_6484_cast_fp16, var_6476_cast_fp16))[name = string("op_6486_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> var_6487_cast_fp16 = mul(x = var_6486_cast_fp16, y = sin_1_cast_fp16)[name = string("op_6487_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> mh_q_99_cast_fp16 = add(x = var_6471_cast_fp16, y = var_6487_cast_fp16)[name = string("mh_q_99_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> var_6489_cast_fp16 = mul(x = mh_k_65_cast_fp16, y = cos_1_cast_fp16)[name = string("op_6489_cast_fp16")];
+            tensor<int32, [4]> var_6494_begin_0 = const()[name = string("op_6494_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_6494_end_0 = const()[name = string("op_6494_end_0"), val = tensor<int32, [4]>([1, 8, 64, 1])];
+            tensor<bool, [4]> var_6494_end_mask_0 = const()[name = string("op_6494_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 8, 64, 1]> var_6494_cast_fp16 = slice_by_index(begin = var_6494_begin_0, end = var_6494_end_0, end_mask = var_6494_end_mask_0, x = mh_k_65_cast_fp16)[name = string("op_6494_cast_fp16")];
+            tensor<int32, [4]> var_6500_begin_0 = const()[name = string("op_6500_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_6500_end_0 = const()[name = string("op_6500_end_0"), val = tensor<int32, [4]>([1, 8, 128, 1])];
+            tensor<bool, [4]> var_6500_end_mask_0 = const()[name = string("op_6500_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 8, 64, 1]> var_6500_cast_fp16 = slice_by_index(begin = var_6500_begin_0, end = var_6500_end_0, end_mask = var_6500_end_mask_0, x = mh_k_65_cast_fp16)[name = string("op_6500_cast_fp16")];
+            fp16 const_388_promoted_to_fp16 = const()[name = string("const_388_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 64, 1]> var_6502_cast_fp16 = mul(x = var_6500_cast_fp16, y = const_388_promoted_to_fp16)[name = string("op_6502_cast_fp16")];
+            bool var_6504_interleave_0 = const()[name = string("op_6504_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 128, 1]> var_6504_cast_fp16 = concat(axis = var_6364, interleave = var_6504_interleave_0, values = (var_6502_cast_fp16, var_6494_cast_fp16))[name = string("op_6504_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> var_6505_cast_fp16 = mul(x = var_6504_cast_fp16, y = sin_1_cast_fp16)[name = string("op_6505_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> mh_k_67_cast_fp16 = add(x = var_6489_cast_fp16, y = var_6505_cast_fp16)[name = string("mh_k_67_cast_fp16")];
+            tensor<int32, [4]> var_6509 = const()[name = string("op_6509"), val = tensor<int32, [4]>([1, 1024, 1, 1])];
+            tensor<fp16, [1, 1024, 1, 1]> current_key_67_cast_fp16 = reshape(shape = var_6509, x = mh_k_67_cast_fp16)[name = string("current_key_67_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_6516_cast_fp16 = mul(x = var_101_cast_fp16_16, y = var_323_cast_fp16)[name = string("op_6516_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_6517_cast_fp16 = mul(x = current_key_67_cast_fp16, y = var_321_cast_fp16)[name = string("op_6517_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> key_99_cast_fp16 = add(x = var_6516_cast_fp16, y = var_6517_cast_fp16)[name = string("key_99_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_6520_cast_fp16 = mul(x = var_132_cast_fp16_16, y = var_323_cast_fp16)[name = string("op_6520_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_6521_cast_fp16 = mul(x = current_value_33_cast_fp16, y = var_321_cast_fp16)[name = string("op_6521_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> value_65_cast_fp16 = add(x = var_6520_cast_fp16, y = var_6521_cast_fp16)[name = string("value_65_cast_fp16")];
+            tensor<int32, [4]> var_6525 = const()[name = string("op_6525"), val = tensor<int32, [4]>([1, 8, 128, 256])];
+            tensor<fp16, [1, 8, 128, 256]> key_heads_65_cast_fp16 = reshape(shape = var_6525, x = key_99_cast_fp16)[name = string("key_heads_65_cast_fp16")];
+            tensor<int32, [4]> var_6527 = const()[name = string("op_6527"), val = tensor<int32, [4]>([1, 8, 128, 256])];
+            tensor<fp16, [1, 8, 128, 256]> value_heads_65_cast_fp16 = reshape(shape = var_6527, x = value_65_cast_fp16)[name = string("value_heads_65_cast_fp16")];
+            tensor<int32, [4]> var_6530_begin_0 = const()[name = string("op_6530_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_6530_end_0 = const()[name = string("op_6530_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_6530_end_mask_0 = const()[name = string("op_6530_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_6530_cast_fp16 = slice_by_index(begin = var_6530_begin_0, end = var_6530_end_0, end_mask = var_6530_end_mask_0, x = key_heads_65_cast_fp16)[name = string("op_6530_cast_fp16")];
+            tensor<int32, [4]> var_6534_begin_0 = const()[name = string("op_6534_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_6534_end_0 = const()[name = string("op_6534_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_6534_end_mask_0 = const()[name = string("op_6534_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_6534_cast_fp16 = slice_by_index(begin = var_6534_begin_0, end = var_6534_end_0, end_mask = var_6534_end_mask_0, x = value_heads_65_cast_fp16)[name = string("op_6534_cast_fp16")];
+            tensor<int32, [4]> var_6546_begin_0 = const()[name = string("op_6546_begin_0"), val = tensor<int32, [4]>([0, 1, 0, 0])];
+            tensor<int32, [4]> var_6546_end_0 = const()[name = string("op_6546_end_0"), val = tensor<int32, [4]>([1, 2, 128, 256])];
+            tensor<bool, [4]> var_6546_end_mask_0 = const()[name = string("op_6546_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_6546_cast_fp16 = slice_by_index(begin = var_6546_begin_0, end = var_6546_end_0, end_mask = var_6546_end_mask_0, x = key_heads_65_cast_fp16)[name = string("op_6546_cast_fp16")];
+            tensor<int32, [4]> var_6550_begin_0 = const()[name = string("op_6550_begin_0"), val = tensor<int32, [4]>([0, 1, 0, 0])];
+            tensor<int32, [4]> var_6550_end_0 = const()[name = string("op_6550_end_0"), val = tensor<int32, [4]>([1, 2, 128, 256])];
+            tensor<bool, [4]> var_6550_end_mask_0 = const()[name = string("op_6550_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_6550_cast_fp16 = slice_by_index(begin = var_6550_begin_0, end = var_6550_end_0, end_mask = var_6550_end_mask_0, x = value_heads_65_cast_fp16)[name = string("op_6550_cast_fp16")];
+            tensor<int32, [4]> var_6562_begin_0 = const()[name = string("op_6562_begin_0"), val = tensor<int32, [4]>([0, 2, 0, 0])];
+            tensor<int32, [4]> var_6562_end_0 = const()[name = string("op_6562_end_0"), val = tensor<int32, [4]>([1, 3, 128, 256])];
+            tensor<bool, [4]> var_6562_end_mask_0 = const()[name = string("op_6562_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_6562_cast_fp16 = slice_by_index(begin = var_6562_begin_0, end = var_6562_end_0, end_mask = var_6562_end_mask_0, x = key_heads_65_cast_fp16)[name = string("op_6562_cast_fp16")];
+            tensor<int32, [4]> var_6566_begin_0 = const()[name = string("op_6566_begin_0"), val = tensor<int32, [4]>([0, 2, 0, 0])];
+            tensor<int32, [4]> var_6566_end_0 = const()[name = string("op_6566_end_0"), val = tensor<int32, [4]>([1, 3, 128, 256])];
+            tensor<bool, [4]> var_6566_end_mask_0 = const()[name = string("op_6566_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_6566_cast_fp16 = slice_by_index(begin = var_6566_begin_0, end = var_6566_end_0, end_mask = var_6566_end_mask_0, x = value_heads_65_cast_fp16)[name = string("op_6566_cast_fp16")];
+            tensor<int32, [4]> var_6578_begin_0 = const()[name = string("op_6578_begin_0"), val = tensor<int32, [4]>([0, 3, 0, 0])];
+            tensor<int32, [4]> var_6578_end_0 = const()[name = string("op_6578_end_0"), val = tensor<int32, [4]>([1, 4, 128, 256])];
+            tensor<bool, [4]> var_6578_end_mask_0 = const()[name = string("op_6578_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_6578_cast_fp16 = slice_by_index(begin = var_6578_begin_0, end = var_6578_end_0, end_mask = var_6578_end_mask_0, x = key_heads_65_cast_fp16)[name = string("op_6578_cast_fp16")];
+            tensor<int32, [4]> var_6582_begin_0 = const()[name = string("op_6582_begin_0"), val = tensor<int32, [4]>([0, 3, 0, 0])];
+            tensor<int32, [4]> var_6582_end_0 = const()[name = string("op_6582_end_0"), val = tensor<int32, [4]>([1, 4, 128, 256])];
+            tensor<bool, [4]> var_6582_end_mask_0 = const()[name = string("op_6582_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_6582_cast_fp16 = slice_by_index(begin = var_6582_begin_0, end = var_6582_end_0, end_mask = var_6582_end_mask_0, x = value_heads_65_cast_fp16)[name = string("op_6582_cast_fp16")];
+            tensor<int32, [4]> var_6594_begin_0 = const()[name = string("op_6594_begin_0"), val = tensor<int32, [4]>([0, 4, 0, 0])];
+            tensor<int32, [4]> var_6594_end_0 = const()[name = string("op_6594_end_0"), val = tensor<int32, [4]>([1, 5, 128, 256])];
+            tensor<bool, [4]> var_6594_end_mask_0 = const()[name = string("op_6594_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_6594_cast_fp16 = slice_by_index(begin = var_6594_begin_0, end = var_6594_end_0, end_mask = var_6594_end_mask_0, x = key_heads_65_cast_fp16)[name = string("op_6594_cast_fp16")];
+            tensor<int32, [4]> var_6598_begin_0 = const()[name = string("op_6598_begin_0"), val = tensor<int32, [4]>([0, 4, 0, 0])];
+            tensor<int32, [4]> var_6598_end_0 = const()[name = string("op_6598_end_0"), val = tensor<int32, [4]>([1, 5, 128, 256])];
+            tensor<bool, [4]> var_6598_end_mask_0 = const()[name = string("op_6598_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_6598_cast_fp16 = slice_by_index(begin = var_6598_begin_0, end = var_6598_end_0, end_mask = var_6598_end_mask_0, x = value_heads_65_cast_fp16)[name = string("op_6598_cast_fp16")];
+            tensor<int32, [4]> var_6610_begin_0 = const()[name = string("op_6610_begin_0"), val = tensor<int32, [4]>([0, 5, 0, 0])];
+            tensor<int32, [4]> var_6610_end_0 = const()[name = string("op_6610_end_0"), val = tensor<int32, [4]>([1, 6, 128, 256])];
+            tensor<bool, [4]> var_6610_end_mask_0 = const()[name = string("op_6610_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_6610_cast_fp16 = slice_by_index(begin = var_6610_begin_0, end = var_6610_end_0, end_mask = var_6610_end_mask_0, x = key_heads_65_cast_fp16)[name = string("op_6610_cast_fp16")];
+            tensor<int32, [4]> var_6614_begin_0 = const()[name = string("op_6614_begin_0"), val = tensor<int32, [4]>([0, 5, 0, 0])];
+            tensor<int32, [4]> var_6614_end_0 = const()[name = string("op_6614_end_0"), val = tensor<int32, [4]>([1, 6, 128, 256])];
+            tensor<bool, [4]> var_6614_end_mask_0 = const()[name = string("op_6614_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_6614_cast_fp16 = slice_by_index(begin = var_6614_begin_0, end = var_6614_end_0, end_mask = var_6614_end_mask_0, x = value_heads_65_cast_fp16)[name = string("op_6614_cast_fp16")];
+            tensor<int32, [4]> var_6626_begin_0 = const()[name = string("op_6626_begin_0"), val = tensor<int32, [4]>([0, 6, 0, 0])];
+            tensor<int32, [4]> var_6626_end_0 = const()[name = string("op_6626_end_0"), val = tensor<int32, [4]>([1, 7, 128, 256])];
+            tensor<bool, [4]> var_6626_end_mask_0 = const()[name = string("op_6626_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_6626_cast_fp16 = slice_by_index(begin = var_6626_begin_0, end = var_6626_end_0, end_mask = var_6626_end_mask_0, x = key_heads_65_cast_fp16)[name = string("op_6626_cast_fp16")];
+            tensor<int32, [4]> var_6630_begin_0 = const()[name = string("op_6630_begin_0"), val = tensor<int32, [4]>([0, 6, 0, 0])];
+            tensor<int32, [4]> var_6630_end_0 = const()[name = string("op_6630_end_0"), val = tensor<int32, [4]>([1, 7, 128, 256])];
+            tensor<bool, [4]> var_6630_end_mask_0 = const()[name = string("op_6630_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_6630_cast_fp16 = slice_by_index(begin = var_6630_begin_0, end = var_6630_end_0, end_mask = var_6630_end_mask_0, x = value_heads_65_cast_fp16)[name = string("op_6630_cast_fp16")];
+            tensor<int32, [4]> var_6642_begin_0 = const()[name = string("op_6642_begin_0"), val = tensor<int32, [4]>([0, 7, 0, 0])];
+            tensor<int32, [4]> var_6642_end_0 = const()[name = string("op_6642_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_6642_end_mask_0 = const()[name = string("op_6642_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_6642_cast_fp16 = slice_by_index(begin = var_6642_begin_0, end = var_6642_end_0, end_mask = var_6642_end_mask_0, x = key_heads_65_cast_fp16)[name = string("op_6642_cast_fp16")];
+            tensor<int32, [4]> var_6646_begin_0 = const()[name = string("op_6646_begin_0"), val = tensor<int32, [4]>([0, 7, 0, 0])];
+            tensor<int32, [4]> var_6646_end_0 = const()[name = string("op_6646_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_6646_end_mask_0 = const()[name = string("op_6646_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_6646_cast_fp16 = slice_by_index(begin = var_6646_begin_0, end = var_6646_end_0, end_mask = var_6646_end_mask_0, x = value_heads_65_cast_fp16)[name = string("op_6646_cast_fp16")];
+            bool key_heads_67_interleave_0 = const()[name = string("key_heads_67_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 256]> key_heads_67_cast_fp16 = concat(axis = var_6372, interleave = key_heads_67_interleave_0, values = (var_6530_cast_fp16, var_6530_cast_fp16, var_6546_cast_fp16, var_6546_cast_fp16, var_6562_cast_fp16, var_6562_cast_fp16, var_6578_cast_fp16, var_6578_cast_fp16, var_6594_cast_fp16, var_6594_cast_fp16, var_6610_cast_fp16, var_6610_cast_fp16, var_6626_cast_fp16, var_6626_cast_fp16, var_6642_cast_fp16, var_6642_cast_fp16))[name = string("key_heads_67_cast_fp16")];
+            bool value_heads_67_interleave_0 = const()[name = string("value_heads_67_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 256]> value_heads_67_cast_fp16 = concat(axis = var_6372, interleave = value_heads_67_interleave_0, values = (var_6534_cast_fp16, var_6534_cast_fp16, var_6550_cast_fp16, var_6550_cast_fp16, var_6566_cast_fp16, var_6566_cast_fp16, var_6582_cast_fp16, var_6582_cast_fp16, var_6598_cast_fp16, var_6598_cast_fp16, var_6614_cast_fp16, var_6614_cast_fp16, var_6630_cast_fp16, var_6630_cast_fp16, var_6646_cast_fp16, var_6646_cast_fp16))[name = string("value_heads_67_cast_fp16")];
+            fp16 var_6669_to_fp16 = const()[name = string("op_6669_to_fp16"), val = fp16(0x1.6ap-4)];
+            tensor<fp16, [1, 16, 128, 1]> var_6670_cast_fp16 = mul(x = mh_q_99_cast_fp16, y = var_6669_to_fp16)[name = string("op_6670_cast_fp16")];
+            bool mh_w_65_transpose_x_0 = const()[name = string("mh_w_65_transpose_x_0"), val = bool(true)];
+            bool mh_w_65_transpose_y_0 = const()[name = string("mh_w_65_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 1, 256]> mh_w_65_cast_fp16 = matmul(transpose_x = mh_w_65_transpose_x_0, transpose_y = mh_w_65_transpose_y_0, x = var_6670_cast_fp16, y = key_heads_67_cast_fp16)[name = string("mh_w_65_cast_fp16")];
+            tensor<fp16, [1, 16, 1, 256]> mh_w_67_cast_fp16 = add(x = mh_w_65_cast_fp16, y = var_487_cast_fp16)[name = string("mh_w_67_cast_fp16")];
+            tensor<fp16, [1, 16, 1, 256]> var_6682_cast_fp16 = softmax(axis = var_6354, x = mh_w_67_cast_fp16)[name = string("op_6682_cast_fp16")];
+            bool attn_33_transpose_x_0 = const()[name = string("attn_33_transpose_x_0"), val = bool(false)];
+            bool attn_33_transpose_y_0 = const()[name = string("attn_33_transpose_y_0"), val = bool(true)];
+            tensor<fp16, [1, 16, 128, 1]> attn_33_cast_fp16 = matmul(transpose_x = attn_33_transpose_x_0, transpose_y = attn_33_transpose_y_0, x = value_heads_67_cast_fp16, y = var_6682_cast_fp16)[name = string("attn_33_cast_fp16")];
+            tensor<int32, [4]> var_6687 = const()[name = string("op_6687"), val = tensor<int32, [4]>([1, -1, 1, 1])];
+            tensor<fp16, [1, 2048, 1, 1]> input_129_cast_fp16 = reshape(shape = var_6687, x = attn_33_cast_fp16)[name = string("input_129_cast_fp16")];
+            string obj_139_pad_type_0 = const()[name = string("obj_139_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> obj_139_strides_0 = const()[name = string("obj_139_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> obj_139_pad_0 = const()[name = string("obj_139_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> obj_139_dilations_0 = const()[name = string("obj_139_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 obj_139_groups_0 = const()[name = string("obj_139_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 2048, 1, 1]> layers_16_self_attn_o_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(256080256))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(258177472))))[name = string("layers_16_self_attn_o_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> obj_139_cast_fp16 = conv(bias = layers_0_self_attn_v_proj_bias_to_fp16, dilations = obj_139_dilations_0, groups = obj_139_groups_0, pad = obj_139_pad_0, pad_type = obj_139_pad_type_0, strides = obj_139_strides_0, weight = layers_16_self_attn_o_proj_weight_to_fp16_palettized, x = input_129_cast_fp16)[name = string("obj_139_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> inputs_133_cast_fp16 = add(x = inputs_127_cast_fp16, y = obj_139_cast_fp16)[name = string("inputs_133_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> inputs_sq_135_cast_fp16 = mul(x = inputs_133_cast_fp16, y = inputs_133_cast_fp16)[name = string("inputs_sq_135_cast_fp16")];
+            tensor<int32, [1]> variance_135_axes_0 = const()[name = string("variance_135_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_135_keep_dims_0 = const()[name = string("variance_135_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 1, 1, 1]> variance_135_cast_fp16 = reduce_mean(axes = variance_135_axes_0, keep_dims = variance_135_keep_dims_0, x = inputs_sq_135_cast_fp16)[name = string("variance_135_cast_fp16")];
+            fp16 var_6705_to_fp16 = const()[name = string("op_6705_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 1, 1]> var_6706_cast_fp16 = add(x = variance_135_cast_fp16, y = var_6705_to_fp16)[name = string("op_6706_cast_fp16")];
+            fp32 var_6707_epsilon_0 = const()[name = string("op_6707_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 1, 1, 1]> var_6707_cast_fp16 = rsqrt(epsilon = var_6707_epsilon_0, x = var_6706_cast_fp16)[name = string("op_6707_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> hidden_states_167_cast_fp16 = mul(x = inputs_133_cast_fp16, y = var_6707_cast_fp16)[name = string("hidden_states_167_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> w_135_to_fp16 = const()[name = string("w_135_to_fp16"), val = tensor<fp16, [1, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(258178048)))];
+            tensor<fp16, [1, 1024, 1, 1]> input_131_cast_fp16 = mul(x = w_135_to_fp16, y = hidden_states_167_cast_fp16)[name = string("input_131_cast_fp16")];
+            string input_133_pad_type_0 = const()[name = string("input_133_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> input_133_strides_0 = const()[name = string("input_133_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> input_133_pad_0 = const()[name = string("input_133_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> input_133_dilations_0 = const()[name = string("input_133_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 input_133_groups_0 = const()[name = string("input_133_groups_0"), val = int32(1)];
+            tensor<fp16, [3072, 1024, 1, 1]> layers_16_mlp_gate_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [3072, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(258180160))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(261325952))))[name = string("layers_16_mlp_gate_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 3072, 1, 1]> input_133_cast_fp16 = conv(dilations = input_133_dilations_0, groups = input_133_groups_0, pad = input_133_pad_0, pad_type = input_133_pad_type_0, strides = input_133_strides_0, weight = layers_16_mlp_gate_proj_weight_to_fp16_palettized, x = input_131_cast_fp16)[name = string("input_133_cast_fp16")];
+            tensor<fp16, [1, 3072, 1, 1]> var_6721_cast_fp16 = silu(x = input_133_cast_fp16)[name = string("op_6721_cast_fp16")];
+            string var_6727_pad_type_0 = const()[name = string("op_6727_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_6727_strides_0 = const()[name = string("op_6727_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_6727_pad_0 = const()[name = string("op_6727_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_6727_dilations_0 = const()[name = string("op_6727_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_6727_groups_0 = const()[name = string("op_6727_groups_0"), val = int32(1)];
+            tensor<fp16, [3072, 1024, 1, 1]> layers_16_mlp_up_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [3072, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(261326528))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(264472320))))[name = string("layers_16_mlp_up_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 3072, 1, 1]> var_6727_cast_fp16 = conv(dilations = var_6727_dilations_0, groups = var_6727_groups_0, pad = var_6727_pad_0, pad_type = var_6727_pad_type_0, strides = var_6727_strides_0, weight = layers_16_mlp_up_proj_weight_to_fp16_palettized, x = input_131_cast_fp16)[name = string("op_6727_cast_fp16")];
+            tensor<fp16, [1, 3072, 1, 1]> input_135_cast_fp16 = mul(x = var_6721_cast_fp16, y = var_6727_cast_fp16)[name = string("input_135_cast_fp16")];
+            string hidden_states_169_pad_type_0 = const()[name = string("hidden_states_169_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> hidden_states_169_strides_0 = const()[name = string("hidden_states_169_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> hidden_states_169_pad_0 = const()[name = string("hidden_states_169_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> hidden_states_169_dilations_0 = const()[name = string("hidden_states_169_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 hidden_states_169_groups_0 = const()[name = string("hidden_states_169_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 3072, 1, 1]> layers_16_mlp_down_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 3072, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(264472896))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(267618688))))[name = string("layers_16_mlp_down_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> hidden_states_169_cast_fp16 = conv(dilations = hidden_states_169_dilations_0, groups = hidden_states_169_groups_0, pad = hidden_states_169_pad_0, pad_type = hidden_states_169_pad_type_0, strides = hidden_states_169_strides_0, weight = layers_16_mlp_down_proj_weight_to_fp16_palettized, x = input_135_cast_fp16)[name = string("hidden_states_169_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> inputs_135_cast_fp16 = add(x = inputs_133_cast_fp16, y = hidden_states_169_cast_fp16)[name = string("inputs_135_cast_fp16")];
+            int32 var_6741 = const()[name = string("op_6741"), val = int32(3)];
+            int32 var_6751 = const()[name = string("op_6751"), val = int32(-2)];
+            int32 var_6759 = const()[name = string("op_6759"), val = int32(1)];
+            tensor<fp16, [1, 1024, 1, 1]> inputs_sq_137_cast_fp16 = mul(x = inputs_135_cast_fp16, y = inputs_135_cast_fp16)[name = string("inputs_sq_137_cast_fp16")];
+            tensor<int32, [1]> variance_137_axes_0 = const()[name = string("variance_137_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_137_keep_dims_0 = const()[name = string("variance_137_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 1, 1, 1]> variance_137_cast_fp16 = reduce_mean(axes = variance_137_axes_0, keep_dims = variance_137_keep_dims_0, x = inputs_sq_137_cast_fp16)[name = string("variance_137_cast_fp16")];
+            fp16 var_6771_to_fp16 = const()[name = string("op_6771_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 1, 1]> var_6772_cast_fp16 = add(x = variance_137_cast_fp16, y = var_6771_to_fp16)[name = string("op_6772_cast_fp16")];
+            fp32 var_6773_epsilon_0 = const()[name = string("op_6773_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 1, 1, 1]> var_6773_cast_fp16 = rsqrt(epsilon = var_6773_epsilon_0, x = var_6772_cast_fp16)[name = string("op_6773_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> hidden_states_171_cast_fp16 = mul(x = inputs_135_cast_fp16, y = var_6773_cast_fp16)[name = string("hidden_states_171_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> w_137_to_fp16 = const()[name = string("w_137_to_fp16"), val = tensor<fp16, [1, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(267619264)))];
+            tensor<fp16, [1, 1024, 1, 1]> obj_141_cast_fp16 = mul(x = w_137_to_fp16, y = hidden_states_171_cast_fp16)[name = string("obj_141_cast_fp16")];
+            string query_103_pad_type_0 = const()[name = string("query_103_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> query_103_strides_0 = const()[name = string("query_103_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> query_103_pad_0 = const()[name = string("query_103_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> query_103_dilations_0 = const()[name = string("query_103_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 query_103_groups_0 = const()[name = string("query_103_groups_0"), val = int32(1)];
+            tensor<fp16, [2048, 1024, 1, 1]> layers_17_self_attn_q_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [2048, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(267621376))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(269718592))))[name = string("layers_17_self_attn_q_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1, 1]> query_103_cast_fp16 = conv(bias = layers_0_self_attn_q_proj_bias_to_fp16, dilations = query_103_dilations_0, groups = query_103_groups_0, pad = query_103_pad_0, pad_type = query_103_pad_type_0, strides = query_103_strides_0, weight = layers_17_self_attn_q_proj_weight_to_fp16_palettized, x = obj_141_cast_fp16)[name = string("query_103_cast_fp16")];
+            string current_key_69_pad_type_0 = const()[name = string("current_key_69_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> current_key_69_strides_0 = const()[name = string("current_key_69_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> current_key_69_pad_0 = const()[name = string("current_key_69_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> current_key_69_dilations_0 = const()[name = string("current_key_69_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 current_key_69_groups_0 = const()[name = string("current_key_69_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_17_self_attn_k_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(269719168))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(270767808))))[name = string("layers_17_self_attn_k_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> current_key_69_cast_fp16 = conv(dilations = current_key_69_dilations_0, groups = current_key_69_groups_0, pad = current_key_69_pad_0, pad_type = current_key_69_pad_type_0, strides = current_key_69_strides_0, weight = layers_17_self_attn_k_proj_weight_to_fp16_palettized, x = obj_141_cast_fp16)[name = string("current_key_69_cast_fp16")];
+            string current_value_35_pad_type_0 = const()[name = string("current_value_35_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> current_value_35_strides_0 = const()[name = string("current_value_35_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> current_value_35_pad_0 = const()[name = string("current_value_35_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> current_value_35_dilations_0 = const()[name = string("current_value_35_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 current_value_35_groups_0 = const()[name = string("current_value_35_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_17_self_attn_v_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(270768384))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(271817024))))[name = string("layers_17_self_attn_v_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> current_value_35_cast_fp16 = conv(bias = layers_0_self_attn_v_proj_bias_to_fp16, dilations = current_value_35_dilations_0, groups = current_value_35_groups_0, pad = current_value_35_pad_0, pad_type = current_value_35_pad_type_0, strides = current_value_35_strides_0, weight = layers_17_self_attn_v_proj_weight_to_fp16_palettized, x = obj_141_cast_fp16)[name = string("current_value_35_cast_fp16")];
+            tensor<int32, [4]> var_6810 = const()[name = string("op_6810"), val = tensor<int32, [4]>([16, 128, 1, 1])];
+            tensor<fp16, [16, 128, 1, 1]> inputs_137_cast_fp16 = reshape(shape = var_6810, x = query_103_cast_fp16)[name = string("inputs_137_cast_fp16")];
+            tensor<fp16, [16, 128, 1, 1]> inputs_sq_139_cast_fp16 = mul(x = inputs_137_cast_fp16, y = inputs_137_cast_fp16)[name = string("inputs_sq_139_cast_fp16")];
+            tensor<int32, [1]> variance_139_axes_0 = const()[name = string("variance_139_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_139_keep_dims_0 = const()[name = string("variance_139_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [16, 1, 1, 1]> variance_139_cast_fp16 = reduce_mean(axes = variance_139_axes_0, keep_dims = variance_139_keep_dims_0, x = inputs_sq_139_cast_fp16)[name = string("variance_139_cast_fp16")];
+            fp16 var_6816_to_fp16 = const()[name = string("op_6816_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [16, 1, 1, 1]> var_6817_cast_fp16 = add(x = variance_139_cast_fp16, y = var_6816_to_fp16)[name = string("op_6817_cast_fp16")];
+            fp32 var_6818_epsilon_0 = const()[name = string("op_6818_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [16, 1, 1, 1]> var_6818_cast_fp16 = rsqrt(epsilon = var_6818_epsilon_0, x = var_6817_cast_fp16)[name = string("op_6818_cast_fp16")];
+            tensor<fp16, [16, 128, 1, 1]> hidden_states_173_cast_fp16 = mul(x = inputs_137_cast_fp16, y = var_6818_cast_fp16)[name = string("hidden_states_173_cast_fp16")];
+            tensor<fp16, [1, 128, 1, 1]> w_139_to_fp16 = const()[name = string("w_139_to_fp16"), val = tensor<fp16, [1, 128, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(271817600)))];
+            tensor<fp16, [16, 128, 1, 1]> query_normed_35_cast_fp16 = mul(x = w_139_to_fp16, y = hidden_states_173_cast_fp16)[name = string("query_normed_35_cast_fp16")];
+            tensor<int32, [4]> var_6826 = const()[name = string("op_6826"), val = tensor<int32, [4]>([8, 128, 1, 1])];
+            tensor<fp16, [8, 128, 1, 1]> inputs_139_cast_fp16 = reshape(shape = var_6826, x = current_key_69_cast_fp16)[name = string("inputs_139_cast_fp16")];
+            tensor<fp16, [8, 128, 1, 1]> inputs_sq_141_cast_fp16 = mul(x = inputs_139_cast_fp16, y = inputs_139_cast_fp16)[name = string("inputs_sq_141_cast_fp16")];
+            tensor<int32, [1]> variance_141_axes_0 = const()[name = string("variance_141_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_141_keep_dims_0 = const()[name = string("variance_141_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [8, 1, 1, 1]> variance_141_cast_fp16 = reduce_mean(axes = variance_141_axes_0, keep_dims = variance_141_keep_dims_0, x = inputs_sq_141_cast_fp16)[name = string("variance_141_cast_fp16")];
+            fp16 var_6832_to_fp16 = const()[name = string("op_6832_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [8, 1, 1, 1]> var_6833_cast_fp16 = add(x = variance_141_cast_fp16, y = var_6832_to_fp16)[name = string("op_6833_cast_fp16")];
+            fp32 var_6834_epsilon_0 = const()[name = string("op_6834_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [8, 1, 1, 1]> var_6834_cast_fp16 = rsqrt(epsilon = var_6834_epsilon_0, x = var_6833_cast_fp16)[name = string("op_6834_cast_fp16")];
+            tensor<fp16, [8, 128, 1, 1]> hidden_states_175_cast_fp16 = mul(x = inputs_139_cast_fp16, y = var_6834_cast_fp16)[name = string("hidden_states_175_cast_fp16")];
+            tensor<fp16, [1, 128, 1, 1]> w_141_to_fp16 = const()[name = string("w_141_to_fp16"), val = tensor<fp16, [1, 128, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(271817920)))];
+            tensor<fp16, [8, 128, 1, 1]> current_key_normed_35_cast_fp16 = mul(x = w_141_to_fp16, y = hidden_states_175_cast_fp16)[name = string("current_key_normed_35_cast_fp16")];
+            tensor<int32, [4]> var_6852 = const()[name = string("op_6852"), val = tensor<int32, [4]>([1, 16, 128, -1])];
+            tensor<fp16, [1, 16, 128, 1]> mh_q_103_cast_fp16 = reshape(shape = var_6852, x = query_normed_35_cast_fp16)[name = string("mh_q_103_cast_fp16")];
+            tensor<int32, [4]> var_6854 = const()[name = string("op_6854"), val = tensor<int32, [4]>([1, 8, 128, -1])];
+            tensor<fp16, [1, 8, 128, 1]> mh_k_69_cast_fp16 = reshape(shape = var_6854, x = current_key_normed_35_cast_fp16)[name = string("mh_k_69_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> var_6858_cast_fp16 = mul(x = mh_q_103_cast_fp16, y = cos_1_cast_fp16)[name = string("op_6858_cast_fp16")];
+            tensor<int32, [4]> var_6863_begin_0 = const()[name = string("op_6863_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_6863_end_0 = const()[name = string("op_6863_end_0"), val = tensor<int32, [4]>([1, 16, 64, 1])];
+            tensor<bool, [4]> var_6863_end_mask_0 = const()[name = string("op_6863_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 16, 64, 1]> var_6863_cast_fp16 = slice_by_index(begin = var_6863_begin_0, end = var_6863_end_0, end_mask = var_6863_end_mask_0, x = mh_q_103_cast_fp16)[name = string("op_6863_cast_fp16")];
+            tensor<int32, [4]> var_6869_begin_0 = const()[name = string("op_6869_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_6869_end_0 = const()[name = string("op_6869_end_0"), val = tensor<int32, [4]>([1, 16, 128, 1])];
+            tensor<bool, [4]> var_6869_end_mask_0 = const()[name = string("op_6869_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 16, 64, 1]> var_6869_cast_fp16 = slice_by_index(begin = var_6869_begin_0, end = var_6869_end_0, end_mask = var_6869_end_mask_0, x = mh_q_103_cast_fp16)[name = string("op_6869_cast_fp16")];
+            fp16 const_408_promoted_to_fp16 = const()[name = string("const_408_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 16, 64, 1]> var_6871_cast_fp16 = mul(x = var_6869_cast_fp16, y = const_408_promoted_to_fp16)[name = string("op_6871_cast_fp16")];
+            bool var_6873_interleave_0 = const()[name = string("op_6873_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 1]> var_6873_cast_fp16 = concat(axis = var_6751, interleave = var_6873_interleave_0, values = (var_6871_cast_fp16, var_6863_cast_fp16))[name = string("op_6873_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> var_6874_cast_fp16 = mul(x = var_6873_cast_fp16, y = sin_1_cast_fp16)[name = string("op_6874_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> mh_q_105_cast_fp16 = add(x = var_6858_cast_fp16, y = var_6874_cast_fp16)[name = string("mh_q_105_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> var_6876_cast_fp16 = mul(x = mh_k_69_cast_fp16, y = cos_1_cast_fp16)[name = string("op_6876_cast_fp16")];
+            tensor<int32, [4]> var_6881_begin_0 = const()[name = string("op_6881_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_6881_end_0 = const()[name = string("op_6881_end_0"), val = tensor<int32, [4]>([1, 8, 64, 1])];
+            tensor<bool, [4]> var_6881_end_mask_0 = const()[name = string("op_6881_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 8, 64, 1]> var_6881_cast_fp16 = slice_by_index(begin = var_6881_begin_0, end = var_6881_end_0, end_mask = var_6881_end_mask_0, x = mh_k_69_cast_fp16)[name = string("op_6881_cast_fp16")];
+            tensor<int32, [4]> var_6887_begin_0 = const()[name = string("op_6887_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_6887_end_0 = const()[name = string("op_6887_end_0"), val = tensor<int32, [4]>([1, 8, 128, 1])];
+            tensor<bool, [4]> var_6887_end_mask_0 = const()[name = string("op_6887_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 8, 64, 1]> var_6887_cast_fp16 = slice_by_index(begin = var_6887_begin_0, end = var_6887_end_0, end_mask = var_6887_end_mask_0, x = mh_k_69_cast_fp16)[name = string("op_6887_cast_fp16")];
+            fp16 const_411_promoted_to_fp16 = const()[name = string("const_411_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 64, 1]> var_6889_cast_fp16 = mul(x = var_6887_cast_fp16, y = const_411_promoted_to_fp16)[name = string("op_6889_cast_fp16")];
+            bool var_6891_interleave_0 = const()[name = string("op_6891_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 128, 1]> var_6891_cast_fp16 = concat(axis = var_6751, interleave = var_6891_interleave_0, values = (var_6889_cast_fp16, var_6881_cast_fp16))[name = string("op_6891_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> var_6892_cast_fp16 = mul(x = var_6891_cast_fp16, y = sin_1_cast_fp16)[name = string("op_6892_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> mh_k_71_cast_fp16 = add(x = var_6876_cast_fp16, y = var_6892_cast_fp16)[name = string("mh_k_71_cast_fp16")];
+            tensor<int32, [4]> var_6896 = const()[name = string("op_6896"), val = tensor<int32, [4]>([1, 1024, 1, 1])];
+            tensor<fp16, [1, 1024, 1, 1]> current_key_71_cast_fp16 = reshape(shape = var_6896, x = mh_k_71_cast_fp16)[name = string("current_key_71_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_6903_cast_fp16 = mul(x = var_101_cast_fp16_17, y = var_323_cast_fp16)[name = string("op_6903_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_6904_cast_fp16 = mul(x = current_key_71_cast_fp16, y = var_321_cast_fp16)[name = string("op_6904_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> key_105_cast_fp16 = add(x = var_6903_cast_fp16, y = var_6904_cast_fp16)[name = string("key_105_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_6907_cast_fp16 = mul(x = var_132_cast_fp16_17, y = var_323_cast_fp16)[name = string("op_6907_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_6908_cast_fp16 = mul(x = current_value_35_cast_fp16, y = var_321_cast_fp16)[name = string("op_6908_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> value_69_cast_fp16 = add(x = var_6907_cast_fp16, y = var_6908_cast_fp16)[name = string("value_69_cast_fp16")];
+            tensor<int32, [4]> var_6912 = const()[name = string("op_6912"), val = tensor<int32, [4]>([1, 8, 128, 256])];
+            tensor<fp16, [1, 8, 128, 256]> key_heads_69_cast_fp16 = reshape(shape = var_6912, x = key_105_cast_fp16)[name = string("key_heads_69_cast_fp16")];
+            tensor<int32, [4]> var_6914 = const()[name = string("op_6914"), val = tensor<int32, [4]>([1, 8, 128, 256])];
+            tensor<fp16, [1, 8, 128, 256]> value_heads_69_cast_fp16 = reshape(shape = var_6914, x = value_69_cast_fp16)[name = string("value_heads_69_cast_fp16")];
+            tensor<int32, [4]> var_6917_begin_0 = const()[name = string("op_6917_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_6917_end_0 = const()[name = string("op_6917_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_6917_end_mask_0 = const()[name = string("op_6917_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_6917_cast_fp16 = slice_by_index(begin = var_6917_begin_0, end = var_6917_end_0, end_mask = var_6917_end_mask_0, x = key_heads_69_cast_fp16)[name = string("op_6917_cast_fp16")];
+            tensor<int32, [4]> var_6921_begin_0 = const()[name = string("op_6921_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_6921_end_0 = const()[name = string("op_6921_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_6921_end_mask_0 = const()[name = string("op_6921_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_6921_cast_fp16 = slice_by_index(begin = var_6921_begin_0, end = var_6921_end_0, end_mask = var_6921_end_mask_0, x = value_heads_69_cast_fp16)[name = string("op_6921_cast_fp16")];
+            tensor<int32, [4]> var_6933_begin_0 = const()[name = string("op_6933_begin_0"), val = tensor<int32, [4]>([0, 1, 0, 0])];
+            tensor<int32, [4]> var_6933_end_0 = const()[name = string("op_6933_end_0"), val = tensor<int32, [4]>([1, 2, 128, 256])];
+            tensor<bool, [4]> var_6933_end_mask_0 = const()[name = string("op_6933_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_6933_cast_fp16 = slice_by_index(begin = var_6933_begin_0, end = var_6933_end_0, end_mask = var_6933_end_mask_0, x = key_heads_69_cast_fp16)[name = string("op_6933_cast_fp16")];
+            tensor<int32, [4]> var_6937_begin_0 = const()[name = string("op_6937_begin_0"), val = tensor<int32, [4]>([0, 1, 0, 0])];
+            tensor<int32, [4]> var_6937_end_0 = const()[name = string("op_6937_end_0"), val = tensor<int32, [4]>([1, 2, 128, 256])];
+            tensor<bool, [4]> var_6937_end_mask_0 = const()[name = string("op_6937_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_6937_cast_fp16 = slice_by_index(begin = var_6937_begin_0, end = var_6937_end_0, end_mask = var_6937_end_mask_0, x = value_heads_69_cast_fp16)[name = string("op_6937_cast_fp16")];
+            tensor<int32, [4]> var_6949_begin_0 = const()[name = string("op_6949_begin_0"), val = tensor<int32, [4]>([0, 2, 0, 0])];
+            tensor<int32, [4]> var_6949_end_0 = const()[name = string("op_6949_end_0"), val = tensor<int32, [4]>([1, 3, 128, 256])];
+            tensor<bool, [4]> var_6949_end_mask_0 = const()[name = string("op_6949_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_6949_cast_fp16 = slice_by_index(begin = var_6949_begin_0, end = var_6949_end_0, end_mask = var_6949_end_mask_0, x = key_heads_69_cast_fp16)[name = string("op_6949_cast_fp16")];
+            tensor<int32, [4]> var_6953_begin_0 = const()[name = string("op_6953_begin_0"), val = tensor<int32, [4]>([0, 2, 0, 0])];
+            tensor<int32, [4]> var_6953_end_0 = const()[name = string("op_6953_end_0"), val = tensor<int32, [4]>([1, 3, 128, 256])];
+            tensor<bool, [4]> var_6953_end_mask_0 = const()[name = string("op_6953_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_6953_cast_fp16 = slice_by_index(begin = var_6953_begin_0, end = var_6953_end_0, end_mask = var_6953_end_mask_0, x = value_heads_69_cast_fp16)[name = string("op_6953_cast_fp16")];
+            tensor<int32, [4]> var_6965_begin_0 = const()[name = string("op_6965_begin_0"), val = tensor<int32, [4]>([0, 3, 0, 0])];
+            tensor<int32, [4]> var_6965_end_0 = const()[name = string("op_6965_end_0"), val = tensor<int32, [4]>([1, 4, 128, 256])];
+            tensor<bool, [4]> var_6965_end_mask_0 = const()[name = string("op_6965_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_6965_cast_fp16 = slice_by_index(begin = var_6965_begin_0, end = var_6965_end_0, end_mask = var_6965_end_mask_0, x = key_heads_69_cast_fp16)[name = string("op_6965_cast_fp16")];
+            tensor<int32, [4]> var_6969_begin_0 = const()[name = string("op_6969_begin_0"), val = tensor<int32, [4]>([0, 3, 0, 0])];
+            tensor<int32, [4]> var_6969_end_0 = const()[name = string("op_6969_end_0"), val = tensor<int32, [4]>([1, 4, 128, 256])];
+            tensor<bool, [4]> var_6969_end_mask_0 = const()[name = string("op_6969_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_6969_cast_fp16 = slice_by_index(begin = var_6969_begin_0, end = var_6969_end_0, end_mask = var_6969_end_mask_0, x = value_heads_69_cast_fp16)[name = string("op_6969_cast_fp16")];
+            tensor<int32, [4]> var_6981_begin_0 = const()[name = string("op_6981_begin_0"), val = tensor<int32, [4]>([0, 4, 0, 0])];
+            tensor<int32, [4]> var_6981_end_0 = const()[name = string("op_6981_end_0"), val = tensor<int32, [4]>([1, 5, 128, 256])];
+            tensor<bool, [4]> var_6981_end_mask_0 = const()[name = string("op_6981_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_6981_cast_fp16 = slice_by_index(begin = var_6981_begin_0, end = var_6981_end_0, end_mask = var_6981_end_mask_0, x = key_heads_69_cast_fp16)[name = string("op_6981_cast_fp16")];
+            tensor<int32, [4]> var_6985_begin_0 = const()[name = string("op_6985_begin_0"), val = tensor<int32, [4]>([0, 4, 0, 0])];
+            tensor<int32, [4]> var_6985_end_0 = const()[name = string("op_6985_end_0"), val = tensor<int32, [4]>([1, 5, 128, 256])];
+            tensor<bool, [4]> var_6985_end_mask_0 = const()[name = string("op_6985_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_6985_cast_fp16 = slice_by_index(begin = var_6985_begin_0, end = var_6985_end_0, end_mask = var_6985_end_mask_0, x = value_heads_69_cast_fp16)[name = string("op_6985_cast_fp16")];
+            tensor<int32, [4]> var_6997_begin_0 = const()[name = string("op_6997_begin_0"), val = tensor<int32, [4]>([0, 5, 0, 0])];
+            tensor<int32, [4]> var_6997_end_0 = const()[name = string("op_6997_end_0"), val = tensor<int32, [4]>([1, 6, 128, 256])];
+            tensor<bool, [4]> var_6997_end_mask_0 = const()[name = string("op_6997_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_6997_cast_fp16 = slice_by_index(begin = var_6997_begin_0, end = var_6997_end_0, end_mask = var_6997_end_mask_0, x = key_heads_69_cast_fp16)[name = string("op_6997_cast_fp16")];
+            tensor<int32, [4]> var_7001_begin_0 = const()[name = string("op_7001_begin_0"), val = tensor<int32, [4]>([0, 5, 0, 0])];
+            tensor<int32, [4]> var_7001_end_0 = const()[name = string("op_7001_end_0"), val = tensor<int32, [4]>([1, 6, 128, 256])];
+            tensor<bool, [4]> var_7001_end_mask_0 = const()[name = string("op_7001_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_7001_cast_fp16 = slice_by_index(begin = var_7001_begin_0, end = var_7001_end_0, end_mask = var_7001_end_mask_0, x = value_heads_69_cast_fp16)[name = string("op_7001_cast_fp16")];
+            tensor<int32, [4]> var_7013_begin_0 = const()[name = string("op_7013_begin_0"), val = tensor<int32, [4]>([0, 6, 0, 0])];
+            tensor<int32, [4]> var_7013_end_0 = const()[name = string("op_7013_end_0"), val = tensor<int32, [4]>([1, 7, 128, 256])];
+            tensor<bool, [4]> var_7013_end_mask_0 = const()[name = string("op_7013_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_7013_cast_fp16 = slice_by_index(begin = var_7013_begin_0, end = var_7013_end_0, end_mask = var_7013_end_mask_0, x = key_heads_69_cast_fp16)[name = string("op_7013_cast_fp16")];
+            tensor<int32, [4]> var_7017_begin_0 = const()[name = string("op_7017_begin_0"), val = tensor<int32, [4]>([0, 6, 0, 0])];
+            tensor<int32, [4]> var_7017_end_0 = const()[name = string("op_7017_end_0"), val = tensor<int32, [4]>([1, 7, 128, 256])];
+            tensor<bool, [4]> var_7017_end_mask_0 = const()[name = string("op_7017_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_7017_cast_fp16 = slice_by_index(begin = var_7017_begin_0, end = var_7017_end_0, end_mask = var_7017_end_mask_0, x = value_heads_69_cast_fp16)[name = string("op_7017_cast_fp16")];
+            tensor<int32, [4]> var_7029_begin_0 = const()[name = string("op_7029_begin_0"), val = tensor<int32, [4]>([0, 7, 0, 0])];
+            tensor<int32, [4]> var_7029_end_0 = const()[name = string("op_7029_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_7029_end_mask_0 = const()[name = string("op_7029_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_7029_cast_fp16 = slice_by_index(begin = var_7029_begin_0, end = var_7029_end_0, end_mask = var_7029_end_mask_0, x = key_heads_69_cast_fp16)[name = string("op_7029_cast_fp16")];
+            tensor<int32, [4]> var_7033_begin_0 = const()[name = string("op_7033_begin_0"), val = tensor<int32, [4]>([0, 7, 0, 0])];
+            tensor<int32, [4]> var_7033_end_0 = const()[name = string("op_7033_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_7033_end_mask_0 = const()[name = string("op_7033_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_7033_cast_fp16 = slice_by_index(begin = var_7033_begin_0, end = var_7033_end_0, end_mask = var_7033_end_mask_0, x = value_heads_69_cast_fp16)[name = string("op_7033_cast_fp16")];
+            bool key_heads_71_interleave_0 = const()[name = string("key_heads_71_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 256]> key_heads_71_cast_fp16 = concat(axis = var_6759, interleave = key_heads_71_interleave_0, values = (var_6917_cast_fp16, var_6917_cast_fp16, var_6933_cast_fp16, var_6933_cast_fp16, var_6949_cast_fp16, var_6949_cast_fp16, var_6965_cast_fp16, var_6965_cast_fp16, var_6981_cast_fp16, var_6981_cast_fp16, var_6997_cast_fp16, var_6997_cast_fp16, var_7013_cast_fp16, var_7013_cast_fp16, var_7029_cast_fp16, var_7029_cast_fp16))[name = string("key_heads_71_cast_fp16")];
+            bool value_heads_71_interleave_0 = const()[name = string("value_heads_71_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 256]> value_heads_71_cast_fp16 = concat(axis = var_6759, interleave = value_heads_71_interleave_0, values = (var_6921_cast_fp16, var_6921_cast_fp16, var_6937_cast_fp16, var_6937_cast_fp16, var_6953_cast_fp16, var_6953_cast_fp16, var_6969_cast_fp16, var_6969_cast_fp16, var_6985_cast_fp16, var_6985_cast_fp16, var_7001_cast_fp16, var_7001_cast_fp16, var_7017_cast_fp16, var_7017_cast_fp16, var_7033_cast_fp16, var_7033_cast_fp16))[name = string("value_heads_71_cast_fp16")];
+            fp16 var_7056_to_fp16 = const()[name = string("op_7056_to_fp16"), val = fp16(0x1.6ap-4)];
+            tensor<fp16, [1, 16, 128, 1]> var_7057_cast_fp16 = mul(x = mh_q_105_cast_fp16, y = var_7056_to_fp16)[name = string("op_7057_cast_fp16")];
+            bool mh_w_69_transpose_x_0 = const()[name = string("mh_w_69_transpose_x_0"), val = bool(true)];
+            bool mh_w_69_transpose_y_0 = const()[name = string("mh_w_69_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 1, 256]> mh_w_69_cast_fp16 = matmul(transpose_x = mh_w_69_transpose_x_0, transpose_y = mh_w_69_transpose_y_0, x = var_7057_cast_fp16, y = key_heads_71_cast_fp16)[name = string("mh_w_69_cast_fp16")];
+            tensor<fp16, [1, 16, 1, 256]> mh_w_71_cast_fp16 = add(x = mh_w_69_cast_fp16, y = var_487_cast_fp16)[name = string("mh_w_71_cast_fp16")];
+            tensor<fp16, [1, 16, 1, 256]> var_7069_cast_fp16 = softmax(axis = var_6741, x = mh_w_71_cast_fp16)[name = string("op_7069_cast_fp16")];
+            bool attn_35_transpose_x_0 = const()[name = string("attn_35_transpose_x_0"), val = bool(false)];
+            bool attn_35_transpose_y_0 = const()[name = string("attn_35_transpose_y_0"), val = bool(true)];
+            tensor<fp16, [1, 16, 128, 1]> attn_35_cast_fp16 = matmul(transpose_x = attn_35_transpose_x_0, transpose_y = attn_35_transpose_y_0, x = value_heads_71_cast_fp16, y = var_7069_cast_fp16)[name = string("attn_35_cast_fp16")];
+            tensor<int32, [4]> var_7074 = const()[name = string("op_7074"), val = tensor<int32, [4]>([1, -1, 1, 1])];
+            tensor<fp16, [1, 2048, 1, 1]> input_137_cast_fp16 = reshape(shape = var_7074, x = attn_35_cast_fp16)[name = string("input_137_cast_fp16")];
+            string obj_147_pad_type_0 = const()[name = string("obj_147_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> obj_147_strides_0 = const()[name = string("obj_147_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> obj_147_pad_0 = const()[name = string("obj_147_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> obj_147_dilations_0 = const()[name = string("obj_147_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 obj_147_groups_0 = const()[name = string("obj_147_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 2048, 1, 1]> layers_17_self_attn_o_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(271818240))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(273915456))))[name = string("layers_17_self_attn_o_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> obj_147_cast_fp16 = conv(bias = layers_0_self_attn_v_proj_bias_to_fp16, dilations = obj_147_dilations_0, groups = obj_147_groups_0, pad = obj_147_pad_0, pad_type = obj_147_pad_type_0, strides = obj_147_strides_0, weight = layers_17_self_attn_o_proj_weight_to_fp16_palettized, x = input_137_cast_fp16)[name = string("obj_147_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> inputs_141_cast_fp16 = add(x = inputs_135_cast_fp16, y = obj_147_cast_fp16)[name = string("inputs_141_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> inputs_sq_143_cast_fp16 = mul(x = inputs_141_cast_fp16, y = inputs_141_cast_fp16)[name = string("inputs_sq_143_cast_fp16")];
+            tensor<int32, [1]> variance_143_axes_0 = const()[name = string("variance_143_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_143_keep_dims_0 = const()[name = string("variance_143_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 1, 1, 1]> variance_143_cast_fp16 = reduce_mean(axes = variance_143_axes_0, keep_dims = variance_143_keep_dims_0, x = inputs_sq_143_cast_fp16)[name = string("variance_143_cast_fp16")];
+            fp16 var_7092_to_fp16 = const()[name = string("op_7092_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 1, 1]> var_7093_cast_fp16 = add(x = variance_143_cast_fp16, y = var_7092_to_fp16)[name = string("op_7093_cast_fp16")];
+            fp32 var_7094_epsilon_0 = const()[name = string("op_7094_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 1, 1, 1]> var_7094_cast_fp16 = rsqrt(epsilon = var_7094_epsilon_0, x = var_7093_cast_fp16)[name = string("op_7094_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> hidden_states_177_cast_fp16 = mul(x = inputs_141_cast_fp16, y = var_7094_cast_fp16)[name = string("hidden_states_177_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> w_143_to_fp16 = const()[name = string("w_143_to_fp16"), val = tensor<fp16, [1, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(273916032)))];
+            tensor<fp16, [1, 1024, 1, 1]> input_139_cast_fp16 = mul(x = w_143_to_fp16, y = hidden_states_177_cast_fp16)[name = string("input_139_cast_fp16")];
+            string input_141_pad_type_0 = const()[name = string("input_141_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> input_141_strides_0 = const()[name = string("input_141_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> input_141_pad_0 = const()[name = string("input_141_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> input_141_dilations_0 = const()[name = string("input_141_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 input_141_groups_0 = const()[name = string("input_141_groups_0"), val = int32(1)];
+            tensor<fp16, [3072, 1024, 1, 1]> layers_17_mlp_gate_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [3072, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(273918144))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(277063936))))[name = string("layers_17_mlp_gate_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 3072, 1, 1]> input_141_cast_fp16 = conv(dilations = input_141_dilations_0, groups = input_141_groups_0, pad = input_141_pad_0, pad_type = input_141_pad_type_0, strides = input_141_strides_0, weight = layers_17_mlp_gate_proj_weight_to_fp16_palettized, x = input_139_cast_fp16)[name = string("input_141_cast_fp16")];
+            tensor<fp16, [1, 3072, 1, 1]> var_7108_cast_fp16 = silu(x = input_141_cast_fp16)[name = string("op_7108_cast_fp16")];
+            string var_7114_pad_type_0 = const()[name = string("op_7114_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_7114_strides_0 = const()[name = string("op_7114_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_7114_pad_0 = const()[name = string("op_7114_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_7114_dilations_0 = const()[name = string("op_7114_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_7114_groups_0 = const()[name = string("op_7114_groups_0"), val = int32(1)];
+            tensor<fp16, [3072, 1024, 1, 1]> layers_17_mlp_up_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [3072, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(277064512))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(280210304))))[name = string("layers_17_mlp_up_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 3072, 1, 1]> var_7114_cast_fp16 = conv(dilations = var_7114_dilations_0, groups = var_7114_groups_0, pad = var_7114_pad_0, pad_type = var_7114_pad_type_0, strides = var_7114_strides_0, weight = layers_17_mlp_up_proj_weight_to_fp16_palettized, x = input_139_cast_fp16)[name = string("op_7114_cast_fp16")];
+            tensor<fp16, [1, 3072, 1, 1]> input_143_cast_fp16 = mul(x = var_7108_cast_fp16, y = var_7114_cast_fp16)[name = string("input_143_cast_fp16")];
+            string hidden_states_179_pad_type_0 = const()[name = string("hidden_states_179_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> hidden_states_179_strides_0 = const()[name = string("hidden_states_179_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> hidden_states_179_pad_0 = const()[name = string("hidden_states_179_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> hidden_states_179_dilations_0 = const()[name = string("hidden_states_179_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 hidden_states_179_groups_0 = const()[name = string("hidden_states_179_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 3072, 1, 1]> layers_17_mlp_down_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 3072, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(280210880))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(283356672))))[name = string("layers_17_mlp_down_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> hidden_states_179_cast_fp16 = conv(dilations = hidden_states_179_dilations_0, groups = hidden_states_179_groups_0, pad = hidden_states_179_pad_0, pad_type = hidden_states_179_pad_type_0, strides = hidden_states_179_strides_0, weight = layers_17_mlp_down_proj_weight_to_fp16_palettized, x = input_143_cast_fp16)[name = string("hidden_states_179_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> inputs_143_cast_fp16 = add(x = inputs_141_cast_fp16, y = hidden_states_179_cast_fp16)[name = string("inputs_143_cast_fp16")];
+            int32 var_7128 = const()[name = string("op_7128"), val = int32(3)];
+            int32 var_7138 = const()[name = string("op_7138"), val = int32(-2)];
+            int32 var_7146 = const()[name = string("op_7146"), val = int32(1)];
+            tensor<fp16, [1, 1024, 1, 1]> inputs_sq_145_cast_fp16 = mul(x = inputs_143_cast_fp16, y = inputs_143_cast_fp16)[name = string("inputs_sq_145_cast_fp16")];
+            tensor<int32, [1]> variance_145_axes_0 = const()[name = string("variance_145_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_145_keep_dims_0 = const()[name = string("variance_145_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 1, 1, 1]> variance_145_cast_fp16 = reduce_mean(axes = variance_145_axes_0, keep_dims = variance_145_keep_dims_0, x = inputs_sq_145_cast_fp16)[name = string("variance_145_cast_fp16")];
+            fp16 var_7158_to_fp16 = const()[name = string("op_7158_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 1, 1]> var_7159_cast_fp16 = add(x = variance_145_cast_fp16, y = var_7158_to_fp16)[name = string("op_7159_cast_fp16")];
+            fp32 var_7160_epsilon_0 = const()[name = string("op_7160_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 1, 1, 1]> var_7160_cast_fp16 = rsqrt(epsilon = var_7160_epsilon_0, x = var_7159_cast_fp16)[name = string("op_7160_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> hidden_states_181_cast_fp16 = mul(x = inputs_143_cast_fp16, y = var_7160_cast_fp16)[name = string("hidden_states_181_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> w_145_to_fp16 = const()[name = string("w_145_to_fp16"), val = tensor<fp16, [1, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(283357248)))];
+            tensor<fp16, [1, 1024, 1, 1]> obj_149_cast_fp16 = mul(x = w_145_to_fp16, y = hidden_states_181_cast_fp16)[name = string("obj_149_cast_fp16")];
+            string query_109_pad_type_0 = const()[name = string("query_109_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> query_109_strides_0 = const()[name = string("query_109_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> query_109_pad_0 = const()[name = string("query_109_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> query_109_dilations_0 = const()[name = string("query_109_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 query_109_groups_0 = const()[name = string("query_109_groups_0"), val = int32(1)];
+            tensor<fp16, [2048, 1024, 1, 1]> layers_18_self_attn_q_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [2048, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(283359360))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(285456576))))[name = string("layers_18_self_attn_q_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1, 1]> query_109_cast_fp16 = conv(bias = layers_0_self_attn_q_proj_bias_to_fp16, dilations = query_109_dilations_0, groups = query_109_groups_0, pad = query_109_pad_0, pad_type = query_109_pad_type_0, strides = query_109_strides_0, weight = layers_18_self_attn_q_proj_weight_to_fp16_palettized, x = obj_149_cast_fp16)[name = string("query_109_cast_fp16")];
+            string current_key_73_pad_type_0 = const()[name = string("current_key_73_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> current_key_73_strides_0 = const()[name = string("current_key_73_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> current_key_73_pad_0 = const()[name = string("current_key_73_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> current_key_73_dilations_0 = const()[name = string("current_key_73_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 current_key_73_groups_0 = const()[name = string("current_key_73_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_18_self_attn_k_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(285457152))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(286505792))))[name = string("layers_18_self_attn_k_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> current_key_73_cast_fp16 = conv(dilations = current_key_73_dilations_0, groups = current_key_73_groups_0, pad = current_key_73_pad_0, pad_type = current_key_73_pad_type_0, strides = current_key_73_strides_0, weight = layers_18_self_attn_k_proj_weight_to_fp16_palettized, x = obj_149_cast_fp16)[name = string("current_key_73_cast_fp16")];
+            string current_value_37_pad_type_0 = const()[name = string("current_value_37_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> current_value_37_strides_0 = const()[name = string("current_value_37_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> current_value_37_pad_0 = const()[name = string("current_value_37_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> current_value_37_dilations_0 = const()[name = string("current_value_37_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 current_value_37_groups_0 = const()[name = string("current_value_37_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_18_self_attn_v_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(286506368))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(287555008))))[name = string("layers_18_self_attn_v_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> current_value_37_cast_fp16 = conv(bias = layers_0_self_attn_v_proj_bias_to_fp16, dilations = current_value_37_dilations_0, groups = current_value_37_groups_0, pad = current_value_37_pad_0, pad_type = current_value_37_pad_type_0, strides = current_value_37_strides_0, weight = layers_18_self_attn_v_proj_weight_to_fp16_palettized, x = obj_149_cast_fp16)[name = string("current_value_37_cast_fp16")];
+            tensor<int32, [4]> var_7197 = const()[name = string("op_7197"), val = tensor<int32, [4]>([16, 128, 1, 1])];
+            tensor<fp16, [16, 128, 1, 1]> inputs_145_cast_fp16 = reshape(shape = var_7197, x = query_109_cast_fp16)[name = string("inputs_145_cast_fp16")];
+            tensor<fp16, [16, 128, 1, 1]> inputs_sq_147_cast_fp16 = mul(x = inputs_145_cast_fp16, y = inputs_145_cast_fp16)[name = string("inputs_sq_147_cast_fp16")];
+            tensor<int32, [1]> variance_147_axes_0 = const()[name = string("variance_147_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_147_keep_dims_0 = const()[name = string("variance_147_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [16, 1, 1, 1]> variance_147_cast_fp16 = reduce_mean(axes = variance_147_axes_0, keep_dims = variance_147_keep_dims_0, x = inputs_sq_147_cast_fp16)[name = string("variance_147_cast_fp16")];
+            fp16 var_7203_to_fp16 = const()[name = string("op_7203_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [16, 1, 1, 1]> var_7204_cast_fp16 = add(x = variance_147_cast_fp16, y = var_7203_to_fp16)[name = string("op_7204_cast_fp16")];
+            fp32 var_7205_epsilon_0 = const()[name = string("op_7205_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [16, 1, 1, 1]> var_7205_cast_fp16 = rsqrt(epsilon = var_7205_epsilon_0, x = var_7204_cast_fp16)[name = string("op_7205_cast_fp16")];
+            tensor<fp16, [16, 128, 1, 1]> hidden_states_183_cast_fp16 = mul(x = inputs_145_cast_fp16, y = var_7205_cast_fp16)[name = string("hidden_states_183_cast_fp16")];
+            tensor<fp16, [1, 128, 1, 1]> w_147_to_fp16 = const()[name = string("w_147_to_fp16"), val = tensor<fp16, [1, 128, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(287555584)))];
+            tensor<fp16, [16, 128, 1, 1]> query_normed_37_cast_fp16 = mul(x = w_147_to_fp16, y = hidden_states_183_cast_fp16)[name = string("query_normed_37_cast_fp16")];
+            tensor<int32, [4]> var_7213 = const()[name = string("op_7213"), val = tensor<int32, [4]>([8, 128, 1, 1])];
+            tensor<fp16, [8, 128, 1, 1]> inputs_147_cast_fp16 = reshape(shape = var_7213, x = current_key_73_cast_fp16)[name = string("inputs_147_cast_fp16")];
+            tensor<fp16, [8, 128, 1, 1]> inputs_sq_149_cast_fp16 = mul(x = inputs_147_cast_fp16, y = inputs_147_cast_fp16)[name = string("inputs_sq_149_cast_fp16")];
+            tensor<int32, [1]> variance_149_axes_0 = const()[name = string("variance_149_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_149_keep_dims_0 = const()[name = string("variance_149_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [8, 1, 1, 1]> variance_149_cast_fp16 = reduce_mean(axes = variance_149_axes_0, keep_dims = variance_149_keep_dims_0, x = inputs_sq_149_cast_fp16)[name = string("variance_149_cast_fp16")];
+            fp16 var_7219_to_fp16 = const()[name = string("op_7219_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [8, 1, 1, 1]> var_7220_cast_fp16 = add(x = variance_149_cast_fp16, y = var_7219_to_fp16)[name = string("op_7220_cast_fp16")];
+            fp32 var_7221_epsilon_0 = const()[name = string("op_7221_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [8, 1, 1, 1]> var_7221_cast_fp16 = rsqrt(epsilon = var_7221_epsilon_0, x = var_7220_cast_fp16)[name = string("op_7221_cast_fp16")];
+            tensor<fp16, [8, 128, 1, 1]> hidden_states_185_cast_fp16 = mul(x = inputs_147_cast_fp16, y = var_7221_cast_fp16)[name = string("hidden_states_185_cast_fp16")];
+            tensor<fp16, [1, 128, 1, 1]> w_149_to_fp16 = const()[name = string("w_149_to_fp16"), val = tensor<fp16, [1, 128, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(287555904)))];
+            tensor<fp16, [8, 128, 1, 1]> current_key_normed_37_cast_fp16 = mul(x = w_149_to_fp16, y = hidden_states_185_cast_fp16)[name = string("current_key_normed_37_cast_fp16")];
+            tensor<int32, [4]> var_7239 = const()[name = string("op_7239"), val = tensor<int32, [4]>([1, 16, 128, -1])];
+            tensor<fp16, [1, 16, 128, 1]> mh_q_109_cast_fp16 = reshape(shape = var_7239, x = query_normed_37_cast_fp16)[name = string("mh_q_109_cast_fp16")];
+            tensor<int32, [4]> var_7241 = const()[name = string("op_7241"), val = tensor<int32, [4]>([1, 8, 128, -1])];
+            tensor<fp16, [1, 8, 128, 1]> mh_k_73_cast_fp16 = reshape(shape = var_7241, x = current_key_normed_37_cast_fp16)[name = string("mh_k_73_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> var_7245_cast_fp16 = mul(x = mh_q_109_cast_fp16, y = cos_1_cast_fp16)[name = string("op_7245_cast_fp16")];
+            tensor<int32, [4]> var_7250_begin_0 = const()[name = string("op_7250_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_7250_end_0 = const()[name = string("op_7250_end_0"), val = tensor<int32, [4]>([1, 16, 64, 1])];
+            tensor<bool, [4]> var_7250_end_mask_0 = const()[name = string("op_7250_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 16, 64, 1]> var_7250_cast_fp16 = slice_by_index(begin = var_7250_begin_0, end = var_7250_end_0, end_mask = var_7250_end_mask_0, x = mh_q_109_cast_fp16)[name = string("op_7250_cast_fp16")];
+            tensor<int32, [4]> var_7256_begin_0 = const()[name = string("op_7256_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_7256_end_0 = const()[name = string("op_7256_end_0"), val = tensor<int32, [4]>([1, 16, 128, 1])];
+            tensor<bool, [4]> var_7256_end_mask_0 = const()[name = string("op_7256_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 16, 64, 1]> var_7256_cast_fp16 = slice_by_index(begin = var_7256_begin_0, end = var_7256_end_0, end_mask = var_7256_end_mask_0, x = mh_q_109_cast_fp16)[name = string("op_7256_cast_fp16")];
+            fp16 const_431_promoted_to_fp16 = const()[name = string("const_431_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 16, 64, 1]> var_7258_cast_fp16 = mul(x = var_7256_cast_fp16, y = const_431_promoted_to_fp16)[name = string("op_7258_cast_fp16")];
+            bool var_7260_interleave_0 = const()[name = string("op_7260_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 1]> var_7260_cast_fp16 = concat(axis = var_7138, interleave = var_7260_interleave_0, values = (var_7258_cast_fp16, var_7250_cast_fp16))[name = string("op_7260_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> var_7261_cast_fp16 = mul(x = var_7260_cast_fp16, y = sin_1_cast_fp16)[name = string("op_7261_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> mh_q_111_cast_fp16 = add(x = var_7245_cast_fp16, y = var_7261_cast_fp16)[name = string("mh_q_111_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> var_7263_cast_fp16 = mul(x = mh_k_73_cast_fp16, y = cos_1_cast_fp16)[name = string("op_7263_cast_fp16")];
+            tensor<int32, [4]> var_7268_begin_0 = const()[name = string("op_7268_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_7268_end_0 = const()[name = string("op_7268_end_0"), val = tensor<int32, [4]>([1, 8, 64, 1])];
+            tensor<bool, [4]> var_7268_end_mask_0 = const()[name = string("op_7268_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 8, 64, 1]> var_7268_cast_fp16 = slice_by_index(begin = var_7268_begin_0, end = var_7268_end_0, end_mask = var_7268_end_mask_0, x = mh_k_73_cast_fp16)[name = string("op_7268_cast_fp16")];
+            tensor<int32, [4]> var_7274_begin_0 = const()[name = string("op_7274_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_7274_end_0 = const()[name = string("op_7274_end_0"), val = tensor<int32, [4]>([1, 8, 128, 1])];
+            tensor<bool, [4]> var_7274_end_mask_0 = const()[name = string("op_7274_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 8, 64, 1]> var_7274_cast_fp16 = slice_by_index(begin = var_7274_begin_0, end = var_7274_end_0, end_mask = var_7274_end_mask_0, x = mh_k_73_cast_fp16)[name = string("op_7274_cast_fp16")];
+            fp16 const_434_promoted_to_fp16 = const()[name = string("const_434_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 64, 1]> var_7276_cast_fp16 = mul(x = var_7274_cast_fp16, y = const_434_promoted_to_fp16)[name = string("op_7276_cast_fp16")];
+            bool var_7278_interleave_0 = const()[name = string("op_7278_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 128, 1]> var_7278_cast_fp16 = concat(axis = var_7138, interleave = var_7278_interleave_0, values = (var_7276_cast_fp16, var_7268_cast_fp16))[name = string("op_7278_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> var_7279_cast_fp16 = mul(x = var_7278_cast_fp16, y = sin_1_cast_fp16)[name = string("op_7279_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> mh_k_75_cast_fp16 = add(x = var_7263_cast_fp16, y = var_7279_cast_fp16)[name = string("mh_k_75_cast_fp16")];
+            tensor<int32, [4]> var_7283 = const()[name = string("op_7283"), val = tensor<int32, [4]>([1, 1024, 1, 1])];
+            tensor<fp16, [1, 1024, 1, 1]> current_key_75_cast_fp16 = reshape(shape = var_7283, x = mh_k_75_cast_fp16)[name = string("current_key_75_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_7290_cast_fp16 = mul(x = var_101_cast_fp16_18, y = var_323_cast_fp16)[name = string("op_7290_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_7291_cast_fp16 = mul(x = current_key_75_cast_fp16, y = var_321_cast_fp16)[name = string("op_7291_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> key_111_cast_fp16 = add(x = var_7290_cast_fp16, y = var_7291_cast_fp16)[name = string("key_111_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_7294_cast_fp16 = mul(x = var_132_cast_fp16_18, y = var_323_cast_fp16)[name = string("op_7294_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_7295_cast_fp16 = mul(x = current_value_37_cast_fp16, y = var_321_cast_fp16)[name = string("op_7295_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> value_73_cast_fp16 = add(x = var_7294_cast_fp16, y = var_7295_cast_fp16)[name = string("value_73_cast_fp16")];
+            tensor<int32, [4]> var_7299 = const()[name = string("op_7299"), val = tensor<int32, [4]>([1, 8, 128, 256])];
+            tensor<fp16, [1, 8, 128, 256]> key_heads_73_cast_fp16 = reshape(shape = var_7299, x = key_111_cast_fp16)[name = string("key_heads_73_cast_fp16")];
+            tensor<int32, [4]> var_7301 = const()[name = string("op_7301"), val = tensor<int32, [4]>([1, 8, 128, 256])];
+            tensor<fp16, [1, 8, 128, 256]> value_heads_73_cast_fp16 = reshape(shape = var_7301, x = value_73_cast_fp16)[name = string("value_heads_73_cast_fp16")];
+            tensor<int32, [4]> var_7304_begin_0 = const()[name = string("op_7304_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_7304_end_0 = const()[name = string("op_7304_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_7304_end_mask_0 = const()[name = string("op_7304_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_7304_cast_fp16 = slice_by_index(begin = var_7304_begin_0, end = var_7304_end_0, end_mask = var_7304_end_mask_0, x = key_heads_73_cast_fp16)[name = string("op_7304_cast_fp16")];
+            tensor<int32, [4]> var_7308_begin_0 = const()[name = string("op_7308_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_7308_end_0 = const()[name = string("op_7308_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_7308_end_mask_0 = const()[name = string("op_7308_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_7308_cast_fp16 = slice_by_index(begin = var_7308_begin_0, end = var_7308_end_0, end_mask = var_7308_end_mask_0, x = value_heads_73_cast_fp16)[name = string("op_7308_cast_fp16")];
+            tensor<int32, [4]> var_7320_begin_0 = const()[name = string("op_7320_begin_0"), val = tensor<int32, [4]>([0, 1, 0, 0])];
+            tensor<int32, [4]> var_7320_end_0 = const()[name = string("op_7320_end_0"), val = tensor<int32, [4]>([1, 2, 128, 256])];
+            tensor<bool, [4]> var_7320_end_mask_0 = const()[name = string("op_7320_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_7320_cast_fp16 = slice_by_index(begin = var_7320_begin_0, end = var_7320_end_0, end_mask = var_7320_end_mask_0, x = key_heads_73_cast_fp16)[name = string("op_7320_cast_fp16")];
+            tensor<int32, [4]> var_7324_begin_0 = const()[name = string("op_7324_begin_0"), val = tensor<int32, [4]>([0, 1, 0, 0])];
+            tensor<int32, [4]> var_7324_end_0 = const()[name = string("op_7324_end_0"), val = tensor<int32, [4]>([1, 2, 128, 256])];
+            tensor<bool, [4]> var_7324_end_mask_0 = const()[name = string("op_7324_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_7324_cast_fp16 = slice_by_index(begin = var_7324_begin_0, end = var_7324_end_0, end_mask = var_7324_end_mask_0, x = value_heads_73_cast_fp16)[name = string("op_7324_cast_fp16")];
+            tensor<int32, [4]> var_7336_begin_0 = const()[name = string("op_7336_begin_0"), val = tensor<int32, [4]>([0, 2, 0, 0])];
+            tensor<int32, [4]> var_7336_end_0 = const()[name = string("op_7336_end_0"), val = tensor<int32, [4]>([1, 3, 128, 256])];
+            tensor<bool, [4]> var_7336_end_mask_0 = const()[name = string("op_7336_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_7336_cast_fp16 = slice_by_index(begin = var_7336_begin_0, end = var_7336_end_0, end_mask = var_7336_end_mask_0, x = key_heads_73_cast_fp16)[name = string("op_7336_cast_fp16")];
+            tensor<int32, [4]> var_7340_begin_0 = const()[name = string("op_7340_begin_0"), val = tensor<int32, [4]>([0, 2, 0, 0])];
+            tensor<int32, [4]> var_7340_end_0 = const()[name = string("op_7340_end_0"), val = tensor<int32, [4]>([1, 3, 128, 256])];
+            tensor<bool, [4]> var_7340_end_mask_0 = const()[name = string("op_7340_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_7340_cast_fp16 = slice_by_index(begin = var_7340_begin_0, end = var_7340_end_0, end_mask = var_7340_end_mask_0, x = value_heads_73_cast_fp16)[name = string("op_7340_cast_fp16")];
+            tensor<int32, [4]> var_7352_begin_0 = const()[name = string("op_7352_begin_0"), val = tensor<int32, [4]>([0, 3, 0, 0])];
+            tensor<int32, [4]> var_7352_end_0 = const()[name = string("op_7352_end_0"), val = tensor<int32, [4]>([1, 4, 128, 256])];
+            tensor<bool, [4]> var_7352_end_mask_0 = const()[name = string("op_7352_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_7352_cast_fp16 = slice_by_index(begin = var_7352_begin_0, end = var_7352_end_0, end_mask = var_7352_end_mask_0, x = key_heads_73_cast_fp16)[name = string("op_7352_cast_fp16")];
+            tensor<int32, [4]> var_7356_begin_0 = const()[name = string("op_7356_begin_0"), val = tensor<int32, [4]>([0, 3, 0, 0])];
+            tensor<int32, [4]> var_7356_end_0 = const()[name = string("op_7356_end_0"), val = tensor<int32, [4]>([1, 4, 128, 256])];
+            tensor<bool, [4]> var_7356_end_mask_0 = const()[name = string("op_7356_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_7356_cast_fp16 = slice_by_index(begin = var_7356_begin_0, end = var_7356_end_0, end_mask = var_7356_end_mask_0, x = value_heads_73_cast_fp16)[name = string("op_7356_cast_fp16")];
+            tensor<int32, [4]> var_7368_begin_0 = const()[name = string("op_7368_begin_0"), val = tensor<int32, [4]>([0, 4, 0, 0])];
+            tensor<int32, [4]> var_7368_end_0 = const()[name = string("op_7368_end_0"), val = tensor<int32, [4]>([1, 5, 128, 256])];
+            tensor<bool, [4]> var_7368_end_mask_0 = const()[name = string("op_7368_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_7368_cast_fp16 = slice_by_index(begin = var_7368_begin_0, end = var_7368_end_0, end_mask = var_7368_end_mask_0, x = key_heads_73_cast_fp16)[name = string("op_7368_cast_fp16")];
+            tensor<int32, [4]> var_7372_begin_0 = const()[name = string("op_7372_begin_0"), val = tensor<int32, [4]>([0, 4, 0, 0])];
+            tensor<int32, [4]> var_7372_end_0 = const()[name = string("op_7372_end_0"), val = tensor<int32, [4]>([1, 5, 128, 256])];
+            tensor<bool, [4]> var_7372_end_mask_0 = const()[name = string("op_7372_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_7372_cast_fp16 = slice_by_index(begin = var_7372_begin_0, end = var_7372_end_0, end_mask = var_7372_end_mask_0, x = value_heads_73_cast_fp16)[name = string("op_7372_cast_fp16")];
+            tensor<int32, [4]> var_7384_begin_0 = const()[name = string("op_7384_begin_0"), val = tensor<int32, [4]>([0, 5, 0, 0])];
+            tensor<int32, [4]> var_7384_end_0 = const()[name = string("op_7384_end_0"), val = tensor<int32, [4]>([1, 6, 128, 256])];
+            tensor<bool, [4]> var_7384_end_mask_0 = const()[name = string("op_7384_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_7384_cast_fp16 = slice_by_index(begin = var_7384_begin_0, end = var_7384_end_0, end_mask = var_7384_end_mask_0, x = key_heads_73_cast_fp16)[name = string("op_7384_cast_fp16")];
+            tensor<int32, [4]> var_7388_begin_0 = const()[name = string("op_7388_begin_0"), val = tensor<int32, [4]>([0, 5, 0, 0])];
+            tensor<int32, [4]> var_7388_end_0 = const()[name = string("op_7388_end_0"), val = tensor<int32, [4]>([1, 6, 128, 256])];
+            tensor<bool, [4]> var_7388_end_mask_0 = const()[name = string("op_7388_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_7388_cast_fp16 = slice_by_index(begin = var_7388_begin_0, end = var_7388_end_0, end_mask = var_7388_end_mask_0, x = value_heads_73_cast_fp16)[name = string("op_7388_cast_fp16")];
+            tensor<int32, [4]> var_7400_begin_0 = const()[name = string("op_7400_begin_0"), val = tensor<int32, [4]>([0, 6, 0, 0])];
+            tensor<int32, [4]> var_7400_end_0 = const()[name = string("op_7400_end_0"), val = tensor<int32, [4]>([1, 7, 128, 256])];
+            tensor<bool, [4]> var_7400_end_mask_0 = const()[name = string("op_7400_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_7400_cast_fp16 = slice_by_index(begin = var_7400_begin_0, end = var_7400_end_0, end_mask = var_7400_end_mask_0, x = key_heads_73_cast_fp16)[name = string("op_7400_cast_fp16")];
+            tensor<int32, [4]> var_7404_begin_0 = const()[name = string("op_7404_begin_0"), val = tensor<int32, [4]>([0, 6, 0, 0])];
+            tensor<int32, [4]> var_7404_end_0 = const()[name = string("op_7404_end_0"), val = tensor<int32, [4]>([1, 7, 128, 256])];
+            tensor<bool, [4]> var_7404_end_mask_0 = const()[name = string("op_7404_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_7404_cast_fp16 = slice_by_index(begin = var_7404_begin_0, end = var_7404_end_0, end_mask = var_7404_end_mask_0, x = value_heads_73_cast_fp16)[name = string("op_7404_cast_fp16")];
+            tensor<int32, [4]> var_7416_begin_0 = const()[name = string("op_7416_begin_0"), val = tensor<int32, [4]>([0, 7, 0, 0])];
+            tensor<int32, [4]> var_7416_end_0 = const()[name = string("op_7416_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_7416_end_mask_0 = const()[name = string("op_7416_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_7416_cast_fp16 = slice_by_index(begin = var_7416_begin_0, end = var_7416_end_0, end_mask = var_7416_end_mask_0, x = key_heads_73_cast_fp16)[name = string("op_7416_cast_fp16")];
+            tensor<int32, [4]> var_7420_begin_0 = const()[name = string("op_7420_begin_0"), val = tensor<int32, [4]>([0, 7, 0, 0])];
+            tensor<int32, [4]> var_7420_end_0 = const()[name = string("op_7420_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_7420_end_mask_0 = const()[name = string("op_7420_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_7420_cast_fp16 = slice_by_index(begin = var_7420_begin_0, end = var_7420_end_0, end_mask = var_7420_end_mask_0, x = value_heads_73_cast_fp16)[name = string("op_7420_cast_fp16")];
+            bool key_heads_75_interleave_0 = const()[name = string("key_heads_75_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 256]> key_heads_75_cast_fp16 = concat(axis = var_7146, interleave = key_heads_75_interleave_0, values = (var_7304_cast_fp16, var_7304_cast_fp16, var_7320_cast_fp16, var_7320_cast_fp16, var_7336_cast_fp16, var_7336_cast_fp16, var_7352_cast_fp16, var_7352_cast_fp16, var_7368_cast_fp16, var_7368_cast_fp16, var_7384_cast_fp16, var_7384_cast_fp16, var_7400_cast_fp16, var_7400_cast_fp16, var_7416_cast_fp16, var_7416_cast_fp16))[name = string("key_heads_75_cast_fp16")];
+            bool value_heads_75_interleave_0 = const()[name = string("value_heads_75_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 256]> value_heads_75_cast_fp16 = concat(axis = var_7146, interleave = value_heads_75_interleave_0, values = (var_7308_cast_fp16, var_7308_cast_fp16, var_7324_cast_fp16, var_7324_cast_fp16, var_7340_cast_fp16, var_7340_cast_fp16, var_7356_cast_fp16, var_7356_cast_fp16, var_7372_cast_fp16, var_7372_cast_fp16, var_7388_cast_fp16, var_7388_cast_fp16, var_7404_cast_fp16, var_7404_cast_fp16, var_7420_cast_fp16, var_7420_cast_fp16))[name = string("value_heads_75_cast_fp16")];
+            fp16 var_7443_to_fp16 = const()[name = string("op_7443_to_fp16"), val = fp16(0x1.6ap-4)];
+            tensor<fp16, [1, 16, 128, 1]> var_7444_cast_fp16 = mul(x = mh_q_111_cast_fp16, y = var_7443_to_fp16)[name = string("op_7444_cast_fp16")];
+            bool mh_w_73_transpose_x_0 = const()[name = string("mh_w_73_transpose_x_0"), val = bool(true)];
+            bool mh_w_73_transpose_y_0 = const()[name = string("mh_w_73_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 1, 256]> mh_w_73_cast_fp16 = matmul(transpose_x = mh_w_73_transpose_x_0, transpose_y = mh_w_73_transpose_y_0, x = var_7444_cast_fp16, y = key_heads_75_cast_fp16)[name = string("mh_w_73_cast_fp16")];
+            tensor<fp16, [1, 16, 1, 256]> mh_w_75_cast_fp16 = add(x = mh_w_73_cast_fp16, y = var_487_cast_fp16)[name = string("mh_w_75_cast_fp16")];
+            tensor<fp16, [1, 16, 1, 256]> var_7456_cast_fp16 = softmax(axis = var_7128, x = mh_w_75_cast_fp16)[name = string("op_7456_cast_fp16")];
+            bool attn_37_transpose_x_0 = const()[name = string("attn_37_transpose_x_0"), val = bool(false)];
+            bool attn_37_transpose_y_0 = const()[name = string("attn_37_transpose_y_0"), val = bool(true)];
+            tensor<fp16, [1, 16, 128, 1]> attn_37_cast_fp16 = matmul(transpose_x = attn_37_transpose_x_0, transpose_y = attn_37_transpose_y_0, x = value_heads_75_cast_fp16, y = var_7456_cast_fp16)[name = string("attn_37_cast_fp16")];
+            tensor<int32, [4]> var_7461 = const()[name = string("op_7461"), val = tensor<int32, [4]>([1, -1, 1, 1])];
+            tensor<fp16, [1, 2048, 1, 1]> input_145_cast_fp16 = reshape(shape = var_7461, x = attn_37_cast_fp16)[name = string("input_145_cast_fp16")];
+            string obj_155_pad_type_0 = const()[name = string("obj_155_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> obj_155_strides_0 = const()[name = string("obj_155_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> obj_155_pad_0 = const()[name = string("obj_155_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> obj_155_dilations_0 = const()[name = string("obj_155_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 obj_155_groups_0 = const()[name = string("obj_155_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 2048, 1, 1]> layers_18_self_attn_o_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(287556224))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(289653440))))[name = string("layers_18_self_attn_o_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> obj_155_cast_fp16 = conv(bias = layers_0_self_attn_v_proj_bias_to_fp16, dilations = obj_155_dilations_0, groups = obj_155_groups_0, pad = obj_155_pad_0, pad_type = obj_155_pad_type_0, strides = obj_155_strides_0, weight = layers_18_self_attn_o_proj_weight_to_fp16_palettized, x = input_145_cast_fp16)[name = string("obj_155_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> inputs_149_cast_fp16 = add(x = inputs_143_cast_fp16, y = obj_155_cast_fp16)[name = string("inputs_149_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> inputs_sq_151_cast_fp16 = mul(x = inputs_149_cast_fp16, y = inputs_149_cast_fp16)[name = string("inputs_sq_151_cast_fp16")];
+            tensor<int32, [1]> variance_151_axes_0 = const()[name = string("variance_151_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_151_keep_dims_0 = const()[name = string("variance_151_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 1, 1, 1]> variance_151_cast_fp16 = reduce_mean(axes = variance_151_axes_0, keep_dims = variance_151_keep_dims_0, x = inputs_sq_151_cast_fp16)[name = string("variance_151_cast_fp16")];
+            fp16 var_7479_to_fp16 = const()[name = string("op_7479_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 1, 1]> var_7480_cast_fp16 = add(x = variance_151_cast_fp16, y = var_7479_to_fp16)[name = string("op_7480_cast_fp16")];
+            fp32 var_7481_epsilon_0 = const()[name = string("op_7481_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 1, 1, 1]> var_7481_cast_fp16 = rsqrt(epsilon = var_7481_epsilon_0, x = var_7480_cast_fp16)[name = string("op_7481_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> hidden_states_187_cast_fp16 = mul(x = inputs_149_cast_fp16, y = var_7481_cast_fp16)[name = string("hidden_states_187_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> w_151_to_fp16 = const()[name = string("w_151_to_fp16"), val = tensor<fp16, [1, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(289654016)))];
+            tensor<fp16, [1, 1024, 1, 1]> input_147_cast_fp16 = mul(x = w_151_to_fp16, y = hidden_states_187_cast_fp16)[name = string("input_147_cast_fp16")];
+            string input_149_pad_type_0 = const()[name = string("input_149_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> input_149_strides_0 = const()[name = string("input_149_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> input_149_pad_0 = const()[name = string("input_149_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> input_149_dilations_0 = const()[name = string("input_149_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 input_149_groups_0 = const()[name = string("input_149_groups_0"), val = int32(1)];
+            tensor<fp16, [3072, 1024, 1, 1]> layers_18_mlp_gate_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [3072, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(289656128))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(292801920))))[name = string("layers_18_mlp_gate_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 3072, 1, 1]> input_149_cast_fp16 = conv(dilations = input_149_dilations_0, groups = input_149_groups_0, pad = input_149_pad_0, pad_type = input_149_pad_type_0, strides = input_149_strides_0, weight = layers_18_mlp_gate_proj_weight_to_fp16_palettized, x = input_147_cast_fp16)[name = string("input_149_cast_fp16")];
+            tensor<fp16, [1, 3072, 1, 1]> var_7495_cast_fp16 = silu(x = input_149_cast_fp16)[name = string("op_7495_cast_fp16")];
+            string var_7501_pad_type_0 = const()[name = string("op_7501_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_7501_strides_0 = const()[name = string("op_7501_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_7501_pad_0 = const()[name = string("op_7501_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_7501_dilations_0 = const()[name = string("op_7501_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_7501_groups_0 = const()[name = string("op_7501_groups_0"), val = int32(1)];
+            tensor<fp16, [3072, 1024, 1, 1]> layers_18_mlp_up_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [3072, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(292802496))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(295948288))))[name = string("layers_18_mlp_up_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 3072, 1, 1]> var_7501_cast_fp16 = conv(dilations = var_7501_dilations_0, groups = var_7501_groups_0, pad = var_7501_pad_0, pad_type = var_7501_pad_type_0, strides = var_7501_strides_0, weight = layers_18_mlp_up_proj_weight_to_fp16_palettized, x = input_147_cast_fp16)[name = string("op_7501_cast_fp16")];
+            tensor<fp16, [1, 3072, 1, 1]> input_151_cast_fp16 = mul(x = var_7495_cast_fp16, y = var_7501_cast_fp16)[name = string("input_151_cast_fp16")];
+            string hidden_states_189_pad_type_0 = const()[name = string("hidden_states_189_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> hidden_states_189_strides_0 = const()[name = string("hidden_states_189_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> hidden_states_189_pad_0 = const()[name = string("hidden_states_189_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> hidden_states_189_dilations_0 = const()[name = string("hidden_states_189_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 hidden_states_189_groups_0 = const()[name = string("hidden_states_189_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 3072, 1, 1]> layers_18_mlp_down_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 3072, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(295948864))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(299094656))))[name = string("layers_18_mlp_down_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> hidden_states_189_cast_fp16 = conv(dilations = hidden_states_189_dilations_0, groups = hidden_states_189_groups_0, pad = hidden_states_189_pad_0, pad_type = hidden_states_189_pad_type_0, strides = hidden_states_189_strides_0, weight = layers_18_mlp_down_proj_weight_to_fp16_palettized, x = input_151_cast_fp16)[name = string("hidden_states_189_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> inputs_151_cast_fp16 = add(x = inputs_149_cast_fp16, y = hidden_states_189_cast_fp16)[name = string("inputs_151_cast_fp16")];
+            int32 var_7515 = const()[name = string("op_7515"), val = int32(3)];
+            int32 var_7525 = const()[name = string("op_7525"), val = int32(-2)];
+            int32 var_7533 = const()[name = string("op_7533"), val = int32(1)];
+            tensor<fp16, [1, 1024, 1, 1]> inputs_sq_153_cast_fp16 = mul(x = inputs_151_cast_fp16, y = inputs_151_cast_fp16)[name = string("inputs_sq_153_cast_fp16")];
+            tensor<int32, [1]> variance_153_axes_0 = const()[name = string("variance_153_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_153_keep_dims_0 = const()[name = string("variance_153_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 1, 1, 1]> variance_153_cast_fp16 = reduce_mean(axes = variance_153_axes_0, keep_dims = variance_153_keep_dims_0, x = inputs_sq_153_cast_fp16)[name = string("variance_153_cast_fp16")];
+            fp16 var_7545_to_fp16 = const()[name = string("op_7545_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 1, 1]> var_7546_cast_fp16 = add(x = variance_153_cast_fp16, y = var_7545_to_fp16)[name = string("op_7546_cast_fp16")];
+            fp32 var_7547_epsilon_0 = const()[name = string("op_7547_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 1, 1, 1]> var_7547_cast_fp16 = rsqrt(epsilon = var_7547_epsilon_0, x = var_7546_cast_fp16)[name = string("op_7547_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> hidden_states_191_cast_fp16 = mul(x = inputs_151_cast_fp16, y = var_7547_cast_fp16)[name = string("hidden_states_191_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> w_153_to_fp16 = const()[name = string("w_153_to_fp16"), val = tensor<fp16, [1, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(299095232)))];
+            tensor<fp16, [1, 1024, 1, 1]> obj_157_cast_fp16 = mul(x = w_153_to_fp16, y = hidden_states_191_cast_fp16)[name = string("obj_157_cast_fp16")];
+            string query_115_pad_type_0 = const()[name = string("query_115_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> query_115_strides_0 = const()[name = string("query_115_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> query_115_pad_0 = const()[name = string("query_115_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> query_115_dilations_0 = const()[name = string("query_115_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 query_115_groups_0 = const()[name = string("query_115_groups_0"), val = int32(1)];
+            tensor<fp16, [2048, 1024, 1, 1]> layers_19_self_attn_q_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [2048, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(299097344))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(301194560))))[name = string("layers_19_self_attn_q_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1, 1]> query_115_cast_fp16 = conv(bias = layers_0_self_attn_q_proj_bias_to_fp16, dilations = query_115_dilations_0, groups = query_115_groups_0, pad = query_115_pad_0, pad_type = query_115_pad_type_0, strides = query_115_strides_0, weight = layers_19_self_attn_q_proj_weight_to_fp16_palettized, x = obj_157_cast_fp16)[name = string("query_115_cast_fp16")];
+            string current_key_77_pad_type_0 = const()[name = string("current_key_77_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> current_key_77_strides_0 = const()[name = string("current_key_77_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> current_key_77_pad_0 = const()[name = string("current_key_77_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> current_key_77_dilations_0 = const()[name = string("current_key_77_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 current_key_77_groups_0 = const()[name = string("current_key_77_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_19_self_attn_k_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(301195136))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(302243776))))[name = string("layers_19_self_attn_k_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> current_key_77_cast_fp16 = conv(dilations = current_key_77_dilations_0, groups = current_key_77_groups_0, pad = current_key_77_pad_0, pad_type = current_key_77_pad_type_0, strides = current_key_77_strides_0, weight = layers_19_self_attn_k_proj_weight_to_fp16_palettized, x = obj_157_cast_fp16)[name = string("current_key_77_cast_fp16")];
+            string current_value_39_pad_type_0 = const()[name = string("current_value_39_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> current_value_39_strides_0 = const()[name = string("current_value_39_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> current_value_39_pad_0 = const()[name = string("current_value_39_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> current_value_39_dilations_0 = const()[name = string("current_value_39_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 current_value_39_groups_0 = const()[name = string("current_value_39_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_19_self_attn_v_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(302244352))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303292992))))[name = string("layers_19_self_attn_v_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> current_value_39_cast_fp16 = conv(bias = layers_0_self_attn_v_proj_bias_to_fp16, dilations = current_value_39_dilations_0, groups = current_value_39_groups_0, pad = current_value_39_pad_0, pad_type = current_value_39_pad_type_0, strides = current_value_39_strides_0, weight = layers_19_self_attn_v_proj_weight_to_fp16_palettized, x = obj_157_cast_fp16)[name = string("current_value_39_cast_fp16")];
+            tensor<int32, [4]> var_7584 = const()[name = string("op_7584"), val = tensor<int32, [4]>([16, 128, 1, 1])];
+            tensor<fp16, [16, 128, 1, 1]> inputs_153_cast_fp16 = reshape(shape = var_7584, x = query_115_cast_fp16)[name = string("inputs_153_cast_fp16")];
+            tensor<fp16, [16, 128, 1, 1]> inputs_sq_155_cast_fp16 = mul(x = inputs_153_cast_fp16, y = inputs_153_cast_fp16)[name = string("inputs_sq_155_cast_fp16")];
+            tensor<int32, [1]> variance_155_axes_0 = const()[name = string("variance_155_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_155_keep_dims_0 = const()[name = string("variance_155_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [16, 1, 1, 1]> variance_155_cast_fp16 = reduce_mean(axes = variance_155_axes_0, keep_dims = variance_155_keep_dims_0, x = inputs_sq_155_cast_fp16)[name = string("variance_155_cast_fp16")];
+            fp16 var_7590_to_fp16 = const()[name = string("op_7590_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [16, 1, 1, 1]> var_7591_cast_fp16 = add(x = variance_155_cast_fp16, y = var_7590_to_fp16)[name = string("op_7591_cast_fp16")];
+            fp32 var_7592_epsilon_0 = const()[name = string("op_7592_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [16, 1, 1, 1]> var_7592_cast_fp16 = rsqrt(epsilon = var_7592_epsilon_0, x = var_7591_cast_fp16)[name = string("op_7592_cast_fp16")];
+            tensor<fp16, [16, 128, 1, 1]> hidden_states_193_cast_fp16 = mul(x = inputs_153_cast_fp16, y = var_7592_cast_fp16)[name = string("hidden_states_193_cast_fp16")];
+            tensor<fp16, [1, 128, 1, 1]> w_155_to_fp16 = const()[name = string("w_155_to_fp16"), val = tensor<fp16, [1, 128, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303293568)))];
+            tensor<fp16, [16, 128, 1, 1]> query_normed_39_cast_fp16 = mul(x = w_155_to_fp16, y = hidden_states_193_cast_fp16)[name = string("query_normed_39_cast_fp16")];
+            tensor<int32, [4]> var_7600 = const()[name = string("op_7600"), val = tensor<int32, [4]>([8, 128, 1, 1])];
+            tensor<fp16, [8, 128, 1, 1]> inputs_155_cast_fp16 = reshape(shape = var_7600, x = current_key_77_cast_fp16)[name = string("inputs_155_cast_fp16")];
+            tensor<fp16, [8, 128, 1, 1]> inputs_sq_157_cast_fp16 = mul(x = inputs_155_cast_fp16, y = inputs_155_cast_fp16)[name = string("inputs_sq_157_cast_fp16")];
+            tensor<int32, [1]> variance_157_axes_0 = const()[name = string("variance_157_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_157_keep_dims_0 = const()[name = string("variance_157_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [8, 1, 1, 1]> variance_157_cast_fp16 = reduce_mean(axes = variance_157_axes_0, keep_dims = variance_157_keep_dims_0, x = inputs_sq_157_cast_fp16)[name = string("variance_157_cast_fp16")];
+            fp16 var_7606_to_fp16 = const()[name = string("op_7606_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [8, 1, 1, 1]> var_7607_cast_fp16 = add(x = variance_157_cast_fp16, y = var_7606_to_fp16)[name = string("op_7607_cast_fp16")];
+            fp32 var_7608_epsilon_0 = const()[name = string("op_7608_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [8, 1, 1, 1]> var_7608_cast_fp16 = rsqrt(epsilon = var_7608_epsilon_0, x = var_7607_cast_fp16)[name = string("op_7608_cast_fp16")];
+            tensor<fp16, [8, 128, 1, 1]> hidden_states_195_cast_fp16 = mul(x = inputs_155_cast_fp16, y = var_7608_cast_fp16)[name = string("hidden_states_195_cast_fp16")];
+            tensor<fp16, [1, 128, 1, 1]> w_157_to_fp16 = const()[name = string("w_157_to_fp16"), val = tensor<fp16, [1, 128, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303293888)))];
+            tensor<fp16, [8, 128, 1, 1]> current_key_normed_39_cast_fp16 = mul(x = w_157_to_fp16, y = hidden_states_195_cast_fp16)[name = string("current_key_normed_39_cast_fp16")];
+            tensor<int32, [4]> var_7626 = const()[name = string("op_7626"), val = tensor<int32, [4]>([1, 16, 128, -1])];
+            tensor<fp16, [1, 16, 128, 1]> mh_q_115_cast_fp16 = reshape(shape = var_7626, x = query_normed_39_cast_fp16)[name = string("mh_q_115_cast_fp16")];
+            tensor<int32, [4]> var_7628 = const()[name = string("op_7628"), val = tensor<int32, [4]>([1, 8, 128, -1])];
+            tensor<fp16, [1, 8, 128, 1]> mh_k_77_cast_fp16 = reshape(shape = var_7628, x = current_key_normed_39_cast_fp16)[name = string("mh_k_77_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> var_7632_cast_fp16 = mul(x = mh_q_115_cast_fp16, y = cos_1_cast_fp16)[name = string("op_7632_cast_fp16")];
+            tensor<int32, [4]> var_7637_begin_0 = const()[name = string("op_7637_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_7637_end_0 = const()[name = string("op_7637_end_0"), val = tensor<int32, [4]>([1, 16, 64, 1])];
+            tensor<bool, [4]> var_7637_end_mask_0 = const()[name = string("op_7637_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 16, 64, 1]> var_7637_cast_fp16 = slice_by_index(begin = var_7637_begin_0, end = var_7637_end_0, end_mask = var_7637_end_mask_0, x = mh_q_115_cast_fp16)[name = string("op_7637_cast_fp16")];
+            tensor<int32, [4]> var_7643_begin_0 = const()[name = string("op_7643_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_7643_end_0 = const()[name = string("op_7643_end_0"), val = tensor<int32, [4]>([1, 16, 128, 1])];
+            tensor<bool, [4]> var_7643_end_mask_0 = const()[name = string("op_7643_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 16, 64, 1]> var_7643_cast_fp16 = slice_by_index(begin = var_7643_begin_0, end = var_7643_end_0, end_mask = var_7643_end_mask_0, x = mh_q_115_cast_fp16)[name = string("op_7643_cast_fp16")];
+            fp16 const_454_promoted_to_fp16 = const()[name = string("const_454_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 16, 64, 1]> var_7645_cast_fp16 = mul(x = var_7643_cast_fp16, y = const_454_promoted_to_fp16)[name = string("op_7645_cast_fp16")];
+            bool var_7647_interleave_0 = const()[name = string("op_7647_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 1]> var_7647_cast_fp16 = concat(axis = var_7525, interleave = var_7647_interleave_0, values = (var_7645_cast_fp16, var_7637_cast_fp16))[name = string("op_7647_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> var_7648_cast_fp16 = mul(x = var_7647_cast_fp16, y = sin_1_cast_fp16)[name = string("op_7648_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> mh_q_117_cast_fp16 = add(x = var_7632_cast_fp16, y = var_7648_cast_fp16)[name = string("mh_q_117_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> var_7650_cast_fp16 = mul(x = mh_k_77_cast_fp16, y = cos_1_cast_fp16)[name = string("op_7650_cast_fp16")];
+            tensor<int32, [4]> var_7655_begin_0 = const()[name = string("op_7655_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_7655_end_0 = const()[name = string("op_7655_end_0"), val = tensor<int32, [4]>([1, 8, 64, 1])];
+            tensor<bool, [4]> var_7655_end_mask_0 = const()[name = string("op_7655_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 8, 64, 1]> var_7655_cast_fp16 = slice_by_index(begin = var_7655_begin_0, end = var_7655_end_0, end_mask = var_7655_end_mask_0, x = mh_k_77_cast_fp16)[name = string("op_7655_cast_fp16")];
+            tensor<int32, [4]> var_7661_begin_0 = const()[name = string("op_7661_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_7661_end_0 = const()[name = string("op_7661_end_0"), val = tensor<int32, [4]>([1, 8, 128, 1])];
+            tensor<bool, [4]> var_7661_end_mask_0 = const()[name = string("op_7661_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 8, 64, 1]> var_7661_cast_fp16 = slice_by_index(begin = var_7661_begin_0, end = var_7661_end_0, end_mask = var_7661_end_mask_0, x = mh_k_77_cast_fp16)[name = string("op_7661_cast_fp16")];
+            fp16 const_457_promoted_to_fp16 = const()[name = string("const_457_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 64, 1]> var_7663_cast_fp16 = mul(x = var_7661_cast_fp16, y = const_457_promoted_to_fp16)[name = string("op_7663_cast_fp16")];
+            bool var_7665_interleave_0 = const()[name = string("op_7665_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 128, 1]> var_7665_cast_fp16 = concat(axis = var_7525, interleave = var_7665_interleave_0, values = (var_7663_cast_fp16, var_7655_cast_fp16))[name = string("op_7665_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> var_7666_cast_fp16 = mul(x = var_7665_cast_fp16, y = sin_1_cast_fp16)[name = string("op_7666_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> mh_k_79_cast_fp16 = add(x = var_7650_cast_fp16, y = var_7666_cast_fp16)[name = string("mh_k_79_cast_fp16")];
+            tensor<int32, [4]> var_7670 = const()[name = string("op_7670"), val = tensor<int32, [4]>([1, 1024, 1, 1])];
+            tensor<fp16, [1, 1024, 1, 1]> current_key_79_cast_fp16 = reshape(shape = var_7670, x = mh_k_79_cast_fp16)[name = string("current_key_79_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_7677_cast_fp16 = mul(x = var_101_cast_fp16_19, y = var_323_cast_fp16)[name = string("op_7677_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_7678_cast_fp16 = mul(x = current_key_79_cast_fp16, y = var_321_cast_fp16)[name = string("op_7678_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> key_117_cast_fp16 = add(x = var_7677_cast_fp16, y = var_7678_cast_fp16)[name = string("key_117_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_7681_cast_fp16 = mul(x = var_132_cast_fp16_19, y = var_323_cast_fp16)[name = string("op_7681_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_7682_cast_fp16 = mul(x = current_value_39_cast_fp16, y = var_321_cast_fp16)[name = string("op_7682_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> value_77_cast_fp16 = add(x = var_7681_cast_fp16, y = var_7682_cast_fp16)[name = string("value_77_cast_fp16")];
+            tensor<int32, [4]> var_7686 = const()[name = string("op_7686"), val = tensor<int32, [4]>([1, 8, 128, 256])];
+            tensor<fp16, [1, 8, 128, 256]> key_heads_77_cast_fp16 = reshape(shape = var_7686, x = key_117_cast_fp16)[name = string("key_heads_77_cast_fp16")];
+            tensor<int32, [4]> var_7688 = const()[name = string("op_7688"), val = tensor<int32, [4]>([1, 8, 128, 256])];
+            tensor<fp16, [1, 8, 128, 256]> value_heads_77_cast_fp16 = reshape(shape = var_7688, x = value_77_cast_fp16)[name = string("value_heads_77_cast_fp16")];
+            tensor<int32, [4]> var_7691_begin_0 = const()[name = string("op_7691_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_7691_end_0 = const()[name = string("op_7691_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_7691_end_mask_0 = const()[name = string("op_7691_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_7691_cast_fp16 = slice_by_index(begin = var_7691_begin_0, end = var_7691_end_0, end_mask = var_7691_end_mask_0, x = key_heads_77_cast_fp16)[name = string("op_7691_cast_fp16")];
+            tensor<int32, [4]> var_7695_begin_0 = const()[name = string("op_7695_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_7695_end_0 = const()[name = string("op_7695_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_7695_end_mask_0 = const()[name = string("op_7695_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_7695_cast_fp16 = slice_by_index(begin = var_7695_begin_0, end = var_7695_end_0, end_mask = var_7695_end_mask_0, x = value_heads_77_cast_fp16)[name = string("op_7695_cast_fp16")];
+            tensor<int32, [4]> var_7707_begin_0 = const()[name = string("op_7707_begin_0"), val = tensor<int32, [4]>([0, 1, 0, 0])];
+            tensor<int32, [4]> var_7707_end_0 = const()[name = string("op_7707_end_0"), val = tensor<int32, [4]>([1, 2, 128, 256])];
+            tensor<bool, [4]> var_7707_end_mask_0 = const()[name = string("op_7707_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_7707_cast_fp16 = slice_by_index(begin = var_7707_begin_0, end = var_7707_end_0, end_mask = var_7707_end_mask_0, x = key_heads_77_cast_fp16)[name = string("op_7707_cast_fp16")];
+            tensor<int32, [4]> var_7711_begin_0 = const()[name = string("op_7711_begin_0"), val = tensor<int32, [4]>([0, 1, 0, 0])];
+            tensor<int32, [4]> var_7711_end_0 = const()[name = string("op_7711_end_0"), val = tensor<int32, [4]>([1, 2, 128, 256])];
+            tensor<bool, [4]> var_7711_end_mask_0 = const()[name = string("op_7711_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_7711_cast_fp16 = slice_by_index(begin = var_7711_begin_0, end = var_7711_end_0, end_mask = var_7711_end_mask_0, x = value_heads_77_cast_fp16)[name = string("op_7711_cast_fp16")];
+            tensor<int32, [4]> var_7723_begin_0 = const()[name = string("op_7723_begin_0"), val = tensor<int32, [4]>([0, 2, 0, 0])];
+            tensor<int32, [4]> var_7723_end_0 = const()[name = string("op_7723_end_0"), val = tensor<int32, [4]>([1, 3, 128, 256])];
+            tensor<bool, [4]> var_7723_end_mask_0 = const()[name = string("op_7723_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_7723_cast_fp16 = slice_by_index(begin = var_7723_begin_0, end = var_7723_end_0, end_mask = var_7723_end_mask_0, x = key_heads_77_cast_fp16)[name = string("op_7723_cast_fp16")];
+            tensor<int32, [4]> var_7727_begin_0 = const()[name = string("op_7727_begin_0"), val = tensor<int32, [4]>([0, 2, 0, 0])];
+            tensor<int32, [4]> var_7727_end_0 = const()[name = string("op_7727_end_0"), val = tensor<int32, [4]>([1, 3, 128, 256])];
+            tensor<bool, [4]> var_7727_end_mask_0 = const()[name = string("op_7727_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_7727_cast_fp16 = slice_by_index(begin = var_7727_begin_0, end = var_7727_end_0, end_mask = var_7727_end_mask_0, x = value_heads_77_cast_fp16)[name = string("op_7727_cast_fp16")];
+            tensor<int32, [4]> var_7739_begin_0 = const()[name = string("op_7739_begin_0"), val = tensor<int32, [4]>([0, 3, 0, 0])];
+            tensor<int32, [4]> var_7739_end_0 = const()[name = string("op_7739_end_0"), val = tensor<int32, [4]>([1, 4, 128, 256])];
+            tensor<bool, [4]> var_7739_end_mask_0 = const()[name = string("op_7739_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_7739_cast_fp16 = slice_by_index(begin = var_7739_begin_0, end = var_7739_end_0, end_mask = var_7739_end_mask_0, x = key_heads_77_cast_fp16)[name = string("op_7739_cast_fp16")];
+            tensor<int32, [4]> var_7743_begin_0 = const()[name = string("op_7743_begin_0"), val = tensor<int32, [4]>([0, 3, 0, 0])];
+            tensor<int32, [4]> var_7743_end_0 = const()[name = string("op_7743_end_0"), val = tensor<int32, [4]>([1, 4, 128, 256])];
+            tensor<bool, [4]> var_7743_end_mask_0 = const()[name = string("op_7743_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_7743_cast_fp16 = slice_by_index(begin = var_7743_begin_0, end = var_7743_end_0, end_mask = var_7743_end_mask_0, x = value_heads_77_cast_fp16)[name = string("op_7743_cast_fp16")];
+            tensor<int32, [4]> var_7755_begin_0 = const()[name = string("op_7755_begin_0"), val = tensor<int32, [4]>([0, 4, 0, 0])];
+            tensor<int32, [4]> var_7755_end_0 = const()[name = string("op_7755_end_0"), val = tensor<int32, [4]>([1, 5, 128, 256])];
+            tensor<bool, [4]> var_7755_end_mask_0 = const()[name = string("op_7755_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_7755_cast_fp16 = slice_by_index(begin = var_7755_begin_0, end = var_7755_end_0, end_mask = var_7755_end_mask_0, x = key_heads_77_cast_fp16)[name = string("op_7755_cast_fp16")];
+            tensor<int32, [4]> var_7759_begin_0 = const()[name = string("op_7759_begin_0"), val = tensor<int32, [4]>([0, 4, 0, 0])];
+            tensor<int32, [4]> var_7759_end_0 = const()[name = string("op_7759_end_0"), val = tensor<int32, [4]>([1, 5, 128, 256])];
+            tensor<bool, [4]> var_7759_end_mask_0 = const()[name = string("op_7759_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_7759_cast_fp16 = slice_by_index(begin = var_7759_begin_0, end = var_7759_end_0, end_mask = var_7759_end_mask_0, x = value_heads_77_cast_fp16)[name = string("op_7759_cast_fp16")];
+            tensor<int32, [4]> var_7771_begin_0 = const()[name = string("op_7771_begin_0"), val = tensor<int32, [4]>([0, 5, 0, 0])];
+            tensor<int32, [4]> var_7771_end_0 = const()[name = string("op_7771_end_0"), val = tensor<int32, [4]>([1, 6, 128, 256])];
+            tensor<bool, [4]> var_7771_end_mask_0 = const()[name = string("op_7771_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_7771_cast_fp16 = slice_by_index(begin = var_7771_begin_0, end = var_7771_end_0, end_mask = var_7771_end_mask_0, x = key_heads_77_cast_fp16)[name = string("op_7771_cast_fp16")];
+            tensor<int32, [4]> var_7775_begin_0 = const()[name = string("op_7775_begin_0"), val = tensor<int32, [4]>([0, 5, 0, 0])];
+            tensor<int32, [4]> var_7775_end_0 = const()[name = string("op_7775_end_0"), val = tensor<int32, [4]>([1, 6, 128, 256])];
+            tensor<bool, [4]> var_7775_end_mask_0 = const()[name = string("op_7775_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_7775_cast_fp16 = slice_by_index(begin = var_7775_begin_0, end = var_7775_end_0, end_mask = var_7775_end_mask_0, x = value_heads_77_cast_fp16)[name = string("op_7775_cast_fp16")];
+            tensor<int32, [4]> var_7787_begin_0 = const()[name = string("op_7787_begin_0"), val = tensor<int32, [4]>([0, 6, 0, 0])];
+            tensor<int32, [4]> var_7787_end_0 = const()[name = string("op_7787_end_0"), val = tensor<int32, [4]>([1, 7, 128, 256])];
+            tensor<bool, [4]> var_7787_end_mask_0 = const()[name = string("op_7787_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_7787_cast_fp16 = slice_by_index(begin = var_7787_begin_0, end = var_7787_end_0, end_mask = var_7787_end_mask_0, x = key_heads_77_cast_fp16)[name = string("op_7787_cast_fp16")];
+            tensor<int32, [4]> var_7791_begin_0 = const()[name = string("op_7791_begin_0"), val = tensor<int32, [4]>([0, 6, 0, 0])];
+            tensor<int32, [4]> var_7791_end_0 = const()[name = string("op_7791_end_0"), val = tensor<int32, [4]>([1, 7, 128, 256])];
+            tensor<bool, [4]> var_7791_end_mask_0 = const()[name = string("op_7791_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_7791_cast_fp16 = slice_by_index(begin = var_7791_begin_0, end = var_7791_end_0, end_mask = var_7791_end_mask_0, x = value_heads_77_cast_fp16)[name = string("op_7791_cast_fp16")];
+            tensor<int32, [4]> var_7803_begin_0 = const()[name = string("op_7803_begin_0"), val = tensor<int32, [4]>([0, 7, 0, 0])];
+            tensor<int32, [4]> var_7803_end_0 = const()[name = string("op_7803_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_7803_end_mask_0 = const()[name = string("op_7803_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_7803_cast_fp16 = slice_by_index(begin = var_7803_begin_0, end = var_7803_end_0, end_mask = var_7803_end_mask_0, x = key_heads_77_cast_fp16)[name = string("op_7803_cast_fp16")];
+            tensor<int32, [4]> var_7807_begin_0 = const()[name = string("op_7807_begin_0"), val = tensor<int32, [4]>([0, 7, 0, 0])];
+            tensor<int32, [4]> var_7807_end_0 = const()[name = string("op_7807_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_7807_end_mask_0 = const()[name = string("op_7807_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_7807_cast_fp16 = slice_by_index(begin = var_7807_begin_0, end = var_7807_end_0, end_mask = var_7807_end_mask_0, x = value_heads_77_cast_fp16)[name = string("op_7807_cast_fp16")];
+            bool key_heads_79_interleave_0 = const()[name = string("key_heads_79_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 256]> key_heads_79_cast_fp16 = concat(axis = var_7533, interleave = key_heads_79_interleave_0, values = (var_7691_cast_fp16, var_7691_cast_fp16, var_7707_cast_fp16, var_7707_cast_fp16, var_7723_cast_fp16, var_7723_cast_fp16, var_7739_cast_fp16, var_7739_cast_fp16, var_7755_cast_fp16, var_7755_cast_fp16, var_7771_cast_fp16, var_7771_cast_fp16, var_7787_cast_fp16, var_7787_cast_fp16, var_7803_cast_fp16, var_7803_cast_fp16))[name = string("key_heads_79_cast_fp16")];
+            bool value_heads_79_interleave_0 = const()[name = string("value_heads_79_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 256]> value_heads_79_cast_fp16 = concat(axis = var_7533, interleave = value_heads_79_interleave_0, values = (var_7695_cast_fp16, var_7695_cast_fp16, var_7711_cast_fp16, var_7711_cast_fp16, var_7727_cast_fp16, var_7727_cast_fp16, var_7743_cast_fp16, var_7743_cast_fp16, var_7759_cast_fp16, var_7759_cast_fp16, var_7775_cast_fp16, var_7775_cast_fp16, var_7791_cast_fp16, var_7791_cast_fp16, var_7807_cast_fp16, var_7807_cast_fp16))[name = string("value_heads_79_cast_fp16")];
+            fp16 var_7830_to_fp16 = const()[name = string("op_7830_to_fp16"), val = fp16(0x1.6ap-4)];
+            tensor<fp16, [1, 16, 128, 1]> var_7831_cast_fp16 = mul(x = mh_q_117_cast_fp16, y = var_7830_to_fp16)[name = string("op_7831_cast_fp16")];
+            bool mh_w_77_transpose_x_0 = const()[name = string("mh_w_77_transpose_x_0"), val = bool(true)];
+            bool mh_w_77_transpose_y_0 = const()[name = string("mh_w_77_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 1, 256]> mh_w_77_cast_fp16 = matmul(transpose_x = mh_w_77_transpose_x_0, transpose_y = mh_w_77_transpose_y_0, x = var_7831_cast_fp16, y = key_heads_79_cast_fp16)[name = string("mh_w_77_cast_fp16")];
+            tensor<fp16, [1, 16, 1, 256]> mh_w_79_cast_fp16 = add(x = mh_w_77_cast_fp16, y = var_487_cast_fp16)[name = string("mh_w_79_cast_fp16")];
+            tensor<fp16, [1, 16, 1, 256]> var_7843_cast_fp16 = softmax(axis = var_7515, x = mh_w_79_cast_fp16)[name = string("op_7843_cast_fp16")];
+            bool attn_39_transpose_x_0 = const()[name = string("attn_39_transpose_x_0"), val = bool(false)];
+            bool attn_39_transpose_y_0 = const()[name = string("attn_39_transpose_y_0"), val = bool(true)];
+            tensor<fp16, [1, 16, 128, 1]> attn_39_cast_fp16 = matmul(transpose_x = attn_39_transpose_x_0, transpose_y = attn_39_transpose_y_0, x = value_heads_79_cast_fp16, y = var_7843_cast_fp16)[name = string("attn_39_cast_fp16")];
+            tensor<int32, [4]> var_7848 = const()[name = string("op_7848"), val = tensor<int32, [4]>([1, -1, 1, 1])];
+            tensor<fp16, [1, 2048, 1, 1]> input_153_cast_fp16 = reshape(shape = var_7848, x = attn_39_cast_fp16)[name = string("input_153_cast_fp16")];
+            string obj_163_pad_type_0 = const()[name = string("obj_163_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> obj_163_strides_0 = const()[name = string("obj_163_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> obj_163_pad_0 = const()[name = string("obj_163_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> obj_163_dilations_0 = const()[name = string("obj_163_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 obj_163_groups_0 = const()[name = string("obj_163_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 2048, 1, 1]> layers_19_self_attn_o_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(303294208))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(305391424))))[name = string("layers_19_self_attn_o_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> obj_163_cast_fp16 = conv(bias = layers_0_self_attn_v_proj_bias_to_fp16, dilations = obj_163_dilations_0, groups = obj_163_groups_0, pad = obj_163_pad_0, pad_type = obj_163_pad_type_0, strides = obj_163_strides_0, weight = layers_19_self_attn_o_proj_weight_to_fp16_palettized, x = input_153_cast_fp16)[name = string("obj_163_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> inputs_157_cast_fp16 = add(x = inputs_151_cast_fp16, y = obj_163_cast_fp16)[name = string("inputs_157_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> inputs_sq_159_cast_fp16 = mul(x = inputs_157_cast_fp16, y = inputs_157_cast_fp16)[name = string("inputs_sq_159_cast_fp16")];
+            tensor<int32, [1]> variance_159_axes_0 = const()[name = string("variance_159_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_159_keep_dims_0 = const()[name = string("variance_159_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 1, 1, 1]> variance_159_cast_fp16 = reduce_mean(axes = variance_159_axes_0, keep_dims = variance_159_keep_dims_0, x = inputs_sq_159_cast_fp16)[name = string("variance_159_cast_fp16")];
+            fp16 var_7866_to_fp16 = const()[name = string("op_7866_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 1, 1]> var_7867_cast_fp16 = add(x = variance_159_cast_fp16, y = var_7866_to_fp16)[name = string("op_7867_cast_fp16")];
+            fp32 var_7868_epsilon_0 = const()[name = string("op_7868_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 1, 1, 1]> var_7868_cast_fp16 = rsqrt(epsilon = var_7868_epsilon_0, x = var_7867_cast_fp16)[name = string("op_7868_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> hidden_states_197_cast_fp16 = mul(x = inputs_157_cast_fp16, y = var_7868_cast_fp16)[name = string("hidden_states_197_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> w_159_to_fp16 = const()[name = string("w_159_to_fp16"), val = tensor<fp16, [1, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(305392000)))];
+            tensor<fp16, [1, 1024, 1, 1]> input_155_cast_fp16 = mul(x = w_159_to_fp16, y = hidden_states_197_cast_fp16)[name = string("input_155_cast_fp16")];
+            string input_157_pad_type_0 = const()[name = string("input_157_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> input_157_strides_0 = const()[name = string("input_157_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> input_157_pad_0 = const()[name = string("input_157_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> input_157_dilations_0 = const()[name = string("input_157_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 input_157_groups_0 = const()[name = string("input_157_groups_0"), val = int32(1)];
+            tensor<fp16, [3072, 1024, 1, 1]> layers_19_mlp_gate_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [3072, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(305394112))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(308539904))))[name = string("layers_19_mlp_gate_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 3072, 1, 1]> input_157_cast_fp16 = conv(dilations = input_157_dilations_0, groups = input_157_groups_0, pad = input_157_pad_0, pad_type = input_157_pad_type_0, strides = input_157_strides_0, weight = layers_19_mlp_gate_proj_weight_to_fp16_palettized, x = input_155_cast_fp16)[name = string("input_157_cast_fp16")];
+            tensor<fp16, [1, 3072, 1, 1]> var_7882_cast_fp16 = silu(x = input_157_cast_fp16)[name = string("op_7882_cast_fp16")];
+            string var_7888_pad_type_0 = const()[name = string("op_7888_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_7888_strides_0 = const()[name = string("op_7888_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_7888_pad_0 = const()[name = string("op_7888_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_7888_dilations_0 = const()[name = string("op_7888_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_7888_groups_0 = const()[name = string("op_7888_groups_0"), val = int32(1)];
+            tensor<fp16, [3072, 1024, 1, 1]> layers_19_mlp_up_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [3072, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(308540480))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(311686272))))[name = string("layers_19_mlp_up_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 3072, 1, 1]> var_7888_cast_fp16 = conv(dilations = var_7888_dilations_0, groups = var_7888_groups_0, pad = var_7888_pad_0, pad_type = var_7888_pad_type_0, strides = var_7888_strides_0, weight = layers_19_mlp_up_proj_weight_to_fp16_palettized, x = input_155_cast_fp16)[name = string("op_7888_cast_fp16")];
+            tensor<fp16, [1, 3072, 1, 1]> input_159_cast_fp16 = mul(x = var_7882_cast_fp16, y = var_7888_cast_fp16)[name = string("input_159_cast_fp16")];
+            string hidden_states_199_pad_type_0 = const()[name = string("hidden_states_199_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> hidden_states_199_strides_0 = const()[name = string("hidden_states_199_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> hidden_states_199_pad_0 = const()[name = string("hidden_states_199_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> hidden_states_199_dilations_0 = const()[name = string("hidden_states_199_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 hidden_states_199_groups_0 = const()[name = string("hidden_states_199_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 3072, 1, 1]> layers_19_mlp_down_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 3072, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(311686848))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(314832640))))[name = string("layers_19_mlp_down_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> hidden_states_199_cast_fp16 = conv(dilations = hidden_states_199_dilations_0, groups = hidden_states_199_groups_0, pad = hidden_states_199_pad_0, pad_type = hidden_states_199_pad_type_0, strides = hidden_states_199_strides_0, weight = layers_19_mlp_down_proj_weight_to_fp16_palettized, x = input_159_cast_fp16)[name = string("hidden_states_199_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> inputs_159_cast_fp16 = add(x = inputs_157_cast_fp16, y = hidden_states_199_cast_fp16)[name = string("inputs_159_cast_fp16")];
+            int32 var_7902 = const()[name = string("op_7902"), val = int32(3)];
+            int32 var_7912 = const()[name = string("op_7912"), val = int32(-2)];
+            int32 var_7920 = const()[name = string("op_7920"), val = int32(1)];
+            tensor<fp16, [1, 1024, 1, 1]> inputs_sq_161_cast_fp16 = mul(x = inputs_159_cast_fp16, y = inputs_159_cast_fp16)[name = string("inputs_sq_161_cast_fp16")];
+            tensor<int32, [1]> variance_161_axes_0 = const()[name = string("variance_161_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_161_keep_dims_0 = const()[name = string("variance_161_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 1, 1, 1]> variance_161_cast_fp16 = reduce_mean(axes = variance_161_axes_0, keep_dims = variance_161_keep_dims_0, x = inputs_sq_161_cast_fp16)[name = string("variance_161_cast_fp16")];
+            fp16 var_7932_to_fp16 = const()[name = string("op_7932_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 1, 1]> var_7933_cast_fp16 = add(x = variance_161_cast_fp16, y = var_7932_to_fp16)[name = string("op_7933_cast_fp16")];
+            fp32 var_7934_epsilon_0 = const()[name = string("op_7934_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 1, 1, 1]> var_7934_cast_fp16 = rsqrt(epsilon = var_7934_epsilon_0, x = var_7933_cast_fp16)[name = string("op_7934_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> hidden_states_201_cast_fp16 = mul(x = inputs_159_cast_fp16, y = var_7934_cast_fp16)[name = string("hidden_states_201_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> w_161_to_fp16 = const()[name = string("w_161_to_fp16"), val = tensor<fp16, [1, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(314833216)))];
+            tensor<fp16, [1, 1024, 1, 1]> obj_165_cast_fp16 = mul(x = w_161_to_fp16, y = hidden_states_201_cast_fp16)[name = string("obj_165_cast_fp16")];
+            string query_121_pad_type_0 = const()[name = string("query_121_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> query_121_strides_0 = const()[name = string("query_121_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> query_121_pad_0 = const()[name = string("query_121_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> query_121_dilations_0 = const()[name = string("query_121_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 query_121_groups_0 = const()[name = string("query_121_groups_0"), val = int32(1)];
+            tensor<fp16, [2048, 1024, 1, 1]> layers_20_self_attn_q_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [2048, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(314835328))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(316932544))))[name = string("layers_20_self_attn_q_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1, 1]> query_121_cast_fp16 = conv(bias = layers_0_self_attn_q_proj_bias_to_fp16, dilations = query_121_dilations_0, groups = query_121_groups_0, pad = query_121_pad_0, pad_type = query_121_pad_type_0, strides = query_121_strides_0, weight = layers_20_self_attn_q_proj_weight_to_fp16_palettized, x = obj_165_cast_fp16)[name = string("query_121_cast_fp16")];
+            string current_key_81_pad_type_0 = const()[name = string("current_key_81_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> current_key_81_strides_0 = const()[name = string("current_key_81_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> current_key_81_pad_0 = const()[name = string("current_key_81_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> current_key_81_dilations_0 = const()[name = string("current_key_81_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 current_key_81_groups_0 = const()[name = string("current_key_81_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_20_self_attn_k_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(316933120))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(317981760))))[name = string("layers_20_self_attn_k_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> current_key_81_cast_fp16 = conv(dilations = current_key_81_dilations_0, groups = current_key_81_groups_0, pad = current_key_81_pad_0, pad_type = current_key_81_pad_type_0, strides = current_key_81_strides_0, weight = layers_20_self_attn_k_proj_weight_to_fp16_palettized, x = obj_165_cast_fp16)[name = string("current_key_81_cast_fp16")];
+            string current_value_41_pad_type_0 = const()[name = string("current_value_41_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> current_value_41_strides_0 = const()[name = string("current_value_41_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> current_value_41_pad_0 = const()[name = string("current_value_41_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> current_value_41_dilations_0 = const()[name = string("current_value_41_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 current_value_41_groups_0 = const()[name = string("current_value_41_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_20_self_attn_v_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(317982336))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(319030976))))[name = string("layers_20_self_attn_v_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> current_value_41_cast_fp16 = conv(bias = layers_0_self_attn_v_proj_bias_to_fp16, dilations = current_value_41_dilations_0, groups = current_value_41_groups_0, pad = current_value_41_pad_0, pad_type = current_value_41_pad_type_0, strides = current_value_41_strides_0, weight = layers_20_self_attn_v_proj_weight_to_fp16_palettized, x = obj_165_cast_fp16)[name = string("current_value_41_cast_fp16")];
+            tensor<int32, [4]> var_7971 = const()[name = string("op_7971"), val = tensor<int32, [4]>([16, 128, 1, 1])];
+            tensor<fp16, [16, 128, 1, 1]> inputs_161_cast_fp16 = reshape(shape = var_7971, x = query_121_cast_fp16)[name = string("inputs_161_cast_fp16")];
+            tensor<fp16, [16, 128, 1, 1]> inputs_sq_163_cast_fp16 = mul(x = inputs_161_cast_fp16, y = inputs_161_cast_fp16)[name = string("inputs_sq_163_cast_fp16")];
+            tensor<int32, [1]> variance_163_axes_0 = const()[name = string("variance_163_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_163_keep_dims_0 = const()[name = string("variance_163_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [16, 1, 1, 1]> variance_163_cast_fp16 = reduce_mean(axes = variance_163_axes_0, keep_dims = variance_163_keep_dims_0, x = inputs_sq_163_cast_fp16)[name = string("variance_163_cast_fp16")];
+            fp16 var_7977_to_fp16 = const()[name = string("op_7977_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [16, 1, 1, 1]> var_7978_cast_fp16 = add(x = variance_163_cast_fp16, y = var_7977_to_fp16)[name = string("op_7978_cast_fp16")];
+            fp32 var_7979_epsilon_0 = const()[name = string("op_7979_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [16, 1, 1, 1]> var_7979_cast_fp16 = rsqrt(epsilon = var_7979_epsilon_0, x = var_7978_cast_fp16)[name = string("op_7979_cast_fp16")];
+            tensor<fp16, [16, 128, 1, 1]> hidden_states_203_cast_fp16 = mul(x = inputs_161_cast_fp16, y = var_7979_cast_fp16)[name = string("hidden_states_203_cast_fp16")];
+            tensor<fp16, [1, 128, 1, 1]> w_163_to_fp16 = const()[name = string("w_163_to_fp16"), val = tensor<fp16, [1, 128, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(319031552)))];
+            tensor<fp16, [16, 128, 1, 1]> query_normed_41_cast_fp16 = mul(x = w_163_to_fp16, y = hidden_states_203_cast_fp16)[name = string("query_normed_41_cast_fp16")];
+            tensor<int32, [4]> var_7987 = const()[name = string("op_7987"), val = tensor<int32, [4]>([8, 128, 1, 1])];
+            tensor<fp16, [8, 128, 1, 1]> inputs_163_cast_fp16 = reshape(shape = var_7987, x = current_key_81_cast_fp16)[name = string("inputs_163_cast_fp16")];
+            tensor<fp16, [8, 128, 1, 1]> inputs_sq_165_cast_fp16 = mul(x = inputs_163_cast_fp16, y = inputs_163_cast_fp16)[name = string("inputs_sq_165_cast_fp16")];
+            tensor<int32, [1]> variance_165_axes_0 = const()[name = string("variance_165_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_165_keep_dims_0 = const()[name = string("variance_165_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [8, 1, 1, 1]> variance_165_cast_fp16 = reduce_mean(axes = variance_165_axes_0, keep_dims = variance_165_keep_dims_0, x = inputs_sq_165_cast_fp16)[name = string("variance_165_cast_fp16")];
+            fp16 var_7993_to_fp16 = const()[name = string("op_7993_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [8, 1, 1, 1]> var_7994_cast_fp16 = add(x = variance_165_cast_fp16, y = var_7993_to_fp16)[name = string("op_7994_cast_fp16")];
+            fp32 var_7995_epsilon_0 = const()[name = string("op_7995_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [8, 1, 1, 1]> var_7995_cast_fp16 = rsqrt(epsilon = var_7995_epsilon_0, x = var_7994_cast_fp16)[name = string("op_7995_cast_fp16")];
+            tensor<fp16, [8, 128, 1, 1]> hidden_states_205_cast_fp16 = mul(x = inputs_163_cast_fp16, y = var_7995_cast_fp16)[name = string("hidden_states_205_cast_fp16")];
+            tensor<fp16, [1, 128, 1, 1]> w_165_to_fp16 = const()[name = string("w_165_to_fp16"), val = tensor<fp16, [1, 128, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(319031872)))];
+            tensor<fp16, [8, 128, 1, 1]> current_key_normed_41_cast_fp16 = mul(x = w_165_to_fp16, y = hidden_states_205_cast_fp16)[name = string("current_key_normed_41_cast_fp16")];
+            tensor<int32, [4]> var_8013 = const()[name = string("op_8013"), val = tensor<int32, [4]>([1, 16, 128, -1])];
+            tensor<fp16, [1, 16, 128, 1]> mh_q_121_cast_fp16 = reshape(shape = var_8013, x = query_normed_41_cast_fp16)[name = string("mh_q_121_cast_fp16")];
+            tensor<int32, [4]> var_8015 = const()[name = string("op_8015"), val = tensor<int32, [4]>([1, 8, 128, -1])];
+            tensor<fp16, [1, 8, 128, 1]> mh_k_81_cast_fp16 = reshape(shape = var_8015, x = current_key_normed_41_cast_fp16)[name = string("mh_k_81_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> var_8019_cast_fp16 = mul(x = mh_q_121_cast_fp16, y = cos_1_cast_fp16)[name = string("op_8019_cast_fp16")];
+            tensor<int32, [4]> var_8024_begin_0 = const()[name = string("op_8024_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_8024_end_0 = const()[name = string("op_8024_end_0"), val = tensor<int32, [4]>([1, 16, 64, 1])];
+            tensor<bool, [4]> var_8024_end_mask_0 = const()[name = string("op_8024_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 16, 64, 1]> var_8024_cast_fp16 = slice_by_index(begin = var_8024_begin_0, end = var_8024_end_0, end_mask = var_8024_end_mask_0, x = mh_q_121_cast_fp16)[name = string("op_8024_cast_fp16")];
+            tensor<int32, [4]> var_8030_begin_0 = const()[name = string("op_8030_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_8030_end_0 = const()[name = string("op_8030_end_0"), val = tensor<int32, [4]>([1, 16, 128, 1])];
+            tensor<bool, [4]> var_8030_end_mask_0 = const()[name = string("op_8030_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 16, 64, 1]> var_8030_cast_fp16 = slice_by_index(begin = var_8030_begin_0, end = var_8030_end_0, end_mask = var_8030_end_mask_0, x = mh_q_121_cast_fp16)[name = string("op_8030_cast_fp16")];
+            fp16 const_477_promoted_to_fp16 = const()[name = string("const_477_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 16, 64, 1]> var_8032_cast_fp16 = mul(x = var_8030_cast_fp16, y = const_477_promoted_to_fp16)[name = string("op_8032_cast_fp16")];
+            bool var_8034_interleave_0 = const()[name = string("op_8034_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 1]> var_8034_cast_fp16 = concat(axis = var_7912, interleave = var_8034_interleave_0, values = (var_8032_cast_fp16, var_8024_cast_fp16))[name = string("op_8034_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> var_8035_cast_fp16 = mul(x = var_8034_cast_fp16, y = sin_1_cast_fp16)[name = string("op_8035_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> mh_q_123_cast_fp16 = add(x = var_8019_cast_fp16, y = var_8035_cast_fp16)[name = string("mh_q_123_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> var_8037_cast_fp16 = mul(x = mh_k_81_cast_fp16, y = cos_1_cast_fp16)[name = string("op_8037_cast_fp16")];
+            tensor<int32, [4]> var_8042_begin_0 = const()[name = string("op_8042_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_8042_end_0 = const()[name = string("op_8042_end_0"), val = tensor<int32, [4]>([1, 8, 64, 1])];
+            tensor<bool, [4]> var_8042_end_mask_0 = const()[name = string("op_8042_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 8, 64, 1]> var_8042_cast_fp16 = slice_by_index(begin = var_8042_begin_0, end = var_8042_end_0, end_mask = var_8042_end_mask_0, x = mh_k_81_cast_fp16)[name = string("op_8042_cast_fp16")];
+            tensor<int32, [4]> var_8048_begin_0 = const()[name = string("op_8048_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_8048_end_0 = const()[name = string("op_8048_end_0"), val = tensor<int32, [4]>([1, 8, 128, 1])];
+            tensor<bool, [4]> var_8048_end_mask_0 = const()[name = string("op_8048_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 8, 64, 1]> var_8048_cast_fp16 = slice_by_index(begin = var_8048_begin_0, end = var_8048_end_0, end_mask = var_8048_end_mask_0, x = mh_k_81_cast_fp16)[name = string("op_8048_cast_fp16")];
+            fp16 const_480_promoted_to_fp16 = const()[name = string("const_480_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 64, 1]> var_8050_cast_fp16 = mul(x = var_8048_cast_fp16, y = const_480_promoted_to_fp16)[name = string("op_8050_cast_fp16")];
+            bool var_8052_interleave_0 = const()[name = string("op_8052_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 128, 1]> var_8052_cast_fp16 = concat(axis = var_7912, interleave = var_8052_interleave_0, values = (var_8050_cast_fp16, var_8042_cast_fp16))[name = string("op_8052_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> var_8053_cast_fp16 = mul(x = var_8052_cast_fp16, y = sin_1_cast_fp16)[name = string("op_8053_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> mh_k_83_cast_fp16 = add(x = var_8037_cast_fp16, y = var_8053_cast_fp16)[name = string("mh_k_83_cast_fp16")];
+            tensor<int32, [4]> var_8057 = const()[name = string("op_8057"), val = tensor<int32, [4]>([1, 1024, 1, 1])];
+            tensor<fp16, [1, 1024, 1, 1]> current_key_83_cast_fp16 = reshape(shape = var_8057, x = mh_k_83_cast_fp16)[name = string("current_key_83_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_8064_cast_fp16 = mul(x = var_101_cast_fp16_20, y = var_323_cast_fp16)[name = string("op_8064_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_8065_cast_fp16 = mul(x = current_key_83_cast_fp16, y = var_321_cast_fp16)[name = string("op_8065_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> key_123_cast_fp16 = add(x = var_8064_cast_fp16, y = var_8065_cast_fp16)[name = string("key_123_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_8068_cast_fp16 = mul(x = var_132_cast_fp16_20, y = var_323_cast_fp16)[name = string("op_8068_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_8069_cast_fp16 = mul(x = current_value_41_cast_fp16, y = var_321_cast_fp16)[name = string("op_8069_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> value_81_cast_fp16 = add(x = var_8068_cast_fp16, y = var_8069_cast_fp16)[name = string("value_81_cast_fp16")];
+            tensor<int32, [4]> var_8073 = const()[name = string("op_8073"), val = tensor<int32, [4]>([1, 8, 128, 256])];
+            tensor<fp16, [1, 8, 128, 256]> key_heads_81_cast_fp16 = reshape(shape = var_8073, x = key_123_cast_fp16)[name = string("key_heads_81_cast_fp16")];
+            tensor<int32, [4]> var_8075 = const()[name = string("op_8075"), val = tensor<int32, [4]>([1, 8, 128, 256])];
+            tensor<fp16, [1, 8, 128, 256]> value_heads_81_cast_fp16 = reshape(shape = var_8075, x = value_81_cast_fp16)[name = string("value_heads_81_cast_fp16")];
+            tensor<int32, [4]> var_8078_begin_0 = const()[name = string("op_8078_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_8078_end_0 = const()[name = string("op_8078_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_8078_end_mask_0 = const()[name = string("op_8078_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_8078_cast_fp16 = slice_by_index(begin = var_8078_begin_0, end = var_8078_end_0, end_mask = var_8078_end_mask_0, x = key_heads_81_cast_fp16)[name = string("op_8078_cast_fp16")];
+            tensor<int32, [4]> var_8082_begin_0 = const()[name = string("op_8082_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_8082_end_0 = const()[name = string("op_8082_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_8082_end_mask_0 = const()[name = string("op_8082_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_8082_cast_fp16 = slice_by_index(begin = var_8082_begin_0, end = var_8082_end_0, end_mask = var_8082_end_mask_0, x = value_heads_81_cast_fp16)[name = string("op_8082_cast_fp16")];
+            tensor<int32, [4]> var_8094_begin_0 = const()[name = string("op_8094_begin_0"), val = tensor<int32, [4]>([0, 1, 0, 0])];
+            tensor<int32, [4]> var_8094_end_0 = const()[name = string("op_8094_end_0"), val = tensor<int32, [4]>([1, 2, 128, 256])];
+            tensor<bool, [4]> var_8094_end_mask_0 = const()[name = string("op_8094_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_8094_cast_fp16 = slice_by_index(begin = var_8094_begin_0, end = var_8094_end_0, end_mask = var_8094_end_mask_0, x = key_heads_81_cast_fp16)[name = string("op_8094_cast_fp16")];
+            tensor<int32, [4]> var_8098_begin_0 = const()[name = string("op_8098_begin_0"), val = tensor<int32, [4]>([0, 1, 0, 0])];
+            tensor<int32, [4]> var_8098_end_0 = const()[name = string("op_8098_end_0"), val = tensor<int32, [4]>([1, 2, 128, 256])];
+            tensor<bool, [4]> var_8098_end_mask_0 = const()[name = string("op_8098_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_8098_cast_fp16 = slice_by_index(begin = var_8098_begin_0, end = var_8098_end_0, end_mask = var_8098_end_mask_0, x = value_heads_81_cast_fp16)[name = string("op_8098_cast_fp16")];
+            tensor<int32, [4]> var_8110_begin_0 = const()[name = string("op_8110_begin_0"), val = tensor<int32, [4]>([0, 2, 0, 0])];
+            tensor<int32, [4]> var_8110_end_0 = const()[name = string("op_8110_end_0"), val = tensor<int32, [4]>([1, 3, 128, 256])];
+            tensor<bool, [4]> var_8110_end_mask_0 = const()[name = string("op_8110_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_8110_cast_fp16 = slice_by_index(begin = var_8110_begin_0, end = var_8110_end_0, end_mask = var_8110_end_mask_0, x = key_heads_81_cast_fp16)[name = string("op_8110_cast_fp16")];
+            tensor<int32, [4]> var_8114_begin_0 = const()[name = string("op_8114_begin_0"), val = tensor<int32, [4]>([0, 2, 0, 0])];
+            tensor<int32, [4]> var_8114_end_0 = const()[name = string("op_8114_end_0"), val = tensor<int32, [4]>([1, 3, 128, 256])];
+            tensor<bool, [4]> var_8114_end_mask_0 = const()[name = string("op_8114_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_8114_cast_fp16 = slice_by_index(begin = var_8114_begin_0, end = var_8114_end_0, end_mask = var_8114_end_mask_0, x = value_heads_81_cast_fp16)[name = string("op_8114_cast_fp16")];
+            tensor<int32, [4]> var_8126_begin_0 = const()[name = string("op_8126_begin_0"), val = tensor<int32, [4]>([0, 3, 0, 0])];
+            tensor<int32, [4]> var_8126_end_0 = const()[name = string("op_8126_end_0"), val = tensor<int32, [4]>([1, 4, 128, 256])];
+            tensor<bool, [4]> var_8126_end_mask_0 = const()[name = string("op_8126_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_8126_cast_fp16 = slice_by_index(begin = var_8126_begin_0, end = var_8126_end_0, end_mask = var_8126_end_mask_0, x = key_heads_81_cast_fp16)[name = string("op_8126_cast_fp16")];
+            tensor<int32, [4]> var_8130_begin_0 = const()[name = string("op_8130_begin_0"), val = tensor<int32, [4]>([0, 3, 0, 0])];
+            tensor<int32, [4]> var_8130_end_0 = const()[name = string("op_8130_end_0"), val = tensor<int32, [4]>([1, 4, 128, 256])];
+            tensor<bool, [4]> var_8130_end_mask_0 = const()[name = string("op_8130_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_8130_cast_fp16 = slice_by_index(begin = var_8130_begin_0, end = var_8130_end_0, end_mask = var_8130_end_mask_0, x = value_heads_81_cast_fp16)[name = string("op_8130_cast_fp16")];
+            tensor<int32, [4]> var_8142_begin_0 = const()[name = string("op_8142_begin_0"), val = tensor<int32, [4]>([0, 4, 0, 0])];
+            tensor<int32, [4]> var_8142_end_0 = const()[name = string("op_8142_end_0"), val = tensor<int32, [4]>([1, 5, 128, 256])];
+            tensor<bool, [4]> var_8142_end_mask_0 = const()[name = string("op_8142_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_8142_cast_fp16 = slice_by_index(begin = var_8142_begin_0, end = var_8142_end_0, end_mask = var_8142_end_mask_0, x = key_heads_81_cast_fp16)[name = string("op_8142_cast_fp16")];
+            tensor<int32, [4]> var_8146_begin_0 = const()[name = string("op_8146_begin_0"), val = tensor<int32, [4]>([0, 4, 0, 0])];
+            tensor<int32, [4]> var_8146_end_0 = const()[name = string("op_8146_end_0"), val = tensor<int32, [4]>([1, 5, 128, 256])];
+            tensor<bool, [4]> var_8146_end_mask_0 = const()[name = string("op_8146_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_8146_cast_fp16 = slice_by_index(begin = var_8146_begin_0, end = var_8146_end_0, end_mask = var_8146_end_mask_0, x = value_heads_81_cast_fp16)[name = string("op_8146_cast_fp16")];
+            tensor<int32, [4]> var_8158_begin_0 = const()[name = string("op_8158_begin_0"), val = tensor<int32, [4]>([0, 5, 0, 0])];
+            tensor<int32, [4]> var_8158_end_0 = const()[name = string("op_8158_end_0"), val = tensor<int32, [4]>([1, 6, 128, 256])];
+            tensor<bool, [4]> var_8158_end_mask_0 = const()[name = string("op_8158_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_8158_cast_fp16 = slice_by_index(begin = var_8158_begin_0, end = var_8158_end_0, end_mask = var_8158_end_mask_0, x = key_heads_81_cast_fp16)[name = string("op_8158_cast_fp16")];
+            tensor<int32, [4]> var_8162_begin_0 = const()[name = string("op_8162_begin_0"), val = tensor<int32, [4]>([0, 5, 0, 0])];
+            tensor<int32, [4]> var_8162_end_0 = const()[name = string("op_8162_end_0"), val = tensor<int32, [4]>([1, 6, 128, 256])];
+            tensor<bool, [4]> var_8162_end_mask_0 = const()[name = string("op_8162_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_8162_cast_fp16 = slice_by_index(begin = var_8162_begin_0, end = var_8162_end_0, end_mask = var_8162_end_mask_0, x = value_heads_81_cast_fp16)[name = string("op_8162_cast_fp16")];
+            tensor<int32, [4]> var_8174_begin_0 = const()[name = string("op_8174_begin_0"), val = tensor<int32, [4]>([0, 6, 0, 0])];
+            tensor<int32, [4]> var_8174_end_0 = const()[name = string("op_8174_end_0"), val = tensor<int32, [4]>([1, 7, 128, 256])];
+            tensor<bool, [4]> var_8174_end_mask_0 = const()[name = string("op_8174_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_8174_cast_fp16 = slice_by_index(begin = var_8174_begin_0, end = var_8174_end_0, end_mask = var_8174_end_mask_0, x = key_heads_81_cast_fp16)[name = string("op_8174_cast_fp16")];
+            tensor<int32, [4]> var_8178_begin_0 = const()[name = string("op_8178_begin_0"), val = tensor<int32, [4]>([0, 6, 0, 0])];
+            tensor<int32, [4]> var_8178_end_0 = const()[name = string("op_8178_end_0"), val = tensor<int32, [4]>([1, 7, 128, 256])];
+            tensor<bool, [4]> var_8178_end_mask_0 = const()[name = string("op_8178_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_8178_cast_fp16 = slice_by_index(begin = var_8178_begin_0, end = var_8178_end_0, end_mask = var_8178_end_mask_0, x = value_heads_81_cast_fp16)[name = string("op_8178_cast_fp16")];
+            tensor<int32, [4]> var_8190_begin_0 = const()[name = string("op_8190_begin_0"), val = tensor<int32, [4]>([0, 7, 0, 0])];
+            tensor<int32, [4]> var_8190_end_0 = const()[name = string("op_8190_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_8190_end_mask_0 = const()[name = string("op_8190_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_8190_cast_fp16 = slice_by_index(begin = var_8190_begin_0, end = var_8190_end_0, end_mask = var_8190_end_mask_0, x = key_heads_81_cast_fp16)[name = string("op_8190_cast_fp16")];
+            tensor<int32, [4]> var_8194_begin_0 = const()[name = string("op_8194_begin_0"), val = tensor<int32, [4]>([0, 7, 0, 0])];
+            tensor<int32, [4]> var_8194_end_0 = const()[name = string("op_8194_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_8194_end_mask_0 = const()[name = string("op_8194_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_8194_cast_fp16 = slice_by_index(begin = var_8194_begin_0, end = var_8194_end_0, end_mask = var_8194_end_mask_0, x = value_heads_81_cast_fp16)[name = string("op_8194_cast_fp16")];
+            bool key_heads_83_interleave_0 = const()[name = string("key_heads_83_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 256]> key_heads_83_cast_fp16 = concat(axis = var_7920, interleave = key_heads_83_interleave_0, values = (var_8078_cast_fp16, var_8078_cast_fp16, var_8094_cast_fp16, var_8094_cast_fp16, var_8110_cast_fp16, var_8110_cast_fp16, var_8126_cast_fp16, var_8126_cast_fp16, var_8142_cast_fp16, var_8142_cast_fp16, var_8158_cast_fp16, var_8158_cast_fp16, var_8174_cast_fp16, var_8174_cast_fp16, var_8190_cast_fp16, var_8190_cast_fp16))[name = string("key_heads_83_cast_fp16")];
+            bool value_heads_83_interleave_0 = const()[name = string("value_heads_83_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 256]> value_heads_83_cast_fp16 = concat(axis = var_7920, interleave = value_heads_83_interleave_0, values = (var_8082_cast_fp16, var_8082_cast_fp16, var_8098_cast_fp16, var_8098_cast_fp16, var_8114_cast_fp16, var_8114_cast_fp16, var_8130_cast_fp16, var_8130_cast_fp16, var_8146_cast_fp16, var_8146_cast_fp16, var_8162_cast_fp16, var_8162_cast_fp16, var_8178_cast_fp16, var_8178_cast_fp16, var_8194_cast_fp16, var_8194_cast_fp16))[name = string("value_heads_83_cast_fp16")];
+            fp16 var_8217_to_fp16 = const()[name = string("op_8217_to_fp16"), val = fp16(0x1.6ap-4)];
+            tensor<fp16, [1, 16, 128, 1]> var_8218_cast_fp16 = mul(x = mh_q_123_cast_fp16, y = var_8217_to_fp16)[name = string("op_8218_cast_fp16")];
+            bool mh_w_81_transpose_x_0 = const()[name = string("mh_w_81_transpose_x_0"), val = bool(true)];
+            bool mh_w_81_transpose_y_0 = const()[name = string("mh_w_81_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 1, 256]> mh_w_81_cast_fp16 = matmul(transpose_x = mh_w_81_transpose_x_0, transpose_y = mh_w_81_transpose_y_0, x = var_8218_cast_fp16, y = key_heads_83_cast_fp16)[name = string("mh_w_81_cast_fp16")];
+            tensor<fp16, [1, 16, 1, 256]> mh_w_83_cast_fp16 = add(x = mh_w_81_cast_fp16, y = var_487_cast_fp16)[name = string("mh_w_83_cast_fp16")];
+            tensor<fp16, [1, 16, 1, 256]> var_8230_cast_fp16 = softmax(axis = var_7902, x = mh_w_83_cast_fp16)[name = string("op_8230_cast_fp16")];
+            bool attn_41_transpose_x_0 = const()[name = string("attn_41_transpose_x_0"), val = bool(false)];
+            bool attn_41_transpose_y_0 = const()[name = string("attn_41_transpose_y_0"), val = bool(true)];
+            tensor<fp16, [1, 16, 128, 1]> attn_41_cast_fp16 = matmul(transpose_x = attn_41_transpose_x_0, transpose_y = attn_41_transpose_y_0, x = value_heads_83_cast_fp16, y = var_8230_cast_fp16)[name = string("attn_41_cast_fp16")];
+            tensor<int32, [4]> var_8235 = const()[name = string("op_8235"), val = tensor<int32, [4]>([1, -1, 1, 1])];
+            tensor<fp16, [1, 2048, 1, 1]> input_161_cast_fp16 = reshape(shape = var_8235, x = attn_41_cast_fp16)[name = string("input_161_cast_fp16")];
+            string obj_171_pad_type_0 = const()[name = string("obj_171_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> obj_171_strides_0 = const()[name = string("obj_171_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> obj_171_pad_0 = const()[name = string("obj_171_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> obj_171_dilations_0 = const()[name = string("obj_171_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 obj_171_groups_0 = const()[name = string("obj_171_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 2048, 1, 1]> layers_20_self_attn_o_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(319032192))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(321129408))))[name = string("layers_20_self_attn_o_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> obj_171_cast_fp16 = conv(bias = layers_0_self_attn_v_proj_bias_to_fp16, dilations = obj_171_dilations_0, groups = obj_171_groups_0, pad = obj_171_pad_0, pad_type = obj_171_pad_type_0, strides = obj_171_strides_0, weight = layers_20_self_attn_o_proj_weight_to_fp16_palettized, x = input_161_cast_fp16)[name = string("obj_171_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> inputs_165_cast_fp16 = add(x = inputs_159_cast_fp16, y = obj_171_cast_fp16)[name = string("inputs_165_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> inputs_sq_167_cast_fp16 = mul(x = inputs_165_cast_fp16, y = inputs_165_cast_fp16)[name = string("inputs_sq_167_cast_fp16")];
+            tensor<int32, [1]> variance_167_axes_0 = const()[name = string("variance_167_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_167_keep_dims_0 = const()[name = string("variance_167_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 1, 1, 1]> variance_167_cast_fp16 = reduce_mean(axes = variance_167_axes_0, keep_dims = variance_167_keep_dims_0, x = inputs_sq_167_cast_fp16)[name = string("variance_167_cast_fp16")];
+            fp16 var_8253_to_fp16 = const()[name = string("op_8253_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 1, 1]> var_8254_cast_fp16 = add(x = variance_167_cast_fp16, y = var_8253_to_fp16)[name = string("op_8254_cast_fp16")];
+            fp32 var_8255_epsilon_0 = const()[name = string("op_8255_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 1, 1, 1]> var_8255_cast_fp16 = rsqrt(epsilon = var_8255_epsilon_0, x = var_8254_cast_fp16)[name = string("op_8255_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> hidden_states_207_cast_fp16 = mul(x = inputs_165_cast_fp16, y = var_8255_cast_fp16)[name = string("hidden_states_207_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> w_167_to_fp16 = const()[name = string("w_167_to_fp16"), val = tensor<fp16, [1, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(321129984)))];
+            tensor<fp16, [1, 1024, 1, 1]> input_163_cast_fp16 = mul(x = w_167_to_fp16, y = hidden_states_207_cast_fp16)[name = string("input_163_cast_fp16")];
+            string input_165_pad_type_0 = const()[name = string("input_165_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> input_165_strides_0 = const()[name = string("input_165_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> input_165_pad_0 = const()[name = string("input_165_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> input_165_dilations_0 = const()[name = string("input_165_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 input_165_groups_0 = const()[name = string("input_165_groups_0"), val = int32(1)];
+            tensor<fp16, [3072, 1024, 1, 1]> layers_20_mlp_gate_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [3072, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(321132096))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(324277888))))[name = string("layers_20_mlp_gate_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 3072, 1, 1]> input_165_cast_fp16 = conv(dilations = input_165_dilations_0, groups = input_165_groups_0, pad = input_165_pad_0, pad_type = input_165_pad_type_0, strides = input_165_strides_0, weight = layers_20_mlp_gate_proj_weight_to_fp16_palettized, x = input_163_cast_fp16)[name = string("input_165_cast_fp16")];
+            tensor<fp16, [1, 3072, 1, 1]> var_8269_cast_fp16 = silu(x = input_165_cast_fp16)[name = string("op_8269_cast_fp16")];
+            string var_8275_pad_type_0 = const()[name = string("op_8275_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_8275_strides_0 = const()[name = string("op_8275_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_8275_pad_0 = const()[name = string("op_8275_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_8275_dilations_0 = const()[name = string("op_8275_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_8275_groups_0 = const()[name = string("op_8275_groups_0"), val = int32(1)];
+            tensor<fp16, [3072, 1024, 1, 1]> layers_20_mlp_up_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [3072, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(324278464))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(327424256))))[name = string("layers_20_mlp_up_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 3072, 1, 1]> var_8275_cast_fp16 = conv(dilations = var_8275_dilations_0, groups = var_8275_groups_0, pad = var_8275_pad_0, pad_type = var_8275_pad_type_0, strides = var_8275_strides_0, weight = layers_20_mlp_up_proj_weight_to_fp16_palettized, x = input_163_cast_fp16)[name = string("op_8275_cast_fp16")];
+            tensor<fp16, [1, 3072, 1, 1]> input_167_cast_fp16 = mul(x = var_8269_cast_fp16, y = var_8275_cast_fp16)[name = string("input_167_cast_fp16")];
+            string hidden_states_209_pad_type_0 = const()[name = string("hidden_states_209_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> hidden_states_209_strides_0 = const()[name = string("hidden_states_209_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> hidden_states_209_pad_0 = const()[name = string("hidden_states_209_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> hidden_states_209_dilations_0 = const()[name = string("hidden_states_209_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 hidden_states_209_groups_0 = const()[name = string("hidden_states_209_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 3072, 1, 1]> layers_20_mlp_down_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 3072, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(327424832))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(330570624))))[name = string("layers_20_mlp_down_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> hidden_states_209_cast_fp16 = conv(dilations = hidden_states_209_dilations_0, groups = hidden_states_209_groups_0, pad = hidden_states_209_pad_0, pad_type = hidden_states_209_pad_type_0, strides = hidden_states_209_strides_0, weight = layers_20_mlp_down_proj_weight_to_fp16_palettized, x = input_167_cast_fp16)[name = string("hidden_states_209_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> inputs_167_cast_fp16 = add(x = inputs_165_cast_fp16, y = hidden_states_209_cast_fp16)[name = string("inputs_167_cast_fp16")];
+            int32 var_8289 = const()[name = string("op_8289"), val = int32(3)];
+            int32 var_8299 = const()[name = string("op_8299"), val = int32(-2)];
+            int32 var_8307 = const()[name = string("op_8307"), val = int32(1)];
+            tensor<fp16, [1, 1024, 1, 1]> inputs_sq_169_cast_fp16 = mul(x = inputs_167_cast_fp16, y = inputs_167_cast_fp16)[name = string("inputs_sq_169_cast_fp16")];
+            tensor<int32, [1]> variance_169_axes_0 = const()[name = string("variance_169_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_169_keep_dims_0 = const()[name = string("variance_169_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 1, 1, 1]> variance_169_cast_fp16 = reduce_mean(axes = variance_169_axes_0, keep_dims = variance_169_keep_dims_0, x = inputs_sq_169_cast_fp16)[name = string("variance_169_cast_fp16")];
+            fp16 var_8319_to_fp16 = const()[name = string("op_8319_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 1, 1]> var_8320_cast_fp16 = add(x = variance_169_cast_fp16, y = var_8319_to_fp16)[name = string("op_8320_cast_fp16")];
+            fp32 var_8321_epsilon_0 = const()[name = string("op_8321_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 1, 1, 1]> var_8321_cast_fp16 = rsqrt(epsilon = var_8321_epsilon_0, x = var_8320_cast_fp16)[name = string("op_8321_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> hidden_states_211_cast_fp16 = mul(x = inputs_167_cast_fp16, y = var_8321_cast_fp16)[name = string("hidden_states_211_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> w_169_to_fp16 = const()[name = string("w_169_to_fp16"), val = tensor<fp16, [1, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(330571200)))];
+            tensor<fp16, [1, 1024, 1, 1]> obj_173_cast_fp16 = mul(x = w_169_to_fp16, y = hidden_states_211_cast_fp16)[name = string("obj_173_cast_fp16")];
+            string query_127_pad_type_0 = const()[name = string("query_127_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> query_127_strides_0 = const()[name = string("query_127_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> query_127_pad_0 = const()[name = string("query_127_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> query_127_dilations_0 = const()[name = string("query_127_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 query_127_groups_0 = const()[name = string("query_127_groups_0"), val = int32(1)];
+            tensor<fp16, [2048, 1024, 1, 1]> layers_21_self_attn_q_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [2048, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(330573312))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(332670528))))[name = string("layers_21_self_attn_q_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1, 1]> query_127_cast_fp16 = conv(bias = layers_0_self_attn_q_proj_bias_to_fp16, dilations = query_127_dilations_0, groups = query_127_groups_0, pad = query_127_pad_0, pad_type = query_127_pad_type_0, strides = query_127_strides_0, weight = layers_21_self_attn_q_proj_weight_to_fp16_palettized, x = obj_173_cast_fp16)[name = string("query_127_cast_fp16")];
+            string current_key_85_pad_type_0 = const()[name = string("current_key_85_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> current_key_85_strides_0 = const()[name = string("current_key_85_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> current_key_85_pad_0 = const()[name = string("current_key_85_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> current_key_85_dilations_0 = const()[name = string("current_key_85_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 current_key_85_groups_0 = const()[name = string("current_key_85_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_21_self_attn_k_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(332671104))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(333719744))))[name = string("layers_21_self_attn_k_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> current_key_85_cast_fp16 = conv(dilations = current_key_85_dilations_0, groups = current_key_85_groups_0, pad = current_key_85_pad_0, pad_type = current_key_85_pad_type_0, strides = current_key_85_strides_0, weight = layers_21_self_attn_k_proj_weight_to_fp16_palettized, x = obj_173_cast_fp16)[name = string("current_key_85_cast_fp16")];
+            string current_value_43_pad_type_0 = const()[name = string("current_value_43_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> current_value_43_strides_0 = const()[name = string("current_value_43_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> current_value_43_pad_0 = const()[name = string("current_value_43_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> current_value_43_dilations_0 = const()[name = string("current_value_43_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 current_value_43_groups_0 = const()[name = string("current_value_43_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_21_self_attn_v_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(333720320))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(334768960))))[name = string("layers_21_self_attn_v_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> current_value_43_cast_fp16 = conv(bias = layers_0_self_attn_v_proj_bias_to_fp16, dilations = current_value_43_dilations_0, groups = current_value_43_groups_0, pad = current_value_43_pad_0, pad_type = current_value_43_pad_type_0, strides = current_value_43_strides_0, weight = layers_21_self_attn_v_proj_weight_to_fp16_palettized, x = obj_173_cast_fp16)[name = string("current_value_43_cast_fp16")];
+            tensor<int32, [4]> var_8358 = const()[name = string("op_8358"), val = tensor<int32, [4]>([16, 128, 1, 1])];
+            tensor<fp16, [16, 128, 1, 1]> inputs_169_cast_fp16 = reshape(shape = var_8358, x = query_127_cast_fp16)[name = string("inputs_169_cast_fp16")];
+            tensor<fp16, [16, 128, 1, 1]> inputs_sq_171_cast_fp16 = mul(x = inputs_169_cast_fp16, y = inputs_169_cast_fp16)[name = string("inputs_sq_171_cast_fp16")];
+            tensor<int32, [1]> variance_171_axes_0 = const()[name = string("variance_171_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_171_keep_dims_0 = const()[name = string("variance_171_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [16, 1, 1, 1]> variance_171_cast_fp16 = reduce_mean(axes = variance_171_axes_0, keep_dims = variance_171_keep_dims_0, x = inputs_sq_171_cast_fp16)[name = string("variance_171_cast_fp16")];
+            fp16 var_8364_to_fp16 = const()[name = string("op_8364_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [16, 1, 1, 1]> var_8365_cast_fp16 = add(x = variance_171_cast_fp16, y = var_8364_to_fp16)[name = string("op_8365_cast_fp16")];
+            fp32 var_8366_epsilon_0 = const()[name = string("op_8366_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [16, 1, 1, 1]> var_8366_cast_fp16 = rsqrt(epsilon = var_8366_epsilon_0, x = var_8365_cast_fp16)[name = string("op_8366_cast_fp16")];
+            tensor<fp16, [16, 128, 1, 1]> hidden_states_213_cast_fp16 = mul(x = inputs_169_cast_fp16, y = var_8366_cast_fp16)[name = string("hidden_states_213_cast_fp16")];
+            tensor<fp16, [1, 128, 1, 1]> w_171_to_fp16 = const()[name = string("w_171_to_fp16"), val = tensor<fp16, [1, 128, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(334769536)))];
+            tensor<fp16, [16, 128, 1, 1]> query_normed_43_cast_fp16 = mul(x = w_171_to_fp16, y = hidden_states_213_cast_fp16)[name = string("query_normed_43_cast_fp16")];
+            tensor<int32, [4]> var_8374 = const()[name = string("op_8374"), val = tensor<int32, [4]>([8, 128, 1, 1])];
+            tensor<fp16, [8, 128, 1, 1]> inputs_171_cast_fp16 = reshape(shape = var_8374, x = current_key_85_cast_fp16)[name = string("inputs_171_cast_fp16")];
+            tensor<fp16, [8, 128, 1, 1]> inputs_sq_173_cast_fp16 = mul(x = inputs_171_cast_fp16, y = inputs_171_cast_fp16)[name = string("inputs_sq_173_cast_fp16")];
+            tensor<int32, [1]> variance_173_axes_0 = const()[name = string("variance_173_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_173_keep_dims_0 = const()[name = string("variance_173_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [8, 1, 1, 1]> variance_173_cast_fp16 = reduce_mean(axes = variance_173_axes_0, keep_dims = variance_173_keep_dims_0, x = inputs_sq_173_cast_fp16)[name = string("variance_173_cast_fp16")];
+            fp16 var_8380_to_fp16 = const()[name = string("op_8380_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [8, 1, 1, 1]> var_8381_cast_fp16 = add(x = variance_173_cast_fp16, y = var_8380_to_fp16)[name = string("op_8381_cast_fp16")];
+            fp32 var_8382_epsilon_0 = const()[name = string("op_8382_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [8, 1, 1, 1]> var_8382_cast_fp16 = rsqrt(epsilon = var_8382_epsilon_0, x = var_8381_cast_fp16)[name = string("op_8382_cast_fp16")];
+            tensor<fp16, [8, 128, 1, 1]> hidden_states_215_cast_fp16 = mul(x = inputs_171_cast_fp16, y = var_8382_cast_fp16)[name = string("hidden_states_215_cast_fp16")];
+            tensor<fp16, [1, 128, 1, 1]> w_173_to_fp16 = const()[name = string("w_173_to_fp16"), val = tensor<fp16, [1, 128, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(334769856)))];
+            tensor<fp16, [8, 128, 1, 1]> current_key_normed_43_cast_fp16 = mul(x = w_173_to_fp16, y = hidden_states_215_cast_fp16)[name = string("current_key_normed_43_cast_fp16")];
+            tensor<int32, [4]> var_8400 = const()[name = string("op_8400"), val = tensor<int32, [4]>([1, 16, 128, -1])];
+            tensor<fp16, [1, 16, 128, 1]> mh_q_127_cast_fp16 = reshape(shape = var_8400, x = query_normed_43_cast_fp16)[name = string("mh_q_127_cast_fp16")];
+            tensor<int32, [4]> var_8402 = const()[name = string("op_8402"), val = tensor<int32, [4]>([1, 8, 128, -1])];
+            tensor<fp16, [1, 8, 128, 1]> mh_k_85_cast_fp16 = reshape(shape = var_8402, x = current_key_normed_43_cast_fp16)[name = string("mh_k_85_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> var_8406_cast_fp16 = mul(x = mh_q_127_cast_fp16, y = cos_1_cast_fp16)[name = string("op_8406_cast_fp16")];
+            tensor<int32, [4]> var_8411_begin_0 = const()[name = string("op_8411_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_8411_end_0 = const()[name = string("op_8411_end_0"), val = tensor<int32, [4]>([1, 16, 64, 1])];
+            tensor<bool, [4]> var_8411_end_mask_0 = const()[name = string("op_8411_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 16, 64, 1]> var_8411_cast_fp16 = slice_by_index(begin = var_8411_begin_0, end = var_8411_end_0, end_mask = var_8411_end_mask_0, x = mh_q_127_cast_fp16)[name = string("op_8411_cast_fp16")];
+            tensor<int32, [4]> var_8417_begin_0 = const()[name = string("op_8417_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_8417_end_0 = const()[name = string("op_8417_end_0"), val = tensor<int32, [4]>([1, 16, 128, 1])];
+            tensor<bool, [4]> var_8417_end_mask_0 = const()[name = string("op_8417_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 16, 64, 1]> var_8417_cast_fp16 = slice_by_index(begin = var_8417_begin_0, end = var_8417_end_0, end_mask = var_8417_end_mask_0, x = mh_q_127_cast_fp16)[name = string("op_8417_cast_fp16")];
+            fp16 const_500_promoted_to_fp16 = const()[name = string("const_500_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 16, 64, 1]> var_8419_cast_fp16 = mul(x = var_8417_cast_fp16, y = const_500_promoted_to_fp16)[name = string("op_8419_cast_fp16")];
+            bool var_8421_interleave_0 = const()[name = string("op_8421_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 1]> var_8421_cast_fp16 = concat(axis = var_8299, interleave = var_8421_interleave_0, values = (var_8419_cast_fp16, var_8411_cast_fp16))[name = string("op_8421_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> var_8422_cast_fp16 = mul(x = var_8421_cast_fp16, y = sin_1_cast_fp16)[name = string("op_8422_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> mh_q_129_cast_fp16 = add(x = var_8406_cast_fp16, y = var_8422_cast_fp16)[name = string("mh_q_129_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> var_8424_cast_fp16 = mul(x = mh_k_85_cast_fp16, y = cos_1_cast_fp16)[name = string("op_8424_cast_fp16")];
+            tensor<int32, [4]> var_8429_begin_0 = const()[name = string("op_8429_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_8429_end_0 = const()[name = string("op_8429_end_0"), val = tensor<int32, [4]>([1, 8, 64, 1])];
+            tensor<bool, [4]> var_8429_end_mask_0 = const()[name = string("op_8429_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 8, 64, 1]> var_8429_cast_fp16 = slice_by_index(begin = var_8429_begin_0, end = var_8429_end_0, end_mask = var_8429_end_mask_0, x = mh_k_85_cast_fp16)[name = string("op_8429_cast_fp16")];
+            tensor<int32, [4]> var_8435_begin_0 = const()[name = string("op_8435_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_8435_end_0 = const()[name = string("op_8435_end_0"), val = tensor<int32, [4]>([1, 8, 128, 1])];
+            tensor<bool, [4]> var_8435_end_mask_0 = const()[name = string("op_8435_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 8, 64, 1]> var_8435_cast_fp16 = slice_by_index(begin = var_8435_begin_0, end = var_8435_end_0, end_mask = var_8435_end_mask_0, x = mh_k_85_cast_fp16)[name = string("op_8435_cast_fp16")];
+            fp16 const_503_promoted_to_fp16 = const()[name = string("const_503_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 64, 1]> var_8437_cast_fp16 = mul(x = var_8435_cast_fp16, y = const_503_promoted_to_fp16)[name = string("op_8437_cast_fp16")];
+            bool var_8439_interleave_0 = const()[name = string("op_8439_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 128, 1]> var_8439_cast_fp16 = concat(axis = var_8299, interleave = var_8439_interleave_0, values = (var_8437_cast_fp16, var_8429_cast_fp16))[name = string("op_8439_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> var_8440_cast_fp16 = mul(x = var_8439_cast_fp16, y = sin_1_cast_fp16)[name = string("op_8440_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> mh_k_87_cast_fp16 = add(x = var_8424_cast_fp16, y = var_8440_cast_fp16)[name = string("mh_k_87_cast_fp16")];
+            tensor<int32, [4]> var_8444 = const()[name = string("op_8444"), val = tensor<int32, [4]>([1, 1024, 1, 1])];
+            tensor<fp16, [1, 1024, 1, 1]> current_key_87_cast_fp16 = reshape(shape = var_8444, x = mh_k_87_cast_fp16)[name = string("current_key_87_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_8451_cast_fp16 = mul(x = var_101_cast_fp16_21, y = var_323_cast_fp16)[name = string("op_8451_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_8452_cast_fp16 = mul(x = current_key_87_cast_fp16, y = var_321_cast_fp16)[name = string("op_8452_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> key_129_cast_fp16 = add(x = var_8451_cast_fp16, y = var_8452_cast_fp16)[name = string("key_129_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_8455_cast_fp16 = mul(x = var_132_cast_fp16_21, y = var_323_cast_fp16)[name = string("op_8455_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_8456_cast_fp16 = mul(x = current_value_43_cast_fp16, y = var_321_cast_fp16)[name = string("op_8456_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> value_85_cast_fp16 = add(x = var_8455_cast_fp16, y = var_8456_cast_fp16)[name = string("value_85_cast_fp16")];
+            tensor<int32, [4]> var_8460 = const()[name = string("op_8460"), val = tensor<int32, [4]>([1, 8, 128, 256])];
+            tensor<fp16, [1, 8, 128, 256]> key_heads_85_cast_fp16 = reshape(shape = var_8460, x = key_129_cast_fp16)[name = string("key_heads_85_cast_fp16")];
+            tensor<int32, [4]> var_8462 = const()[name = string("op_8462"), val = tensor<int32, [4]>([1, 8, 128, 256])];
+            tensor<fp16, [1, 8, 128, 256]> value_heads_85_cast_fp16 = reshape(shape = var_8462, x = value_85_cast_fp16)[name = string("value_heads_85_cast_fp16")];
+            tensor<int32, [4]> var_8465_begin_0 = const()[name = string("op_8465_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_8465_end_0 = const()[name = string("op_8465_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_8465_end_mask_0 = const()[name = string("op_8465_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_8465_cast_fp16 = slice_by_index(begin = var_8465_begin_0, end = var_8465_end_0, end_mask = var_8465_end_mask_0, x = key_heads_85_cast_fp16)[name = string("op_8465_cast_fp16")];
+            tensor<int32, [4]> var_8469_begin_0 = const()[name = string("op_8469_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_8469_end_0 = const()[name = string("op_8469_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_8469_end_mask_0 = const()[name = string("op_8469_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_8469_cast_fp16 = slice_by_index(begin = var_8469_begin_0, end = var_8469_end_0, end_mask = var_8469_end_mask_0, x = value_heads_85_cast_fp16)[name = string("op_8469_cast_fp16")];
+            tensor<int32, [4]> var_8481_begin_0 = const()[name = string("op_8481_begin_0"), val = tensor<int32, [4]>([0, 1, 0, 0])];
+            tensor<int32, [4]> var_8481_end_0 = const()[name = string("op_8481_end_0"), val = tensor<int32, [4]>([1, 2, 128, 256])];
+            tensor<bool, [4]> var_8481_end_mask_0 = const()[name = string("op_8481_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_8481_cast_fp16 = slice_by_index(begin = var_8481_begin_0, end = var_8481_end_0, end_mask = var_8481_end_mask_0, x = key_heads_85_cast_fp16)[name = string("op_8481_cast_fp16")];
+            tensor<int32, [4]> var_8485_begin_0 = const()[name = string("op_8485_begin_0"), val = tensor<int32, [4]>([0, 1, 0, 0])];
+            tensor<int32, [4]> var_8485_end_0 = const()[name = string("op_8485_end_0"), val = tensor<int32, [4]>([1, 2, 128, 256])];
+            tensor<bool, [4]> var_8485_end_mask_0 = const()[name = string("op_8485_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_8485_cast_fp16 = slice_by_index(begin = var_8485_begin_0, end = var_8485_end_0, end_mask = var_8485_end_mask_0, x = value_heads_85_cast_fp16)[name = string("op_8485_cast_fp16")];
+            tensor<int32, [4]> var_8497_begin_0 = const()[name = string("op_8497_begin_0"), val = tensor<int32, [4]>([0, 2, 0, 0])];
+            tensor<int32, [4]> var_8497_end_0 = const()[name = string("op_8497_end_0"), val = tensor<int32, [4]>([1, 3, 128, 256])];
+            tensor<bool, [4]> var_8497_end_mask_0 = const()[name = string("op_8497_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_8497_cast_fp16 = slice_by_index(begin = var_8497_begin_0, end = var_8497_end_0, end_mask = var_8497_end_mask_0, x = key_heads_85_cast_fp16)[name = string("op_8497_cast_fp16")];
+            tensor<int32, [4]> var_8501_begin_0 = const()[name = string("op_8501_begin_0"), val = tensor<int32, [4]>([0, 2, 0, 0])];
+            tensor<int32, [4]> var_8501_end_0 = const()[name = string("op_8501_end_0"), val = tensor<int32, [4]>([1, 3, 128, 256])];
+            tensor<bool, [4]> var_8501_end_mask_0 = const()[name = string("op_8501_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_8501_cast_fp16 = slice_by_index(begin = var_8501_begin_0, end = var_8501_end_0, end_mask = var_8501_end_mask_0, x = value_heads_85_cast_fp16)[name = string("op_8501_cast_fp16")];
+            tensor<int32, [4]> var_8513_begin_0 = const()[name = string("op_8513_begin_0"), val = tensor<int32, [4]>([0, 3, 0, 0])];
+            tensor<int32, [4]> var_8513_end_0 = const()[name = string("op_8513_end_0"), val = tensor<int32, [4]>([1, 4, 128, 256])];
+            tensor<bool, [4]> var_8513_end_mask_0 = const()[name = string("op_8513_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_8513_cast_fp16 = slice_by_index(begin = var_8513_begin_0, end = var_8513_end_0, end_mask = var_8513_end_mask_0, x = key_heads_85_cast_fp16)[name = string("op_8513_cast_fp16")];
+            tensor<int32, [4]> var_8517_begin_0 = const()[name = string("op_8517_begin_0"), val = tensor<int32, [4]>([0, 3, 0, 0])];
+            tensor<int32, [4]> var_8517_end_0 = const()[name = string("op_8517_end_0"), val = tensor<int32, [4]>([1, 4, 128, 256])];
+            tensor<bool, [4]> var_8517_end_mask_0 = const()[name = string("op_8517_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_8517_cast_fp16 = slice_by_index(begin = var_8517_begin_0, end = var_8517_end_0, end_mask = var_8517_end_mask_0, x = value_heads_85_cast_fp16)[name = string("op_8517_cast_fp16")];
+            tensor<int32, [4]> var_8529_begin_0 = const()[name = string("op_8529_begin_0"), val = tensor<int32, [4]>([0, 4, 0, 0])];
+            tensor<int32, [4]> var_8529_end_0 = const()[name = string("op_8529_end_0"), val = tensor<int32, [4]>([1, 5, 128, 256])];
+            tensor<bool, [4]> var_8529_end_mask_0 = const()[name = string("op_8529_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_8529_cast_fp16 = slice_by_index(begin = var_8529_begin_0, end = var_8529_end_0, end_mask = var_8529_end_mask_0, x = key_heads_85_cast_fp16)[name = string("op_8529_cast_fp16")];
+            tensor<int32, [4]> var_8533_begin_0 = const()[name = string("op_8533_begin_0"), val = tensor<int32, [4]>([0, 4, 0, 0])];
+            tensor<int32, [4]> var_8533_end_0 = const()[name = string("op_8533_end_0"), val = tensor<int32, [4]>([1, 5, 128, 256])];
+            tensor<bool, [4]> var_8533_end_mask_0 = const()[name = string("op_8533_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_8533_cast_fp16 = slice_by_index(begin = var_8533_begin_0, end = var_8533_end_0, end_mask = var_8533_end_mask_0, x = value_heads_85_cast_fp16)[name = string("op_8533_cast_fp16")];
+            tensor<int32, [4]> var_8545_begin_0 = const()[name = string("op_8545_begin_0"), val = tensor<int32, [4]>([0, 5, 0, 0])];
+            tensor<int32, [4]> var_8545_end_0 = const()[name = string("op_8545_end_0"), val = tensor<int32, [4]>([1, 6, 128, 256])];
+            tensor<bool, [4]> var_8545_end_mask_0 = const()[name = string("op_8545_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_8545_cast_fp16 = slice_by_index(begin = var_8545_begin_0, end = var_8545_end_0, end_mask = var_8545_end_mask_0, x = key_heads_85_cast_fp16)[name = string("op_8545_cast_fp16")];
+            tensor<int32, [4]> var_8549_begin_0 = const()[name = string("op_8549_begin_0"), val = tensor<int32, [4]>([0, 5, 0, 0])];
+            tensor<int32, [4]> var_8549_end_0 = const()[name = string("op_8549_end_0"), val = tensor<int32, [4]>([1, 6, 128, 256])];
+            tensor<bool, [4]> var_8549_end_mask_0 = const()[name = string("op_8549_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_8549_cast_fp16 = slice_by_index(begin = var_8549_begin_0, end = var_8549_end_0, end_mask = var_8549_end_mask_0, x = value_heads_85_cast_fp16)[name = string("op_8549_cast_fp16")];
+            tensor<int32, [4]> var_8561_begin_0 = const()[name = string("op_8561_begin_0"), val = tensor<int32, [4]>([0, 6, 0, 0])];
+            tensor<int32, [4]> var_8561_end_0 = const()[name = string("op_8561_end_0"), val = tensor<int32, [4]>([1, 7, 128, 256])];
+            tensor<bool, [4]> var_8561_end_mask_0 = const()[name = string("op_8561_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_8561_cast_fp16 = slice_by_index(begin = var_8561_begin_0, end = var_8561_end_0, end_mask = var_8561_end_mask_0, x = key_heads_85_cast_fp16)[name = string("op_8561_cast_fp16")];
+            tensor<int32, [4]> var_8565_begin_0 = const()[name = string("op_8565_begin_0"), val = tensor<int32, [4]>([0, 6, 0, 0])];
+            tensor<int32, [4]> var_8565_end_0 = const()[name = string("op_8565_end_0"), val = tensor<int32, [4]>([1, 7, 128, 256])];
+            tensor<bool, [4]> var_8565_end_mask_0 = const()[name = string("op_8565_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_8565_cast_fp16 = slice_by_index(begin = var_8565_begin_0, end = var_8565_end_0, end_mask = var_8565_end_mask_0, x = value_heads_85_cast_fp16)[name = string("op_8565_cast_fp16")];
+            tensor<int32, [4]> var_8577_begin_0 = const()[name = string("op_8577_begin_0"), val = tensor<int32, [4]>([0, 7, 0, 0])];
+            tensor<int32, [4]> var_8577_end_0 = const()[name = string("op_8577_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_8577_end_mask_0 = const()[name = string("op_8577_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_8577_cast_fp16 = slice_by_index(begin = var_8577_begin_0, end = var_8577_end_0, end_mask = var_8577_end_mask_0, x = key_heads_85_cast_fp16)[name = string("op_8577_cast_fp16")];
+            tensor<int32, [4]> var_8581_begin_0 = const()[name = string("op_8581_begin_0"), val = tensor<int32, [4]>([0, 7, 0, 0])];
+            tensor<int32, [4]> var_8581_end_0 = const()[name = string("op_8581_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_8581_end_mask_0 = const()[name = string("op_8581_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_8581_cast_fp16 = slice_by_index(begin = var_8581_begin_0, end = var_8581_end_0, end_mask = var_8581_end_mask_0, x = value_heads_85_cast_fp16)[name = string("op_8581_cast_fp16")];
+            bool key_heads_87_interleave_0 = const()[name = string("key_heads_87_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 256]> key_heads_87_cast_fp16 = concat(axis = var_8307, interleave = key_heads_87_interleave_0, values = (var_8465_cast_fp16, var_8465_cast_fp16, var_8481_cast_fp16, var_8481_cast_fp16, var_8497_cast_fp16, var_8497_cast_fp16, var_8513_cast_fp16, var_8513_cast_fp16, var_8529_cast_fp16, var_8529_cast_fp16, var_8545_cast_fp16, var_8545_cast_fp16, var_8561_cast_fp16, var_8561_cast_fp16, var_8577_cast_fp16, var_8577_cast_fp16))[name = string("key_heads_87_cast_fp16")];
+            bool value_heads_87_interleave_0 = const()[name = string("value_heads_87_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 256]> value_heads_87_cast_fp16 = concat(axis = var_8307, interleave = value_heads_87_interleave_0, values = (var_8469_cast_fp16, var_8469_cast_fp16, var_8485_cast_fp16, var_8485_cast_fp16, var_8501_cast_fp16, var_8501_cast_fp16, var_8517_cast_fp16, var_8517_cast_fp16, var_8533_cast_fp16, var_8533_cast_fp16, var_8549_cast_fp16, var_8549_cast_fp16, var_8565_cast_fp16, var_8565_cast_fp16, var_8581_cast_fp16, var_8581_cast_fp16))[name = string("value_heads_87_cast_fp16")];
+            fp16 var_8604_to_fp16 = const()[name = string("op_8604_to_fp16"), val = fp16(0x1.6ap-4)];
+            tensor<fp16, [1, 16, 128, 1]> var_8605_cast_fp16 = mul(x = mh_q_129_cast_fp16, y = var_8604_to_fp16)[name = string("op_8605_cast_fp16")];
+            bool mh_w_85_transpose_x_0 = const()[name = string("mh_w_85_transpose_x_0"), val = bool(true)];
+            bool mh_w_85_transpose_y_0 = const()[name = string("mh_w_85_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 1, 256]> mh_w_85_cast_fp16 = matmul(transpose_x = mh_w_85_transpose_x_0, transpose_y = mh_w_85_transpose_y_0, x = var_8605_cast_fp16, y = key_heads_87_cast_fp16)[name = string("mh_w_85_cast_fp16")];
+            tensor<fp16, [1, 16, 1, 256]> mh_w_87_cast_fp16 = add(x = mh_w_85_cast_fp16, y = var_487_cast_fp16)[name = string("mh_w_87_cast_fp16")];
+            tensor<fp16, [1, 16, 1, 256]> var_8617_cast_fp16 = softmax(axis = var_8289, x = mh_w_87_cast_fp16)[name = string("op_8617_cast_fp16")];
+            bool attn_43_transpose_x_0 = const()[name = string("attn_43_transpose_x_0"), val = bool(false)];
+            bool attn_43_transpose_y_0 = const()[name = string("attn_43_transpose_y_0"), val = bool(true)];
+            tensor<fp16, [1, 16, 128, 1]> attn_43_cast_fp16 = matmul(transpose_x = attn_43_transpose_x_0, transpose_y = attn_43_transpose_y_0, x = value_heads_87_cast_fp16, y = var_8617_cast_fp16)[name = string("attn_43_cast_fp16")];
+            tensor<int32, [4]> var_8622 = const()[name = string("op_8622"), val = tensor<int32, [4]>([1, -1, 1, 1])];
+            tensor<fp16, [1, 2048, 1, 1]> input_169_cast_fp16 = reshape(shape = var_8622, x = attn_43_cast_fp16)[name = string("input_169_cast_fp16")];
+            string obj_179_pad_type_0 = const()[name = string("obj_179_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> obj_179_strides_0 = const()[name = string("obj_179_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> obj_179_pad_0 = const()[name = string("obj_179_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> obj_179_dilations_0 = const()[name = string("obj_179_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 obj_179_groups_0 = const()[name = string("obj_179_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 2048, 1, 1]> layers_21_self_attn_o_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(334770176))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(336867392))))[name = string("layers_21_self_attn_o_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> obj_179_cast_fp16 = conv(bias = layers_0_self_attn_v_proj_bias_to_fp16, dilations = obj_179_dilations_0, groups = obj_179_groups_0, pad = obj_179_pad_0, pad_type = obj_179_pad_type_0, strides = obj_179_strides_0, weight = layers_21_self_attn_o_proj_weight_to_fp16_palettized, x = input_169_cast_fp16)[name = string("obj_179_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> inputs_173_cast_fp16 = add(x = inputs_167_cast_fp16, y = obj_179_cast_fp16)[name = string("inputs_173_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> inputs_sq_175_cast_fp16 = mul(x = inputs_173_cast_fp16, y = inputs_173_cast_fp16)[name = string("inputs_sq_175_cast_fp16")];
+            tensor<int32, [1]> variance_175_axes_0 = const()[name = string("variance_175_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_175_keep_dims_0 = const()[name = string("variance_175_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 1, 1, 1]> variance_175_cast_fp16 = reduce_mean(axes = variance_175_axes_0, keep_dims = variance_175_keep_dims_0, x = inputs_sq_175_cast_fp16)[name = string("variance_175_cast_fp16")];
+            fp16 var_8640_to_fp16 = const()[name = string("op_8640_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 1, 1]> var_8641_cast_fp16 = add(x = variance_175_cast_fp16, y = var_8640_to_fp16)[name = string("op_8641_cast_fp16")];
+            fp32 var_8642_epsilon_0 = const()[name = string("op_8642_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 1, 1, 1]> var_8642_cast_fp16 = rsqrt(epsilon = var_8642_epsilon_0, x = var_8641_cast_fp16)[name = string("op_8642_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> hidden_states_217_cast_fp16 = mul(x = inputs_173_cast_fp16, y = var_8642_cast_fp16)[name = string("hidden_states_217_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> w_175_to_fp16 = const()[name = string("w_175_to_fp16"), val = tensor<fp16, [1, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(336867968)))];
+            tensor<fp16, [1, 1024, 1, 1]> input_171_cast_fp16 = mul(x = w_175_to_fp16, y = hidden_states_217_cast_fp16)[name = string("input_171_cast_fp16")];
+            string input_173_pad_type_0 = const()[name = string("input_173_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> input_173_strides_0 = const()[name = string("input_173_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> input_173_pad_0 = const()[name = string("input_173_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> input_173_dilations_0 = const()[name = string("input_173_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 input_173_groups_0 = const()[name = string("input_173_groups_0"), val = int32(1)];
+            tensor<fp16, [3072, 1024, 1, 1]> layers_21_mlp_gate_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [3072, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(336870080))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(340015872))))[name = string("layers_21_mlp_gate_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 3072, 1, 1]> input_173_cast_fp16 = conv(dilations = input_173_dilations_0, groups = input_173_groups_0, pad = input_173_pad_0, pad_type = input_173_pad_type_0, strides = input_173_strides_0, weight = layers_21_mlp_gate_proj_weight_to_fp16_palettized, x = input_171_cast_fp16)[name = string("input_173_cast_fp16")];
+            tensor<fp16, [1, 3072, 1, 1]> var_8656_cast_fp16 = silu(x = input_173_cast_fp16)[name = string("op_8656_cast_fp16")];
+            string var_8662_pad_type_0 = const()[name = string("op_8662_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_8662_strides_0 = const()[name = string("op_8662_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_8662_pad_0 = const()[name = string("op_8662_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_8662_dilations_0 = const()[name = string("op_8662_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_8662_groups_0 = const()[name = string("op_8662_groups_0"), val = int32(1)];
+            tensor<fp16, [3072, 1024, 1, 1]> layers_21_mlp_up_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [3072, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(340016448))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(343162240))))[name = string("layers_21_mlp_up_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 3072, 1, 1]> var_8662_cast_fp16 = conv(dilations = var_8662_dilations_0, groups = var_8662_groups_0, pad = var_8662_pad_0, pad_type = var_8662_pad_type_0, strides = var_8662_strides_0, weight = layers_21_mlp_up_proj_weight_to_fp16_palettized, x = input_171_cast_fp16)[name = string("op_8662_cast_fp16")];
+            tensor<fp16, [1, 3072, 1, 1]> input_175_cast_fp16 = mul(x = var_8656_cast_fp16, y = var_8662_cast_fp16)[name = string("input_175_cast_fp16")];
+            string hidden_states_219_pad_type_0 = const()[name = string("hidden_states_219_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> hidden_states_219_strides_0 = const()[name = string("hidden_states_219_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> hidden_states_219_pad_0 = const()[name = string("hidden_states_219_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> hidden_states_219_dilations_0 = const()[name = string("hidden_states_219_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 hidden_states_219_groups_0 = const()[name = string("hidden_states_219_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 3072, 1, 1]> layers_21_mlp_down_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 3072, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(343162816))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(346308608))))[name = string("layers_21_mlp_down_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> hidden_states_219_cast_fp16 = conv(dilations = hidden_states_219_dilations_0, groups = hidden_states_219_groups_0, pad = hidden_states_219_pad_0, pad_type = hidden_states_219_pad_type_0, strides = hidden_states_219_strides_0, weight = layers_21_mlp_down_proj_weight_to_fp16_palettized, x = input_175_cast_fp16)[name = string("hidden_states_219_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> inputs_175_cast_fp16 = add(x = inputs_173_cast_fp16, y = hidden_states_219_cast_fp16)[name = string("inputs_175_cast_fp16")];
+            int32 var_8676 = const()[name = string("op_8676"), val = int32(3)];
+            int32 var_8686 = const()[name = string("op_8686"), val = int32(-2)];
+            int32 var_8694 = const()[name = string("op_8694"), val = int32(1)];
+            tensor<fp16, [1, 1024, 1, 1]> inputs_sq_177_cast_fp16 = mul(x = inputs_175_cast_fp16, y = inputs_175_cast_fp16)[name = string("inputs_sq_177_cast_fp16")];
+            tensor<int32, [1]> variance_177_axes_0 = const()[name = string("variance_177_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_177_keep_dims_0 = const()[name = string("variance_177_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 1, 1, 1]> variance_177_cast_fp16 = reduce_mean(axes = variance_177_axes_0, keep_dims = variance_177_keep_dims_0, x = inputs_sq_177_cast_fp16)[name = string("variance_177_cast_fp16")];
+            fp16 var_8706_to_fp16 = const()[name = string("op_8706_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 1, 1]> var_8707_cast_fp16 = add(x = variance_177_cast_fp16, y = var_8706_to_fp16)[name = string("op_8707_cast_fp16")];
+            fp32 var_8708_epsilon_0 = const()[name = string("op_8708_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 1, 1, 1]> var_8708_cast_fp16 = rsqrt(epsilon = var_8708_epsilon_0, x = var_8707_cast_fp16)[name = string("op_8708_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> hidden_states_221_cast_fp16 = mul(x = inputs_175_cast_fp16, y = var_8708_cast_fp16)[name = string("hidden_states_221_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> w_177_to_fp16 = const()[name = string("w_177_to_fp16"), val = tensor<fp16, [1, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(346309184)))];
+            tensor<fp16, [1, 1024, 1, 1]> obj_181_cast_fp16 = mul(x = w_177_to_fp16, y = hidden_states_221_cast_fp16)[name = string("obj_181_cast_fp16")];
+            string query_133_pad_type_0 = const()[name = string("query_133_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> query_133_strides_0 = const()[name = string("query_133_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> query_133_pad_0 = const()[name = string("query_133_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> query_133_dilations_0 = const()[name = string("query_133_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 query_133_groups_0 = const()[name = string("query_133_groups_0"), val = int32(1)];
+            tensor<fp16, [2048, 1024, 1, 1]> layers_22_self_attn_q_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [2048, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(346311296))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(348408512))))[name = string("layers_22_self_attn_q_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1, 1]> query_133_cast_fp16 = conv(bias = layers_0_self_attn_q_proj_bias_to_fp16, dilations = query_133_dilations_0, groups = query_133_groups_0, pad = query_133_pad_0, pad_type = query_133_pad_type_0, strides = query_133_strides_0, weight = layers_22_self_attn_q_proj_weight_to_fp16_palettized, x = obj_181_cast_fp16)[name = string("query_133_cast_fp16")];
+            string current_key_89_pad_type_0 = const()[name = string("current_key_89_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> current_key_89_strides_0 = const()[name = string("current_key_89_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> current_key_89_pad_0 = const()[name = string("current_key_89_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> current_key_89_dilations_0 = const()[name = string("current_key_89_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 current_key_89_groups_0 = const()[name = string("current_key_89_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_22_self_attn_k_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(348409088))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(349457728))))[name = string("layers_22_self_attn_k_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> current_key_89_cast_fp16 = conv(dilations = current_key_89_dilations_0, groups = current_key_89_groups_0, pad = current_key_89_pad_0, pad_type = current_key_89_pad_type_0, strides = current_key_89_strides_0, weight = layers_22_self_attn_k_proj_weight_to_fp16_palettized, x = obj_181_cast_fp16)[name = string("current_key_89_cast_fp16")];
+            string current_value_45_pad_type_0 = const()[name = string("current_value_45_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> current_value_45_strides_0 = const()[name = string("current_value_45_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> current_value_45_pad_0 = const()[name = string("current_value_45_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> current_value_45_dilations_0 = const()[name = string("current_value_45_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 current_value_45_groups_0 = const()[name = string("current_value_45_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_22_self_attn_v_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(349458304))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(350506944))))[name = string("layers_22_self_attn_v_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> current_value_45_cast_fp16 = conv(bias = layers_0_self_attn_v_proj_bias_to_fp16, dilations = current_value_45_dilations_0, groups = current_value_45_groups_0, pad = current_value_45_pad_0, pad_type = current_value_45_pad_type_0, strides = current_value_45_strides_0, weight = layers_22_self_attn_v_proj_weight_to_fp16_palettized, x = obj_181_cast_fp16)[name = string("current_value_45_cast_fp16")];
+            tensor<int32, [4]> var_8745 = const()[name = string("op_8745"), val = tensor<int32, [4]>([16, 128, 1, 1])];
+            tensor<fp16, [16, 128, 1, 1]> inputs_177_cast_fp16 = reshape(shape = var_8745, x = query_133_cast_fp16)[name = string("inputs_177_cast_fp16")];
+            tensor<fp16, [16, 128, 1, 1]> inputs_sq_179_cast_fp16 = mul(x = inputs_177_cast_fp16, y = inputs_177_cast_fp16)[name = string("inputs_sq_179_cast_fp16")];
+            tensor<int32, [1]> variance_179_axes_0 = const()[name = string("variance_179_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_179_keep_dims_0 = const()[name = string("variance_179_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [16, 1, 1, 1]> variance_179_cast_fp16 = reduce_mean(axes = variance_179_axes_0, keep_dims = variance_179_keep_dims_0, x = inputs_sq_179_cast_fp16)[name = string("variance_179_cast_fp16")];
+            fp16 var_8751_to_fp16 = const()[name = string("op_8751_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [16, 1, 1, 1]> var_8752_cast_fp16 = add(x = variance_179_cast_fp16, y = var_8751_to_fp16)[name = string("op_8752_cast_fp16")];
+            fp32 var_8753_epsilon_0 = const()[name = string("op_8753_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [16, 1, 1, 1]> var_8753_cast_fp16 = rsqrt(epsilon = var_8753_epsilon_0, x = var_8752_cast_fp16)[name = string("op_8753_cast_fp16")];
+            tensor<fp16, [16, 128, 1, 1]> hidden_states_223_cast_fp16 = mul(x = inputs_177_cast_fp16, y = var_8753_cast_fp16)[name = string("hidden_states_223_cast_fp16")];
+            tensor<fp16, [1, 128, 1, 1]> w_179_to_fp16 = const()[name = string("w_179_to_fp16"), val = tensor<fp16, [1, 128, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(350507520)))];
+            tensor<fp16, [16, 128, 1, 1]> query_normed_45_cast_fp16 = mul(x = w_179_to_fp16, y = hidden_states_223_cast_fp16)[name = string("query_normed_45_cast_fp16")];
+            tensor<int32, [4]> var_8761 = const()[name = string("op_8761"), val = tensor<int32, [4]>([8, 128, 1, 1])];
+            tensor<fp16, [8, 128, 1, 1]> inputs_179_cast_fp16 = reshape(shape = var_8761, x = current_key_89_cast_fp16)[name = string("inputs_179_cast_fp16")];
+            tensor<fp16, [8, 128, 1, 1]> inputs_sq_181_cast_fp16 = mul(x = inputs_179_cast_fp16, y = inputs_179_cast_fp16)[name = string("inputs_sq_181_cast_fp16")];
+            tensor<int32, [1]> variance_181_axes_0 = const()[name = string("variance_181_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_181_keep_dims_0 = const()[name = string("variance_181_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [8, 1, 1, 1]> variance_181_cast_fp16 = reduce_mean(axes = variance_181_axes_0, keep_dims = variance_181_keep_dims_0, x = inputs_sq_181_cast_fp16)[name = string("variance_181_cast_fp16")];
+            fp16 var_8767_to_fp16 = const()[name = string("op_8767_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [8, 1, 1, 1]> var_8768_cast_fp16 = add(x = variance_181_cast_fp16, y = var_8767_to_fp16)[name = string("op_8768_cast_fp16")];
+            fp32 var_8769_epsilon_0 = const()[name = string("op_8769_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [8, 1, 1, 1]> var_8769_cast_fp16 = rsqrt(epsilon = var_8769_epsilon_0, x = var_8768_cast_fp16)[name = string("op_8769_cast_fp16")];
+            tensor<fp16, [8, 128, 1, 1]> hidden_states_225_cast_fp16 = mul(x = inputs_179_cast_fp16, y = var_8769_cast_fp16)[name = string("hidden_states_225_cast_fp16")];
+            tensor<fp16, [1, 128, 1, 1]> w_181_to_fp16 = const()[name = string("w_181_to_fp16"), val = tensor<fp16, [1, 128, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(350507840)))];
+            tensor<fp16, [8, 128, 1, 1]> current_key_normed_45_cast_fp16 = mul(x = w_181_to_fp16, y = hidden_states_225_cast_fp16)[name = string("current_key_normed_45_cast_fp16")];
+            tensor<int32, [4]> var_8787 = const()[name = string("op_8787"), val = tensor<int32, [4]>([1, 16, 128, -1])];
+            tensor<fp16, [1, 16, 128, 1]> mh_q_133_cast_fp16 = reshape(shape = var_8787, x = query_normed_45_cast_fp16)[name = string("mh_q_133_cast_fp16")];
+            tensor<int32, [4]> var_8789 = const()[name = string("op_8789"), val = tensor<int32, [4]>([1, 8, 128, -1])];
+            tensor<fp16, [1, 8, 128, 1]> mh_k_89_cast_fp16 = reshape(shape = var_8789, x = current_key_normed_45_cast_fp16)[name = string("mh_k_89_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> var_8793_cast_fp16 = mul(x = mh_q_133_cast_fp16, y = cos_1_cast_fp16)[name = string("op_8793_cast_fp16")];
+            tensor<int32, [4]> var_8798_begin_0 = const()[name = string("op_8798_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_8798_end_0 = const()[name = string("op_8798_end_0"), val = tensor<int32, [4]>([1, 16, 64, 1])];
+            tensor<bool, [4]> var_8798_end_mask_0 = const()[name = string("op_8798_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 16, 64, 1]> var_8798_cast_fp16 = slice_by_index(begin = var_8798_begin_0, end = var_8798_end_0, end_mask = var_8798_end_mask_0, x = mh_q_133_cast_fp16)[name = string("op_8798_cast_fp16")];
+            tensor<int32, [4]> var_8804_begin_0 = const()[name = string("op_8804_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_8804_end_0 = const()[name = string("op_8804_end_0"), val = tensor<int32, [4]>([1, 16, 128, 1])];
+            tensor<bool, [4]> var_8804_end_mask_0 = const()[name = string("op_8804_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 16, 64, 1]> var_8804_cast_fp16 = slice_by_index(begin = var_8804_begin_0, end = var_8804_end_0, end_mask = var_8804_end_mask_0, x = mh_q_133_cast_fp16)[name = string("op_8804_cast_fp16")];
+            fp16 const_523_promoted_to_fp16 = const()[name = string("const_523_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 16, 64, 1]> var_8806_cast_fp16 = mul(x = var_8804_cast_fp16, y = const_523_promoted_to_fp16)[name = string("op_8806_cast_fp16")];
+            bool var_8808_interleave_0 = const()[name = string("op_8808_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 1]> var_8808_cast_fp16 = concat(axis = var_8686, interleave = var_8808_interleave_0, values = (var_8806_cast_fp16, var_8798_cast_fp16))[name = string("op_8808_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> var_8809_cast_fp16 = mul(x = var_8808_cast_fp16, y = sin_1_cast_fp16)[name = string("op_8809_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> mh_q_135_cast_fp16 = add(x = var_8793_cast_fp16, y = var_8809_cast_fp16)[name = string("mh_q_135_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> var_8811_cast_fp16 = mul(x = mh_k_89_cast_fp16, y = cos_1_cast_fp16)[name = string("op_8811_cast_fp16")];
+            tensor<int32, [4]> var_8816_begin_0 = const()[name = string("op_8816_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_8816_end_0 = const()[name = string("op_8816_end_0"), val = tensor<int32, [4]>([1, 8, 64, 1])];
+            tensor<bool, [4]> var_8816_end_mask_0 = const()[name = string("op_8816_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 8, 64, 1]> var_8816_cast_fp16 = slice_by_index(begin = var_8816_begin_0, end = var_8816_end_0, end_mask = var_8816_end_mask_0, x = mh_k_89_cast_fp16)[name = string("op_8816_cast_fp16")];
+            tensor<int32, [4]> var_8822_begin_0 = const()[name = string("op_8822_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_8822_end_0 = const()[name = string("op_8822_end_0"), val = tensor<int32, [4]>([1, 8, 128, 1])];
+            tensor<bool, [4]> var_8822_end_mask_0 = const()[name = string("op_8822_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 8, 64, 1]> var_8822_cast_fp16 = slice_by_index(begin = var_8822_begin_0, end = var_8822_end_0, end_mask = var_8822_end_mask_0, x = mh_k_89_cast_fp16)[name = string("op_8822_cast_fp16")];
+            fp16 const_526_promoted_to_fp16 = const()[name = string("const_526_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 64, 1]> var_8824_cast_fp16 = mul(x = var_8822_cast_fp16, y = const_526_promoted_to_fp16)[name = string("op_8824_cast_fp16")];
+            bool var_8826_interleave_0 = const()[name = string("op_8826_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 128, 1]> var_8826_cast_fp16 = concat(axis = var_8686, interleave = var_8826_interleave_0, values = (var_8824_cast_fp16, var_8816_cast_fp16))[name = string("op_8826_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> var_8827_cast_fp16 = mul(x = var_8826_cast_fp16, y = sin_1_cast_fp16)[name = string("op_8827_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> mh_k_91_cast_fp16 = add(x = var_8811_cast_fp16, y = var_8827_cast_fp16)[name = string("mh_k_91_cast_fp16")];
+            tensor<int32, [4]> var_8831 = const()[name = string("op_8831"), val = tensor<int32, [4]>([1, 1024, 1, 1])];
+            tensor<fp16, [1, 1024, 1, 1]> current_key_91_cast_fp16 = reshape(shape = var_8831, x = mh_k_91_cast_fp16)[name = string("current_key_91_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_8838_cast_fp16 = mul(x = var_101_cast_fp16_22, y = var_323_cast_fp16)[name = string("op_8838_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_8839_cast_fp16 = mul(x = current_key_91_cast_fp16, y = var_321_cast_fp16)[name = string("op_8839_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> key_135_cast_fp16 = add(x = var_8838_cast_fp16, y = var_8839_cast_fp16)[name = string("key_135_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_8842_cast_fp16 = mul(x = var_132_cast_fp16_22, y = var_323_cast_fp16)[name = string("op_8842_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_8843_cast_fp16 = mul(x = current_value_45_cast_fp16, y = var_321_cast_fp16)[name = string("op_8843_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> value_89_cast_fp16 = add(x = var_8842_cast_fp16, y = var_8843_cast_fp16)[name = string("value_89_cast_fp16")];
+            tensor<int32, [4]> var_8847 = const()[name = string("op_8847"), val = tensor<int32, [4]>([1, 8, 128, 256])];
+            tensor<fp16, [1, 8, 128, 256]> key_heads_89_cast_fp16 = reshape(shape = var_8847, x = key_135_cast_fp16)[name = string("key_heads_89_cast_fp16")];
+            tensor<int32, [4]> var_8849 = const()[name = string("op_8849"), val = tensor<int32, [4]>([1, 8, 128, 256])];
+            tensor<fp16, [1, 8, 128, 256]> value_heads_89_cast_fp16 = reshape(shape = var_8849, x = value_89_cast_fp16)[name = string("value_heads_89_cast_fp16")];
+            tensor<int32, [4]> var_8852_begin_0 = const()[name = string("op_8852_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_8852_end_0 = const()[name = string("op_8852_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_8852_end_mask_0 = const()[name = string("op_8852_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_8852_cast_fp16 = slice_by_index(begin = var_8852_begin_0, end = var_8852_end_0, end_mask = var_8852_end_mask_0, x = key_heads_89_cast_fp16)[name = string("op_8852_cast_fp16")];
+            tensor<int32, [4]> var_8856_begin_0 = const()[name = string("op_8856_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_8856_end_0 = const()[name = string("op_8856_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_8856_end_mask_0 = const()[name = string("op_8856_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_8856_cast_fp16 = slice_by_index(begin = var_8856_begin_0, end = var_8856_end_0, end_mask = var_8856_end_mask_0, x = value_heads_89_cast_fp16)[name = string("op_8856_cast_fp16")];
+            tensor<int32, [4]> var_8868_begin_0 = const()[name = string("op_8868_begin_0"), val = tensor<int32, [4]>([0, 1, 0, 0])];
+            tensor<int32, [4]> var_8868_end_0 = const()[name = string("op_8868_end_0"), val = tensor<int32, [4]>([1, 2, 128, 256])];
+            tensor<bool, [4]> var_8868_end_mask_0 = const()[name = string("op_8868_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_8868_cast_fp16 = slice_by_index(begin = var_8868_begin_0, end = var_8868_end_0, end_mask = var_8868_end_mask_0, x = key_heads_89_cast_fp16)[name = string("op_8868_cast_fp16")];
+            tensor<int32, [4]> var_8872_begin_0 = const()[name = string("op_8872_begin_0"), val = tensor<int32, [4]>([0, 1, 0, 0])];
+            tensor<int32, [4]> var_8872_end_0 = const()[name = string("op_8872_end_0"), val = tensor<int32, [4]>([1, 2, 128, 256])];
+            tensor<bool, [4]> var_8872_end_mask_0 = const()[name = string("op_8872_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_8872_cast_fp16 = slice_by_index(begin = var_8872_begin_0, end = var_8872_end_0, end_mask = var_8872_end_mask_0, x = value_heads_89_cast_fp16)[name = string("op_8872_cast_fp16")];
+            tensor<int32, [4]> var_8884_begin_0 = const()[name = string("op_8884_begin_0"), val = tensor<int32, [4]>([0, 2, 0, 0])];
+            tensor<int32, [4]> var_8884_end_0 = const()[name = string("op_8884_end_0"), val = tensor<int32, [4]>([1, 3, 128, 256])];
+            tensor<bool, [4]> var_8884_end_mask_0 = const()[name = string("op_8884_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_8884_cast_fp16 = slice_by_index(begin = var_8884_begin_0, end = var_8884_end_0, end_mask = var_8884_end_mask_0, x = key_heads_89_cast_fp16)[name = string("op_8884_cast_fp16")];
+            tensor<int32, [4]> var_8888_begin_0 = const()[name = string("op_8888_begin_0"), val = tensor<int32, [4]>([0, 2, 0, 0])];
+            tensor<int32, [4]> var_8888_end_0 = const()[name = string("op_8888_end_0"), val = tensor<int32, [4]>([1, 3, 128, 256])];
+            tensor<bool, [4]> var_8888_end_mask_0 = const()[name = string("op_8888_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_8888_cast_fp16 = slice_by_index(begin = var_8888_begin_0, end = var_8888_end_0, end_mask = var_8888_end_mask_0, x = value_heads_89_cast_fp16)[name = string("op_8888_cast_fp16")];
+            tensor<int32, [4]> var_8900_begin_0 = const()[name = string("op_8900_begin_0"), val = tensor<int32, [4]>([0, 3, 0, 0])];
+            tensor<int32, [4]> var_8900_end_0 = const()[name = string("op_8900_end_0"), val = tensor<int32, [4]>([1, 4, 128, 256])];
+            tensor<bool, [4]> var_8900_end_mask_0 = const()[name = string("op_8900_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_8900_cast_fp16 = slice_by_index(begin = var_8900_begin_0, end = var_8900_end_0, end_mask = var_8900_end_mask_0, x = key_heads_89_cast_fp16)[name = string("op_8900_cast_fp16")];
+            tensor<int32, [4]> var_8904_begin_0 = const()[name = string("op_8904_begin_0"), val = tensor<int32, [4]>([0, 3, 0, 0])];
+            tensor<int32, [4]> var_8904_end_0 = const()[name = string("op_8904_end_0"), val = tensor<int32, [4]>([1, 4, 128, 256])];
+            tensor<bool, [4]> var_8904_end_mask_0 = const()[name = string("op_8904_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_8904_cast_fp16 = slice_by_index(begin = var_8904_begin_0, end = var_8904_end_0, end_mask = var_8904_end_mask_0, x = value_heads_89_cast_fp16)[name = string("op_8904_cast_fp16")];
+            tensor<int32, [4]> var_8916_begin_0 = const()[name = string("op_8916_begin_0"), val = tensor<int32, [4]>([0, 4, 0, 0])];
+            tensor<int32, [4]> var_8916_end_0 = const()[name = string("op_8916_end_0"), val = tensor<int32, [4]>([1, 5, 128, 256])];
+            tensor<bool, [4]> var_8916_end_mask_0 = const()[name = string("op_8916_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_8916_cast_fp16 = slice_by_index(begin = var_8916_begin_0, end = var_8916_end_0, end_mask = var_8916_end_mask_0, x = key_heads_89_cast_fp16)[name = string("op_8916_cast_fp16")];
+            tensor<int32, [4]> var_8920_begin_0 = const()[name = string("op_8920_begin_0"), val = tensor<int32, [4]>([0, 4, 0, 0])];
+            tensor<int32, [4]> var_8920_end_0 = const()[name = string("op_8920_end_0"), val = tensor<int32, [4]>([1, 5, 128, 256])];
+            tensor<bool, [4]> var_8920_end_mask_0 = const()[name = string("op_8920_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_8920_cast_fp16 = slice_by_index(begin = var_8920_begin_0, end = var_8920_end_0, end_mask = var_8920_end_mask_0, x = value_heads_89_cast_fp16)[name = string("op_8920_cast_fp16")];
+            tensor<int32, [4]> var_8932_begin_0 = const()[name = string("op_8932_begin_0"), val = tensor<int32, [4]>([0, 5, 0, 0])];
+            tensor<int32, [4]> var_8932_end_0 = const()[name = string("op_8932_end_0"), val = tensor<int32, [4]>([1, 6, 128, 256])];
+            tensor<bool, [4]> var_8932_end_mask_0 = const()[name = string("op_8932_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_8932_cast_fp16 = slice_by_index(begin = var_8932_begin_0, end = var_8932_end_0, end_mask = var_8932_end_mask_0, x = key_heads_89_cast_fp16)[name = string("op_8932_cast_fp16")];
+            tensor<int32, [4]> var_8936_begin_0 = const()[name = string("op_8936_begin_0"), val = tensor<int32, [4]>([0, 5, 0, 0])];
+            tensor<int32, [4]> var_8936_end_0 = const()[name = string("op_8936_end_0"), val = tensor<int32, [4]>([1, 6, 128, 256])];
+            tensor<bool, [4]> var_8936_end_mask_0 = const()[name = string("op_8936_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_8936_cast_fp16 = slice_by_index(begin = var_8936_begin_0, end = var_8936_end_0, end_mask = var_8936_end_mask_0, x = value_heads_89_cast_fp16)[name = string("op_8936_cast_fp16")];
+            tensor<int32, [4]> var_8948_begin_0 = const()[name = string("op_8948_begin_0"), val = tensor<int32, [4]>([0, 6, 0, 0])];
+            tensor<int32, [4]> var_8948_end_0 = const()[name = string("op_8948_end_0"), val = tensor<int32, [4]>([1, 7, 128, 256])];
+            tensor<bool, [4]> var_8948_end_mask_0 = const()[name = string("op_8948_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_8948_cast_fp16 = slice_by_index(begin = var_8948_begin_0, end = var_8948_end_0, end_mask = var_8948_end_mask_0, x = key_heads_89_cast_fp16)[name = string("op_8948_cast_fp16")];
+            tensor<int32, [4]> var_8952_begin_0 = const()[name = string("op_8952_begin_0"), val = tensor<int32, [4]>([0, 6, 0, 0])];
+            tensor<int32, [4]> var_8952_end_0 = const()[name = string("op_8952_end_0"), val = tensor<int32, [4]>([1, 7, 128, 256])];
+            tensor<bool, [4]> var_8952_end_mask_0 = const()[name = string("op_8952_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_8952_cast_fp16 = slice_by_index(begin = var_8952_begin_0, end = var_8952_end_0, end_mask = var_8952_end_mask_0, x = value_heads_89_cast_fp16)[name = string("op_8952_cast_fp16")];
+            tensor<int32, [4]> var_8964_begin_0 = const()[name = string("op_8964_begin_0"), val = tensor<int32, [4]>([0, 7, 0, 0])];
+            tensor<int32, [4]> var_8964_end_0 = const()[name = string("op_8964_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_8964_end_mask_0 = const()[name = string("op_8964_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_8964_cast_fp16 = slice_by_index(begin = var_8964_begin_0, end = var_8964_end_0, end_mask = var_8964_end_mask_0, x = key_heads_89_cast_fp16)[name = string("op_8964_cast_fp16")];
+            tensor<int32, [4]> var_8968_begin_0 = const()[name = string("op_8968_begin_0"), val = tensor<int32, [4]>([0, 7, 0, 0])];
+            tensor<int32, [4]> var_8968_end_0 = const()[name = string("op_8968_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_8968_end_mask_0 = const()[name = string("op_8968_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_8968_cast_fp16 = slice_by_index(begin = var_8968_begin_0, end = var_8968_end_0, end_mask = var_8968_end_mask_0, x = value_heads_89_cast_fp16)[name = string("op_8968_cast_fp16")];
+            bool key_heads_91_interleave_0 = const()[name = string("key_heads_91_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 256]> key_heads_91_cast_fp16 = concat(axis = var_8694, interleave = key_heads_91_interleave_0, values = (var_8852_cast_fp16, var_8852_cast_fp16, var_8868_cast_fp16, var_8868_cast_fp16, var_8884_cast_fp16, var_8884_cast_fp16, var_8900_cast_fp16, var_8900_cast_fp16, var_8916_cast_fp16, var_8916_cast_fp16, var_8932_cast_fp16, var_8932_cast_fp16, var_8948_cast_fp16, var_8948_cast_fp16, var_8964_cast_fp16, var_8964_cast_fp16))[name = string("key_heads_91_cast_fp16")];
+            bool value_heads_91_interleave_0 = const()[name = string("value_heads_91_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 256]> value_heads_91_cast_fp16 = concat(axis = var_8694, interleave = value_heads_91_interleave_0, values = (var_8856_cast_fp16, var_8856_cast_fp16, var_8872_cast_fp16, var_8872_cast_fp16, var_8888_cast_fp16, var_8888_cast_fp16, var_8904_cast_fp16, var_8904_cast_fp16, var_8920_cast_fp16, var_8920_cast_fp16, var_8936_cast_fp16, var_8936_cast_fp16, var_8952_cast_fp16, var_8952_cast_fp16, var_8968_cast_fp16, var_8968_cast_fp16))[name = string("value_heads_91_cast_fp16")];
+            fp16 var_8991_to_fp16 = const()[name = string("op_8991_to_fp16"), val = fp16(0x1.6ap-4)];
+            tensor<fp16, [1, 16, 128, 1]> var_8992_cast_fp16 = mul(x = mh_q_135_cast_fp16, y = var_8991_to_fp16)[name = string("op_8992_cast_fp16")];
+            bool mh_w_89_transpose_x_0 = const()[name = string("mh_w_89_transpose_x_0"), val = bool(true)];
+            bool mh_w_89_transpose_y_0 = const()[name = string("mh_w_89_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 1, 256]> mh_w_89_cast_fp16 = matmul(transpose_x = mh_w_89_transpose_x_0, transpose_y = mh_w_89_transpose_y_0, x = var_8992_cast_fp16, y = key_heads_91_cast_fp16)[name = string("mh_w_89_cast_fp16")];
+            tensor<fp16, [1, 16, 1, 256]> mh_w_91_cast_fp16 = add(x = mh_w_89_cast_fp16, y = var_487_cast_fp16)[name = string("mh_w_91_cast_fp16")];
+            tensor<fp16, [1, 16, 1, 256]> var_9004_cast_fp16 = softmax(axis = var_8676, x = mh_w_91_cast_fp16)[name = string("op_9004_cast_fp16")];
+            bool attn_45_transpose_x_0 = const()[name = string("attn_45_transpose_x_0"), val = bool(false)];
+            bool attn_45_transpose_y_0 = const()[name = string("attn_45_transpose_y_0"), val = bool(true)];
+            tensor<fp16, [1, 16, 128, 1]> attn_45_cast_fp16 = matmul(transpose_x = attn_45_transpose_x_0, transpose_y = attn_45_transpose_y_0, x = value_heads_91_cast_fp16, y = var_9004_cast_fp16)[name = string("attn_45_cast_fp16")];
+            tensor<int32, [4]> var_9009 = const()[name = string("op_9009"), val = tensor<int32, [4]>([1, -1, 1, 1])];
+            tensor<fp16, [1, 2048, 1, 1]> input_177_cast_fp16 = reshape(shape = var_9009, x = attn_45_cast_fp16)[name = string("input_177_cast_fp16")];
+            string obj_187_pad_type_0 = const()[name = string("obj_187_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> obj_187_strides_0 = const()[name = string("obj_187_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> obj_187_pad_0 = const()[name = string("obj_187_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> obj_187_dilations_0 = const()[name = string("obj_187_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 obj_187_groups_0 = const()[name = string("obj_187_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 2048, 1, 1]> layers_22_self_attn_o_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(350508160))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(352605376))))[name = string("layers_22_self_attn_o_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> obj_187_cast_fp16 = conv(bias = layers_0_self_attn_v_proj_bias_to_fp16, dilations = obj_187_dilations_0, groups = obj_187_groups_0, pad = obj_187_pad_0, pad_type = obj_187_pad_type_0, strides = obj_187_strides_0, weight = layers_22_self_attn_o_proj_weight_to_fp16_palettized, x = input_177_cast_fp16)[name = string("obj_187_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> inputs_181_cast_fp16 = add(x = inputs_175_cast_fp16, y = obj_187_cast_fp16)[name = string("inputs_181_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> inputs_sq_183_cast_fp16 = mul(x = inputs_181_cast_fp16, y = inputs_181_cast_fp16)[name = string("inputs_sq_183_cast_fp16")];
+            tensor<int32, [1]> variance_183_axes_0 = const()[name = string("variance_183_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_183_keep_dims_0 = const()[name = string("variance_183_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 1, 1, 1]> variance_183_cast_fp16 = reduce_mean(axes = variance_183_axes_0, keep_dims = variance_183_keep_dims_0, x = inputs_sq_183_cast_fp16)[name = string("variance_183_cast_fp16")];
+            fp16 var_9027_to_fp16 = const()[name = string("op_9027_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 1, 1]> var_9028_cast_fp16 = add(x = variance_183_cast_fp16, y = var_9027_to_fp16)[name = string("op_9028_cast_fp16")];
+            fp32 var_9029_epsilon_0 = const()[name = string("op_9029_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 1, 1, 1]> var_9029_cast_fp16 = rsqrt(epsilon = var_9029_epsilon_0, x = var_9028_cast_fp16)[name = string("op_9029_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> hidden_states_227_cast_fp16 = mul(x = inputs_181_cast_fp16, y = var_9029_cast_fp16)[name = string("hidden_states_227_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> w_183_to_fp16 = const()[name = string("w_183_to_fp16"), val = tensor<fp16, [1, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(352605952)))];
+            tensor<fp16, [1, 1024, 1, 1]> input_179_cast_fp16 = mul(x = w_183_to_fp16, y = hidden_states_227_cast_fp16)[name = string("input_179_cast_fp16")];
+            string input_181_pad_type_0 = const()[name = string("input_181_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> input_181_strides_0 = const()[name = string("input_181_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> input_181_pad_0 = const()[name = string("input_181_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> input_181_dilations_0 = const()[name = string("input_181_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 input_181_groups_0 = const()[name = string("input_181_groups_0"), val = int32(1)];
+            tensor<fp16, [3072, 1024, 1, 1]> layers_22_mlp_gate_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [3072, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(352608064))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(355753856))))[name = string("layers_22_mlp_gate_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 3072, 1, 1]> input_181_cast_fp16 = conv(dilations = input_181_dilations_0, groups = input_181_groups_0, pad = input_181_pad_0, pad_type = input_181_pad_type_0, strides = input_181_strides_0, weight = layers_22_mlp_gate_proj_weight_to_fp16_palettized, x = input_179_cast_fp16)[name = string("input_181_cast_fp16")];
+            tensor<fp16, [1, 3072, 1, 1]> var_9043_cast_fp16 = silu(x = input_181_cast_fp16)[name = string("op_9043_cast_fp16")];
+            string var_9049_pad_type_0 = const()[name = string("op_9049_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_9049_strides_0 = const()[name = string("op_9049_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_9049_pad_0 = const()[name = string("op_9049_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_9049_dilations_0 = const()[name = string("op_9049_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_9049_groups_0 = const()[name = string("op_9049_groups_0"), val = int32(1)];
+            tensor<fp16, [3072, 1024, 1, 1]> layers_22_mlp_up_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [3072, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(355754432))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(358900224))))[name = string("layers_22_mlp_up_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 3072, 1, 1]> var_9049_cast_fp16 = conv(dilations = var_9049_dilations_0, groups = var_9049_groups_0, pad = var_9049_pad_0, pad_type = var_9049_pad_type_0, strides = var_9049_strides_0, weight = layers_22_mlp_up_proj_weight_to_fp16_palettized, x = input_179_cast_fp16)[name = string("op_9049_cast_fp16")];
+            tensor<fp16, [1, 3072, 1, 1]> input_183_cast_fp16 = mul(x = var_9043_cast_fp16, y = var_9049_cast_fp16)[name = string("input_183_cast_fp16")];
+            string hidden_states_229_pad_type_0 = const()[name = string("hidden_states_229_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> hidden_states_229_strides_0 = const()[name = string("hidden_states_229_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> hidden_states_229_pad_0 = const()[name = string("hidden_states_229_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> hidden_states_229_dilations_0 = const()[name = string("hidden_states_229_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 hidden_states_229_groups_0 = const()[name = string("hidden_states_229_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 3072, 1, 1]> layers_22_mlp_down_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 3072, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(358900800))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(362046592))))[name = string("layers_22_mlp_down_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> hidden_states_229_cast_fp16 = conv(dilations = hidden_states_229_dilations_0, groups = hidden_states_229_groups_0, pad = hidden_states_229_pad_0, pad_type = hidden_states_229_pad_type_0, strides = hidden_states_229_strides_0, weight = layers_22_mlp_down_proj_weight_to_fp16_palettized, x = input_183_cast_fp16)[name = string("hidden_states_229_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> inputs_183_cast_fp16 = add(x = inputs_181_cast_fp16, y = hidden_states_229_cast_fp16)[name = string("inputs_183_cast_fp16")];
+            int32 var_9063 = const()[name = string("op_9063"), val = int32(3)];
+            int32 var_9073 = const()[name = string("op_9073"), val = int32(-2)];
+            int32 var_9081 = const()[name = string("op_9081"), val = int32(1)];
+            tensor<fp16, [1, 1024, 1, 1]> inputs_sq_185_cast_fp16 = mul(x = inputs_183_cast_fp16, y = inputs_183_cast_fp16)[name = string("inputs_sq_185_cast_fp16")];
+            tensor<int32, [1]> variance_185_axes_0 = const()[name = string("variance_185_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_185_keep_dims_0 = const()[name = string("variance_185_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 1, 1, 1]> variance_185_cast_fp16 = reduce_mean(axes = variance_185_axes_0, keep_dims = variance_185_keep_dims_0, x = inputs_sq_185_cast_fp16)[name = string("variance_185_cast_fp16")];
+            fp16 var_9093_to_fp16 = const()[name = string("op_9093_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 1, 1]> var_9094_cast_fp16 = add(x = variance_185_cast_fp16, y = var_9093_to_fp16)[name = string("op_9094_cast_fp16")];
+            fp32 var_9095_epsilon_0 = const()[name = string("op_9095_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 1, 1, 1]> var_9095_cast_fp16 = rsqrt(epsilon = var_9095_epsilon_0, x = var_9094_cast_fp16)[name = string("op_9095_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> hidden_states_231_cast_fp16 = mul(x = inputs_183_cast_fp16, y = var_9095_cast_fp16)[name = string("hidden_states_231_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> w_185_to_fp16 = const()[name = string("w_185_to_fp16"), val = tensor<fp16, [1, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(362047168)))];
+            tensor<fp16, [1, 1024, 1, 1]> obj_189_cast_fp16 = mul(x = w_185_to_fp16, y = hidden_states_231_cast_fp16)[name = string("obj_189_cast_fp16")];
+            string query_139_pad_type_0 = const()[name = string("query_139_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> query_139_strides_0 = const()[name = string("query_139_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> query_139_pad_0 = const()[name = string("query_139_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> query_139_dilations_0 = const()[name = string("query_139_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 query_139_groups_0 = const()[name = string("query_139_groups_0"), val = int32(1)];
+            tensor<fp16, [2048, 1024, 1, 1]> layers_23_self_attn_q_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [2048, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(362049280))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(364146496))))[name = string("layers_23_self_attn_q_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1, 1]> query_139_cast_fp16 = conv(bias = layers_0_self_attn_q_proj_bias_to_fp16, dilations = query_139_dilations_0, groups = query_139_groups_0, pad = query_139_pad_0, pad_type = query_139_pad_type_0, strides = query_139_strides_0, weight = layers_23_self_attn_q_proj_weight_to_fp16_palettized, x = obj_189_cast_fp16)[name = string("query_139_cast_fp16")];
+            string current_key_93_pad_type_0 = const()[name = string("current_key_93_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> current_key_93_strides_0 = const()[name = string("current_key_93_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> current_key_93_pad_0 = const()[name = string("current_key_93_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> current_key_93_dilations_0 = const()[name = string("current_key_93_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 current_key_93_groups_0 = const()[name = string("current_key_93_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_23_self_attn_k_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(364147072))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(365195712))))[name = string("layers_23_self_attn_k_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> current_key_93_cast_fp16 = conv(dilations = current_key_93_dilations_0, groups = current_key_93_groups_0, pad = current_key_93_pad_0, pad_type = current_key_93_pad_type_0, strides = current_key_93_strides_0, weight = layers_23_self_attn_k_proj_weight_to_fp16_palettized, x = obj_189_cast_fp16)[name = string("current_key_93_cast_fp16")];
+            string current_value_47_pad_type_0 = const()[name = string("current_value_47_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> current_value_47_strides_0 = const()[name = string("current_value_47_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> current_value_47_pad_0 = const()[name = string("current_value_47_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> current_value_47_dilations_0 = const()[name = string("current_value_47_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 current_value_47_groups_0 = const()[name = string("current_value_47_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_23_self_attn_v_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(365196288))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(366244928))))[name = string("layers_23_self_attn_v_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> current_value_47_cast_fp16 = conv(bias = layers_0_self_attn_v_proj_bias_to_fp16, dilations = current_value_47_dilations_0, groups = current_value_47_groups_0, pad = current_value_47_pad_0, pad_type = current_value_47_pad_type_0, strides = current_value_47_strides_0, weight = layers_23_self_attn_v_proj_weight_to_fp16_palettized, x = obj_189_cast_fp16)[name = string("current_value_47_cast_fp16")];
+            tensor<int32, [4]> var_9132 = const()[name = string("op_9132"), val = tensor<int32, [4]>([16, 128, 1, 1])];
+            tensor<fp16, [16, 128, 1, 1]> inputs_185_cast_fp16 = reshape(shape = var_9132, x = query_139_cast_fp16)[name = string("inputs_185_cast_fp16")];
+            tensor<fp16, [16, 128, 1, 1]> inputs_sq_187_cast_fp16 = mul(x = inputs_185_cast_fp16, y = inputs_185_cast_fp16)[name = string("inputs_sq_187_cast_fp16")];
+            tensor<int32, [1]> variance_187_axes_0 = const()[name = string("variance_187_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_187_keep_dims_0 = const()[name = string("variance_187_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [16, 1, 1, 1]> variance_187_cast_fp16 = reduce_mean(axes = variance_187_axes_0, keep_dims = variance_187_keep_dims_0, x = inputs_sq_187_cast_fp16)[name = string("variance_187_cast_fp16")];
+            fp16 var_9138_to_fp16 = const()[name = string("op_9138_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [16, 1, 1, 1]> var_9139_cast_fp16 = add(x = variance_187_cast_fp16, y = var_9138_to_fp16)[name = string("op_9139_cast_fp16")];
+            fp32 var_9140_epsilon_0 = const()[name = string("op_9140_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [16, 1, 1, 1]> var_9140_cast_fp16 = rsqrt(epsilon = var_9140_epsilon_0, x = var_9139_cast_fp16)[name = string("op_9140_cast_fp16")];
+            tensor<fp16, [16, 128, 1, 1]> hidden_states_233_cast_fp16 = mul(x = inputs_185_cast_fp16, y = var_9140_cast_fp16)[name = string("hidden_states_233_cast_fp16")];
+            tensor<fp16, [1, 128, 1, 1]> w_187_to_fp16 = const()[name = string("w_187_to_fp16"), val = tensor<fp16, [1, 128, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(366245504)))];
+            tensor<fp16, [16, 128, 1, 1]> query_normed_47_cast_fp16 = mul(x = w_187_to_fp16, y = hidden_states_233_cast_fp16)[name = string("query_normed_47_cast_fp16")];
+            tensor<int32, [4]> var_9148 = const()[name = string("op_9148"), val = tensor<int32, [4]>([8, 128, 1, 1])];
+            tensor<fp16, [8, 128, 1, 1]> inputs_187_cast_fp16 = reshape(shape = var_9148, x = current_key_93_cast_fp16)[name = string("inputs_187_cast_fp16")];
+            tensor<fp16, [8, 128, 1, 1]> inputs_sq_189_cast_fp16 = mul(x = inputs_187_cast_fp16, y = inputs_187_cast_fp16)[name = string("inputs_sq_189_cast_fp16")];
+            tensor<int32, [1]> variance_189_axes_0 = const()[name = string("variance_189_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_189_keep_dims_0 = const()[name = string("variance_189_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [8, 1, 1, 1]> variance_189_cast_fp16 = reduce_mean(axes = variance_189_axes_0, keep_dims = variance_189_keep_dims_0, x = inputs_sq_189_cast_fp16)[name = string("variance_189_cast_fp16")];
+            fp16 var_9154_to_fp16 = const()[name = string("op_9154_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [8, 1, 1, 1]> var_9155_cast_fp16 = add(x = variance_189_cast_fp16, y = var_9154_to_fp16)[name = string("op_9155_cast_fp16")];
+            fp32 var_9156_epsilon_0 = const()[name = string("op_9156_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [8, 1, 1, 1]> var_9156_cast_fp16 = rsqrt(epsilon = var_9156_epsilon_0, x = var_9155_cast_fp16)[name = string("op_9156_cast_fp16")];
+            tensor<fp16, [8, 128, 1, 1]> hidden_states_235_cast_fp16 = mul(x = inputs_187_cast_fp16, y = var_9156_cast_fp16)[name = string("hidden_states_235_cast_fp16")];
+            tensor<fp16, [1, 128, 1, 1]> w_189_to_fp16 = const()[name = string("w_189_to_fp16"), val = tensor<fp16, [1, 128, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(366245824)))];
+            tensor<fp16, [8, 128, 1, 1]> current_key_normed_47_cast_fp16 = mul(x = w_189_to_fp16, y = hidden_states_235_cast_fp16)[name = string("current_key_normed_47_cast_fp16")];
+            tensor<int32, [4]> var_9174 = const()[name = string("op_9174"), val = tensor<int32, [4]>([1, 16, 128, -1])];
+            tensor<fp16, [1, 16, 128, 1]> mh_q_139_cast_fp16 = reshape(shape = var_9174, x = query_normed_47_cast_fp16)[name = string("mh_q_139_cast_fp16")];
+            tensor<int32, [4]> var_9176 = const()[name = string("op_9176"), val = tensor<int32, [4]>([1, 8, 128, -1])];
+            tensor<fp16, [1, 8, 128, 1]> mh_k_93_cast_fp16 = reshape(shape = var_9176, x = current_key_normed_47_cast_fp16)[name = string("mh_k_93_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> var_9180_cast_fp16 = mul(x = mh_q_139_cast_fp16, y = cos_1_cast_fp16)[name = string("op_9180_cast_fp16")];
+            tensor<int32, [4]> var_9185_begin_0 = const()[name = string("op_9185_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_9185_end_0 = const()[name = string("op_9185_end_0"), val = tensor<int32, [4]>([1, 16, 64, 1])];
+            tensor<bool, [4]> var_9185_end_mask_0 = const()[name = string("op_9185_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 16, 64, 1]> var_9185_cast_fp16 = slice_by_index(begin = var_9185_begin_0, end = var_9185_end_0, end_mask = var_9185_end_mask_0, x = mh_q_139_cast_fp16)[name = string("op_9185_cast_fp16")];
+            tensor<int32, [4]> var_9191_begin_0 = const()[name = string("op_9191_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_9191_end_0 = const()[name = string("op_9191_end_0"), val = tensor<int32, [4]>([1, 16, 128, 1])];
+            tensor<bool, [4]> var_9191_end_mask_0 = const()[name = string("op_9191_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 16, 64, 1]> var_9191_cast_fp16 = slice_by_index(begin = var_9191_begin_0, end = var_9191_end_0, end_mask = var_9191_end_mask_0, x = mh_q_139_cast_fp16)[name = string("op_9191_cast_fp16")];
+            fp16 const_546_promoted_to_fp16 = const()[name = string("const_546_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 16, 64, 1]> var_9193_cast_fp16 = mul(x = var_9191_cast_fp16, y = const_546_promoted_to_fp16)[name = string("op_9193_cast_fp16")];
+            bool var_9195_interleave_0 = const()[name = string("op_9195_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 1]> var_9195_cast_fp16 = concat(axis = var_9073, interleave = var_9195_interleave_0, values = (var_9193_cast_fp16, var_9185_cast_fp16))[name = string("op_9195_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> var_9196_cast_fp16 = mul(x = var_9195_cast_fp16, y = sin_1_cast_fp16)[name = string("op_9196_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> mh_q_141_cast_fp16 = add(x = var_9180_cast_fp16, y = var_9196_cast_fp16)[name = string("mh_q_141_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> var_9198_cast_fp16 = mul(x = mh_k_93_cast_fp16, y = cos_1_cast_fp16)[name = string("op_9198_cast_fp16")];
+            tensor<int32, [4]> var_9203_begin_0 = const()[name = string("op_9203_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_9203_end_0 = const()[name = string("op_9203_end_0"), val = tensor<int32, [4]>([1, 8, 64, 1])];
+            tensor<bool, [4]> var_9203_end_mask_0 = const()[name = string("op_9203_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 8, 64, 1]> var_9203_cast_fp16 = slice_by_index(begin = var_9203_begin_0, end = var_9203_end_0, end_mask = var_9203_end_mask_0, x = mh_k_93_cast_fp16)[name = string("op_9203_cast_fp16")];
+            tensor<int32, [4]> var_9209_begin_0 = const()[name = string("op_9209_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_9209_end_0 = const()[name = string("op_9209_end_0"), val = tensor<int32, [4]>([1, 8, 128, 1])];
+            tensor<bool, [4]> var_9209_end_mask_0 = const()[name = string("op_9209_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 8, 64, 1]> var_9209_cast_fp16 = slice_by_index(begin = var_9209_begin_0, end = var_9209_end_0, end_mask = var_9209_end_mask_0, x = mh_k_93_cast_fp16)[name = string("op_9209_cast_fp16")];
+            fp16 const_549_promoted_to_fp16 = const()[name = string("const_549_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 64, 1]> var_9211_cast_fp16 = mul(x = var_9209_cast_fp16, y = const_549_promoted_to_fp16)[name = string("op_9211_cast_fp16")];
+            bool var_9213_interleave_0 = const()[name = string("op_9213_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 128, 1]> var_9213_cast_fp16 = concat(axis = var_9073, interleave = var_9213_interleave_0, values = (var_9211_cast_fp16, var_9203_cast_fp16))[name = string("op_9213_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> var_9214_cast_fp16 = mul(x = var_9213_cast_fp16, y = sin_1_cast_fp16)[name = string("op_9214_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> mh_k_95_cast_fp16 = add(x = var_9198_cast_fp16, y = var_9214_cast_fp16)[name = string("mh_k_95_cast_fp16")];
+            tensor<int32, [4]> var_9218 = const()[name = string("op_9218"), val = tensor<int32, [4]>([1, 1024, 1, 1])];
+            tensor<fp16, [1, 1024, 1, 1]> current_key_95_cast_fp16 = reshape(shape = var_9218, x = mh_k_95_cast_fp16)[name = string("current_key_95_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_9225_cast_fp16 = mul(x = var_101_cast_fp16_23, y = var_323_cast_fp16)[name = string("op_9225_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_9226_cast_fp16 = mul(x = current_key_95_cast_fp16, y = var_321_cast_fp16)[name = string("op_9226_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> key_141_cast_fp16 = add(x = var_9225_cast_fp16, y = var_9226_cast_fp16)[name = string("key_141_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_9229_cast_fp16 = mul(x = var_132_cast_fp16_23, y = var_323_cast_fp16)[name = string("op_9229_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_9230_cast_fp16 = mul(x = current_value_47_cast_fp16, y = var_321_cast_fp16)[name = string("op_9230_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> value_93_cast_fp16 = add(x = var_9229_cast_fp16, y = var_9230_cast_fp16)[name = string("value_93_cast_fp16")];
+            tensor<int32, [4]> var_9234 = const()[name = string("op_9234"), val = tensor<int32, [4]>([1, 8, 128, 256])];
+            tensor<fp16, [1, 8, 128, 256]> key_heads_93_cast_fp16 = reshape(shape = var_9234, x = key_141_cast_fp16)[name = string("key_heads_93_cast_fp16")];
+            tensor<int32, [4]> var_9236 = const()[name = string("op_9236"), val = tensor<int32, [4]>([1, 8, 128, 256])];
+            tensor<fp16, [1, 8, 128, 256]> value_heads_93_cast_fp16 = reshape(shape = var_9236, x = value_93_cast_fp16)[name = string("value_heads_93_cast_fp16")];
+            tensor<int32, [4]> var_9239_begin_0 = const()[name = string("op_9239_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_9239_end_0 = const()[name = string("op_9239_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_9239_end_mask_0 = const()[name = string("op_9239_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_9239_cast_fp16 = slice_by_index(begin = var_9239_begin_0, end = var_9239_end_0, end_mask = var_9239_end_mask_0, x = key_heads_93_cast_fp16)[name = string("op_9239_cast_fp16")];
+            tensor<int32, [4]> var_9243_begin_0 = const()[name = string("op_9243_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_9243_end_0 = const()[name = string("op_9243_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_9243_end_mask_0 = const()[name = string("op_9243_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_9243_cast_fp16 = slice_by_index(begin = var_9243_begin_0, end = var_9243_end_0, end_mask = var_9243_end_mask_0, x = value_heads_93_cast_fp16)[name = string("op_9243_cast_fp16")];
+            tensor<int32, [4]> var_9255_begin_0 = const()[name = string("op_9255_begin_0"), val = tensor<int32, [4]>([0, 1, 0, 0])];
+            tensor<int32, [4]> var_9255_end_0 = const()[name = string("op_9255_end_0"), val = tensor<int32, [4]>([1, 2, 128, 256])];
+            tensor<bool, [4]> var_9255_end_mask_0 = const()[name = string("op_9255_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_9255_cast_fp16 = slice_by_index(begin = var_9255_begin_0, end = var_9255_end_0, end_mask = var_9255_end_mask_0, x = key_heads_93_cast_fp16)[name = string("op_9255_cast_fp16")];
+            tensor<int32, [4]> var_9259_begin_0 = const()[name = string("op_9259_begin_0"), val = tensor<int32, [4]>([0, 1, 0, 0])];
+            tensor<int32, [4]> var_9259_end_0 = const()[name = string("op_9259_end_0"), val = tensor<int32, [4]>([1, 2, 128, 256])];
+            tensor<bool, [4]> var_9259_end_mask_0 = const()[name = string("op_9259_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_9259_cast_fp16 = slice_by_index(begin = var_9259_begin_0, end = var_9259_end_0, end_mask = var_9259_end_mask_0, x = value_heads_93_cast_fp16)[name = string("op_9259_cast_fp16")];
+            tensor<int32, [4]> var_9271_begin_0 = const()[name = string("op_9271_begin_0"), val = tensor<int32, [4]>([0, 2, 0, 0])];
+            tensor<int32, [4]> var_9271_end_0 = const()[name = string("op_9271_end_0"), val = tensor<int32, [4]>([1, 3, 128, 256])];
+            tensor<bool, [4]> var_9271_end_mask_0 = const()[name = string("op_9271_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_9271_cast_fp16 = slice_by_index(begin = var_9271_begin_0, end = var_9271_end_0, end_mask = var_9271_end_mask_0, x = key_heads_93_cast_fp16)[name = string("op_9271_cast_fp16")];
+            tensor<int32, [4]> var_9275_begin_0 = const()[name = string("op_9275_begin_0"), val = tensor<int32, [4]>([0, 2, 0, 0])];
+            tensor<int32, [4]> var_9275_end_0 = const()[name = string("op_9275_end_0"), val = tensor<int32, [4]>([1, 3, 128, 256])];
+            tensor<bool, [4]> var_9275_end_mask_0 = const()[name = string("op_9275_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_9275_cast_fp16 = slice_by_index(begin = var_9275_begin_0, end = var_9275_end_0, end_mask = var_9275_end_mask_0, x = value_heads_93_cast_fp16)[name = string("op_9275_cast_fp16")];
+            tensor<int32, [4]> var_9287_begin_0 = const()[name = string("op_9287_begin_0"), val = tensor<int32, [4]>([0, 3, 0, 0])];
+            tensor<int32, [4]> var_9287_end_0 = const()[name = string("op_9287_end_0"), val = tensor<int32, [4]>([1, 4, 128, 256])];
+            tensor<bool, [4]> var_9287_end_mask_0 = const()[name = string("op_9287_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_9287_cast_fp16 = slice_by_index(begin = var_9287_begin_0, end = var_9287_end_0, end_mask = var_9287_end_mask_0, x = key_heads_93_cast_fp16)[name = string("op_9287_cast_fp16")];
+            tensor<int32, [4]> var_9291_begin_0 = const()[name = string("op_9291_begin_0"), val = tensor<int32, [4]>([0, 3, 0, 0])];
+            tensor<int32, [4]> var_9291_end_0 = const()[name = string("op_9291_end_0"), val = tensor<int32, [4]>([1, 4, 128, 256])];
+            tensor<bool, [4]> var_9291_end_mask_0 = const()[name = string("op_9291_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_9291_cast_fp16 = slice_by_index(begin = var_9291_begin_0, end = var_9291_end_0, end_mask = var_9291_end_mask_0, x = value_heads_93_cast_fp16)[name = string("op_9291_cast_fp16")];
+            tensor<int32, [4]> var_9303_begin_0 = const()[name = string("op_9303_begin_0"), val = tensor<int32, [4]>([0, 4, 0, 0])];
+            tensor<int32, [4]> var_9303_end_0 = const()[name = string("op_9303_end_0"), val = tensor<int32, [4]>([1, 5, 128, 256])];
+            tensor<bool, [4]> var_9303_end_mask_0 = const()[name = string("op_9303_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_9303_cast_fp16 = slice_by_index(begin = var_9303_begin_0, end = var_9303_end_0, end_mask = var_9303_end_mask_0, x = key_heads_93_cast_fp16)[name = string("op_9303_cast_fp16")];
+            tensor<int32, [4]> var_9307_begin_0 = const()[name = string("op_9307_begin_0"), val = tensor<int32, [4]>([0, 4, 0, 0])];
+            tensor<int32, [4]> var_9307_end_0 = const()[name = string("op_9307_end_0"), val = tensor<int32, [4]>([1, 5, 128, 256])];
+            tensor<bool, [4]> var_9307_end_mask_0 = const()[name = string("op_9307_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_9307_cast_fp16 = slice_by_index(begin = var_9307_begin_0, end = var_9307_end_0, end_mask = var_9307_end_mask_0, x = value_heads_93_cast_fp16)[name = string("op_9307_cast_fp16")];
+            tensor<int32, [4]> var_9319_begin_0 = const()[name = string("op_9319_begin_0"), val = tensor<int32, [4]>([0, 5, 0, 0])];
+            tensor<int32, [4]> var_9319_end_0 = const()[name = string("op_9319_end_0"), val = tensor<int32, [4]>([1, 6, 128, 256])];
+            tensor<bool, [4]> var_9319_end_mask_0 = const()[name = string("op_9319_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_9319_cast_fp16 = slice_by_index(begin = var_9319_begin_0, end = var_9319_end_0, end_mask = var_9319_end_mask_0, x = key_heads_93_cast_fp16)[name = string("op_9319_cast_fp16")];
+            tensor<int32, [4]> var_9323_begin_0 = const()[name = string("op_9323_begin_0"), val = tensor<int32, [4]>([0, 5, 0, 0])];
+            tensor<int32, [4]> var_9323_end_0 = const()[name = string("op_9323_end_0"), val = tensor<int32, [4]>([1, 6, 128, 256])];
+            tensor<bool, [4]> var_9323_end_mask_0 = const()[name = string("op_9323_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_9323_cast_fp16 = slice_by_index(begin = var_9323_begin_0, end = var_9323_end_0, end_mask = var_9323_end_mask_0, x = value_heads_93_cast_fp16)[name = string("op_9323_cast_fp16")];
+            tensor<int32, [4]> var_9335_begin_0 = const()[name = string("op_9335_begin_0"), val = tensor<int32, [4]>([0, 6, 0, 0])];
+            tensor<int32, [4]> var_9335_end_0 = const()[name = string("op_9335_end_0"), val = tensor<int32, [4]>([1, 7, 128, 256])];
+            tensor<bool, [4]> var_9335_end_mask_0 = const()[name = string("op_9335_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_9335_cast_fp16 = slice_by_index(begin = var_9335_begin_0, end = var_9335_end_0, end_mask = var_9335_end_mask_0, x = key_heads_93_cast_fp16)[name = string("op_9335_cast_fp16")];
+            tensor<int32, [4]> var_9339_begin_0 = const()[name = string("op_9339_begin_0"), val = tensor<int32, [4]>([0, 6, 0, 0])];
+            tensor<int32, [4]> var_9339_end_0 = const()[name = string("op_9339_end_0"), val = tensor<int32, [4]>([1, 7, 128, 256])];
+            tensor<bool, [4]> var_9339_end_mask_0 = const()[name = string("op_9339_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_9339_cast_fp16 = slice_by_index(begin = var_9339_begin_0, end = var_9339_end_0, end_mask = var_9339_end_mask_0, x = value_heads_93_cast_fp16)[name = string("op_9339_cast_fp16")];
+            tensor<int32, [4]> var_9351_begin_0 = const()[name = string("op_9351_begin_0"), val = tensor<int32, [4]>([0, 7, 0, 0])];
+            tensor<int32, [4]> var_9351_end_0 = const()[name = string("op_9351_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_9351_end_mask_0 = const()[name = string("op_9351_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_9351_cast_fp16 = slice_by_index(begin = var_9351_begin_0, end = var_9351_end_0, end_mask = var_9351_end_mask_0, x = key_heads_93_cast_fp16)[name = string("op_9351_cast_fp16")];
+            tensor<int32, [4]> var_9355_begin_0 = const()[name = string("op_9355_begin_0"), val = tensor<int32, [4]>([0, 7, 0, 0])];
+            tensor<int32, [4]> var_9355_end_0 = const()[name = string("op_9355_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_9355_end_mask_0 = const()[name = string("op_9355_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_9355_cast_fp16 = slice_by_index(begin = var_9355_begin_0, end = var_9355_end_0, end_mask = var_9355_end_mask_0, x = value_heads_93_cast_fp16)[name = string("op_9355_cast_fp16")];
+            bool key_heads_95_interleave_0 = const()[name = string("key_heads_95_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 256]> key_heads_95_cast_fp16 = concat(axis = var_9081, interleave = key_heads_95_interleave_0, values = (var_9239_cast_fp16, var_9239_cast_fp16, var_9255_cast_fp16, var_9255_cast_fp16, var_9271_cast_fp16, var_9271_cast_fp16, var_9287_cast_fp16, var_9287_cast_fp16, var_9303_cast_fp16, var_9303_cast_fp16, var_9319_cast_fp16, var_9319_cast_fp16, var_9335_cast_fp16, var_9335_cast_fp16, var_9351_cast_fp16, var_9351_cast_fp16))[name = string("key_heads_95_cast_fp16")];
+            bool value_heads_95_interleave_0 = const()[name = string("value_heads_95_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 256]> value_heads_95_cast_fp16 = concat(axis = var_9081, interleave = value_heads_95_interleave_0, values = (var_9243_cast_fp16, var_9243_cast_fp16, var_9259_cast_fp16, var_9259_cast_fp16, var_9275_cast_fp16, var_9275_cast_fp16, var_9291_cast_fp16, var_9291_cast_fp16, var_9307_cast_fp16, var_9307_cast_fp16, var_9323_cast_fp16, var_9323_cast_fp16, var_9339_cast_fp16, var_9339_cast_fp16, var_9355_cast_fp16, var_9355_cast_fp16))[name = string("value_heads_95_cast_fp16")];
+            fp16 var_9378_to_fp16 = const()[name = string("op_9378_to_fp16"), val = fp16(0x1.6ap-4)];
+            tensor<fp16, [1, 16, 128, 1]> var_9379_cast_fp16 = mul(x = mh_q_141_cast_fp16, y = var_9378_to_fp16)[name = string("op_9379_cast_fp16")];
+            bool mh_w_93_transpose_x_0 = const()[name = string("mh_w_93_transpose_x_0"), val = bool(true)];
+            bool mh_w_93_transpose_y_0 = const()[name = string("mh_w_93_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 1, 256]> mh_w_93_cast_fp16 = matmul(transpose_x = mh_w_93_transpose_x_0, transpose_y = mh_w_93_transpose_y_0, x = var_9379_cast_fp16, y = key_heads_95_cast_fp16)[name = string("mh_w_93_cast_fp16")];
+            tensor<fp16, [1, 16, 1, 256]> mh_w_95_cast_fp16 = add(x = mh_w_93_cast_fp16, y = var_487_cast_fp16)[name = string("mh_w_95_cast_fp16")];
+            tensor<fp16, [1, 16, 1, 256]> var_9391_cast_fp16 = softmax(axis = var_9063, x = mh_w_95_cast_fp16)[name = string("op_9391_cast_fp16")];
+            bool attn_47_transpose_x_0 = const()[name = string("attn_47_transpose_x_0"), val = bool(false)];
+            bool attn_47_transpose_y_0 = const()[name = string("attn_47_transpose_y_0"), val = bool(true)];
+            tensor<fp16, [1, 16, 128, 1]> attn_47_cast_fp16 = matmul(transpose_x = attn_47_transpose_x_0, transpose_y = attn_47_transpose_y_0, x = value_heads_95_cast_fp16, y = var_9391_cast_fp16)[name = string("attn_47_cast_fp16")];
+            tensor<int32, [4]> var_9396 = const()[name = string("op_9396"), val = tensor<int32, [4]>([1, -1, 1, 1])];
+            tensor<fp16, [1, 2048, 1, 1]> input_185_cast_fp16 = reshape(shape = var_9396, x = attn_47_cast_fp16)[name = string("input_185_cast_fp16")];
+            string obj_195_pad_type_0 = const()[name = string("obj_195_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> obj_195_strides_0 = const()[name = string("obj_195_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> obj_195_pad_0 = const()[name = string("obj_195_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> obj_195_dilations_0 = const()[name = string("obj_195_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 obj_195_groups_0 = const()[name = string("obj_195_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 2048, 1, 1]> layers_23_self_attn_o_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(366246144))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(368343360))))[name = string("layers_23_self_attn_o_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> obj_195_cast_fp16 = conv(bias = layers_0_self_attn_v_proj_bias_to_fp16, dilations = obj_195_dilations_0, groups = obj_195_groups_0, pad = obj_195_pad_0, pad_type = obj_195_pad_type_0, strides = obj_195_strides_0, weight = layers_23_self_attn_o_proj_weight_to_fp16_palettized, x = input_185_cast_fp16)[name = string("obj_195_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> inputs_189_cast_fp16 = add(x = inputs_183_cast_fp16, y = obj_195_cast_fp16)[name = string("inputs_189_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> inputs_sq_191_cast_fp16 = mul(x = inputs_189_cast_fp16, y = inputs_189_cast_fp16)[name = string("inputs_sq_191_cast_fp16")];
+            tensor<int32, [1]> variance_191_axes_0 = const()[name = string("variance_191_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_191_keep_dims_0 = const()[name = string("variance_191_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 1, 1, 1]> variance_191_cast_fp16 = reduce_mean(axes = variance_191_axes_0, keep_dims = variance_191_keep_dims_0, x = inputs_sq_191_cast_fp16)[name = string("variance_191_cast_fp16")];
+            fp16 var_9414_to_fp16 = const()[name = string("op_9414_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 1, 1]> var_9415_cast_fp16 = add(x = variance_191_cast_fp16, y = var_9414_to_fp16)[name = string("op_9415_cast_fp16")];
+            fp32 var_9416_epsilon_0 = const()[name = string("op_9416_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 1, 1, 1]> var_9416_cast_fp16 = rsqrt(epsilon = var_9416_epsilon_0, x = var_9415_cast_fp16)[name = string("op_9416_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> hidden_states_237_cast_fp16 = mul(x = inputs_189_cast_fp16, y = var_9416_cast_fp16)[name = string("hidden_states_237_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> w_191_to_fp16 = const()[name = string("w_191_to_fp16"), val = tensor<fp16, [1, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(368343936)))];
+            tensor<fp16, [1, 1024, 1, 1]> input_187_cast_fp16 = mul(x = w_191_to_fp16, y = hidden_states_237_cast_fp16)[name = string("input_187_cast_fp16")];
+            string input_189_pad_type_0 = const()[name = string("input_189_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> input_189_strides_0 = const()[name = string("input_189_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> input_189_pad_0 = const()[name = string("input_189_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> input_189_dilations_0 = const()[name = string("input_189_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 input_189_groups_0 = const()[name = string("input_189_groups_0"), val = int32(1)];
+            tensor<fp16, [3072, 1024, 1, 1]> layers_23_mlp_gate_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [3072, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(368346048))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(371491840))))[name = string("layers_23_mlp_gate_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 3072, 1, 1]> input_189_cast_fp16 = conv(dilations = input_189_dilations_0, groups = input_189_groups_0, pad = input_189_pad_0, pad_type = input_189_pad_type_0, strides = input_189_strides_0, weight = layers_23_mlp_gate_proj_weight_to_fp16_palettized, x = input_187_cast_fp16)[name = string("input_189_cast_fp16")];
+            tensor<fp16, [1, 3072, 1, 1]> var_9430_cast_fp16 = silu(x = input_189_cast_fp16)[name = string("op_9430_cast_fp16")];
+            string var_9436_pad_type_0 = const()[name = string("op_9436_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_9436_strides_0 = const()[name = string("op_9436_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_9436_pad_0 = const()[name = string("op_9436_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_9436_dilations_0 = const()[name = string("op_9436_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_9436_groups_0 = const()[name = string("op_9436_groups_0"), val = int32(1)];
+            tensor<fp16, [3072, 1024, 1, 1]> layers_23_mlp_up_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [3072, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(371492416))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(374638208))))[name = string("layers_23_mlp_up_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 3072, 1, 1]> var_9436_cast_fp16 = conv(dilations = var_9436_dilations_0, groups = var_9436_groups_0, pad = var_9436_pad_0, pad_type = var_9436_pad_type_0, strides = var_9436_strides_0, weight = layers_23_mlp_up_proj_weight_to_fp16_palettized, x = input_187_cast_fp16)[name = string("op_9436_cast_fp16")];
+            tensor<fp16, [1, 3072, 1, 1]> input_191_cast_fp16 = mul(x = var_9430_cast_fp16, y = var_9436_cast_fp16)[name = string("input_191_cast_fp16")];
+            string hidden_states_239_pad_type_0 = const()[name = string("hidden_states_239_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> hidden_states_239_strides_0 = const()[name = string("hidden_states_239_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> hidden_states_239_pad_0 = const()[name = string("hidden_states_239_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> hidden_states_239_dilations_0 = const()[name = string("hidden_states_239_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 hidden_states_239_groups_0 = const()[name = string("hidden_states_239_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 3072, 1, 1]> layers_23_mlp_down_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 3072, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(374638784))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(377784576))))[name = string("layers_23_mlp_down_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> hidden_states_239_cast_fp16 = conv(dilations = hidden_states_239_dilations_0, groups = hidden_states_239_groups_0, pad = hidden_states_239_pad_0, pad_type = hidden_states_239_pad_type_0, strides = hidden_states_239_strides_0, weight = layers_23_mlp_down_proj_weight_to_fp16_palettized, x = input_191_cast_fp16)[name = string("hidden_states_239_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> inputs_191_cast_fp16 = add(x = inputs_189_cast_fp16, y = hidden_states_239_cast_fp16)[name = string("inputs_191_cast_fp16")];
+            int32 var_9450 = const()[name = string("op_9450"), val = int32(3)];
+            int32 var_9460 = const()[name = string("op_9460"), val = int32(-2)];
+            int32 var_9468 = const()[name = string("op_9468"), val = int32(1)];
+            tensor<fp16, [1, 1024, 1, 1]> inputs_sq_193_cast_fp16 = mul(x = inputs_191_cast_fp16, y = inputs_191_cast_fp16)[name = string("inputs_sq_193_cast_fp16")];
+            tensor<int32, [1]> variance_193_axes_0 = const()[name = string("variance_193_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_193_keep_dims_0 = const()[name = string("variance_193_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 1, 1, 1]> variance_193_cast_fp16 = reduce_mean(axes = variance_193_axes_0, keep_dims = variance_193_keep_dims_0, x = inputs_sq_193_cast_fp16)[name = string("variance_193_cast_fp16")];
+            fp16 var_9480_to_fp16 = const()[name = string("op_9480_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 1, 1]> var_9481_cast_fp16 = add(x = variance_193_cast_fp16, y = var_9480_to_fp16)[name = string("op_9481_cast_fp16")];
+            fp32 var_9482_epsilon_0 = const()[name = string("op_9482_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 1, 1, 1]> var_9482_cast_fp16 = rsqrt(epsilon = var_9482_epsilon_0, x = var_9481_cast_fp16)[name = string("op_9482_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> hidden_states_241_cast_fp16 = mul(x = inputs_191_cast_fp16, y = var_9482_cast_fp16)[name = string("hidden_states_241_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> w_193_to_fp16 = const()[name = string("w_193_to_fp16"), val = tensor<fp16, [1, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(377785152)))];
+            tensor<fp16, [1, 1024, 1, 1]> obj_197_cast_fp16 = mul(x = w_193_to_fp16, y = hidden_states_241_cast_fp16)[name = string("obj_197_cast_fp16")];
+            string query_145_pad_type_0 = const()[name = string("query_145_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> query_145_strides_0 = const()[name = string("query_145_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> query_145_pad_0 = const()[name = string("query_145_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> query_145_dilations_0 = const()[name = string("query_145_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 query_145_groups_0 = const()[name = string("query_145_groups_0"), val = int32(1)];
+            tensor<fp16, [2048, 1024, 1, 1]> layers_24_self_attn_q_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [2048, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(377787264))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(379884480))))[name = string("layers_24_self_attn_q_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1, 1]> query_145_cast_fp16 = conv(bias = layers_0_self_attn_q_proj_bias_to_fp16, dilations = query_145_dilations_0, groups = query_145_groups_0, pad = query_145_pad_0, pad_type = query_145_pad_type_0, strides = query_145_strides_0, weight = layers_24_self_attn_q_proj_weight_to_fp16_palettized, x = obj_197_cast_fp16)[name = string("query_145_cast_fp16")];
+            string current_key_97_pad_type_0 = const()[name = string("current_key_97_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> current_key_97_strides_0 = const()[name = string("current_key_97_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> current_key_97_pad_0 = const()[name = string("current_key_97_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> current_key_97_dilations_0 = const()[name = string("current_key_97_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 current_key_97_groups_0 = const()[name = string("current_key_97_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_24_self_attn_k_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(379885056))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(380933696))))[name = string("layers_24_self_attn_k_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> current_key_97_cast_fp16 = conv(dilations = current_key_97_dilations_0, groups = current_key_97_groups_0, pad = current_key_97_pad_0, pad_type = current_key_97_pad_type_0, strides = current_key_97_strides_0, weight = layers_24_self_attn_k_proj_weight_to_fp16_palettized, x = obj_197_cast_fp16)[name = string("current_key_97_cast_fp16")];
+            string current_value_49_pad_type_0 = const()[name = string("current_value_49_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> current_value_49_strides_0 = const()[name = string("current_value_49_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> current_value_49_pad_0 = const()[name = string("current_value_49_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> current_value_49_dilations_0 = const()[name = string("current_value_49_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 current_value_49_groups_0 = const()[name = string("current_value_49_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_24_self_attn_v_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(380934272))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(381982912))))[name = string("layers_24_self_attn_v_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> current_value_49_cast_fp16 = conv(bias = layers_0_self_attn_v_proj_bias_to_fp16, dilations = current_value_49_dilations_0, groups = current_value_49_groups_0, pad = current_value_49_pad_0, pad_type = current_value_49_pad_type_0, strides = current_value_49_strides_0, weight = layers_24_self_attn_v_proj_weight_to_fp16_palettized, x = obj_197_cast_fp16)[name = string("current_value_49_cast_fp16")];
+            tensor<int32, [4]> var_9519 = const()[name = string("op_9519"), val = tensor<int32, [4]>([16, 128, 1, 1])];
+            tensor<fp16, [16, 128, 1, 1]> inputs_193_cast_fp16 = reshape(shape = var_9519, x = query_145_cast_fp16)[name = string("inputs_193_cast_fp16")];
+            tensor<fp16, [16, 128, 1, 1]> inputs_sq_195_cast_fp16 = mul(x = inputs_193_cast_fp16, y = inputs_193_cast_fp16)[name = string("inputs_sq_195_cast_fp16")];
+            tensor<int32, [1]> variance_195_axes_0 = const()[name = string("variance_195_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_195_keep_dims_0 = const()[name = string("variance_195_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [16, 1, 1, 1]> variance_195_cast_fp16 = reduce_mean(axes = variance_195_axes_0, keep_dims = variance_195_keep_dims_0, x = inputs_sq_195_cast_fp16)[name = string("variance_195_cast_fp16")];
+            fp16 var_9525_to_fp16 = const()[name = string("op_9525_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [16, 1, 1, 1]> var_9526_cast_fp16 = add(x = variance_195_cast_fp16, y = var_9525_to_fp16)[name = string("op_9526_cast_fp16")];
+            fp32 var_9527_epsilon_0 = const()[name = string("op_9527_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [16, 1, 1, 1]> var_9527_cast_fp16 = rsqrt(epsilon = var_9527_epsilon_0, x = var_9526_cast_fp16)[name = string("op_9527_cast_fp16")];
+            tensor<fp16, [16, 128, 1, 1]> hidden_states_243_cast_fp16 = mul(x = inputs_193_cast_fp16, y = var_9527_cast_fp16)[name = string("hidden_states_243_cast_fp16")];
+            tensor<fp16, [1, 128, 1, 1]> w_195_to_fp16 = const()[name = string("w_195_to_fp16"), val = tensor<fp16, [1, 128, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(381983488)))];
+            tensor<fp16, [16, 128, 1, 1]> query_normed_49_cast_fp16 = mul(x = w_195_to_fp16, y = hidden_states_243_cast_fp16)[name = string("query_normed_49_cast_fp16")];
+            tensor<int32, [4]> var_9535 = const()[name = string("op_9535"), val = tensor<int32, [4]>([8, 128, 1, 1])];
+            tensor<fp16, [8, 128, 1, 1]> inputs_195_cast_fp16 = reshape(shape = var_9535, x = current_key_97_cast_fp16)[name = string("inputs_195_cast_fp16")];
+            tensor<fp16, [8, 128, 1, 1]> inputs_sq_197_cast_fp16 = mul(x = inputs_195_cast_fp16, y = inputs_195_cast_fp16)[name = string("inputs_sq_197_cast_fp16")];
+            tensor<int32, [1]> variance_197_axes_0 = const()[name = string("variance_197_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_197_keep_dims_0 = const()[name = string("variance_197_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [8, 1, 1, 1]> variance_197_cast_fp16 = reduce_mean(axes = variance_197_axes_0, keep_dims = variance_197_keep_dims_0, x = inputs_sq_197_cast_fp16)[name = string("variance_197_cast_fp16")];
+            fp16 var_9541_to_fp16 = const()[name = string("op_9541_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [8, 1, 1, 1]> var_9542_cast_fp16 = add(x = variance_197_cast_fp16, y = var_9541_to_fp16)[name = string("op_9542_cast_fp16")];
+            fp32 var_9543_epsilon_0 = const()[name = string("op_9543_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [8, 1, 1, 1]> var_9543_cast_fp16 = rsqrt(epsilon = var_9543_epsilon_0, x = var_9542_cast_fp16)[name = string("op_9543_cast_fp16")];
+            tensor<fp16, [8, 128, 1, 1]> hidden_states_245_cast_fp16 = mul(x = inputs_195_cast_fp16, y = var_9543_cast_fp16)[name = string("hidden_states_245_cast_fp16")];
+            tensor<fp16, [1, 128, 1, 1]> w_197_to_fp16 = const()[name = string("w_197_to_fp16"), val = tensor<fp16, [1, 128, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(381983808)))];
+            tensor<fp16, [8, 128, 1, 1]> current_key_normed_49_cast_fp16 = mul(x = w_197_to_fp16, y = hidden_states_245_cast_fp16)[name = string("current_key_normed_49_cast_fp16")];
+            tensor<int32, [4]> var_9561 = const()[name = string("op_9561"), val = tensor<int32, [4]>([1, 16, 128, -1])];
+            tensor<fp16, [1, 16, 128, 1]> mh_q_145_cast_fp16 = reshape(shape = var_9561, x = query_normed_49_cast_fp16)[name = string("mh_q_145_cast_fp16")];
+            tensor<int32, [4]> var_9563 = const()[name = string("op_9563"), val = tensor<int32, [4]>([1, 8, 128, -1])];
+            tensor<fp16, [1, 8, 128, 1]> mh_k_97_cast_fp16 = reshape(shape = var_9563, x = current_key_normed_49_cast_fp16)[name = string("mh_k_97_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> var_9567_cast_fp16 = mul(x = mh_q_145_cast_fp16, y = cos_1_cast_fp16)[name = string("op_9567_cast_fp16")];
+            tensor<int32, [4]> var_9572_begin_0 = const()[name = string("op_9572_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_9572_end_0 = const()[name = string("op_9572_end_0"), val = tensor<int32, [4]>([1, 16, 64, 1])];
+            tensor<bool, [4]> var_9572_end_mask_0 = const()[name = string("op_9572_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 16, 64, 1]> var_9572_cast_fp16 = slice_by_index(begin = var_9572_begin_0, end = var_9572_end_0, end_mask = var_9572_end_mask_0, x = mh_q_145_cast_fp16)[name = string("op_9572_cast_fp16")];
+            tensor<int32, [4]> var_9578_begin_0 = const()[name = string("op_9578_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_9578_end_0 = const()[name = string("op_9578_end_0"), val = tensor<int32, [4]>([1, 16, 128, 1])];
+            tensor<bool, [4]> var_9578_end_mask_0 = const()[name = string("op_9578_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 16, 64, 1]> var_9578_cast_fp16 = slice_by_index(begin = var_9578_begin_0, end = var_9578_end_0, end_mask = var_9578_end_mask_0, x = mh_q_145_cast_fp16)[name = string("op_9578_cast_fp16")];
+            fp16 const_569_promoted_to_fp16 = const()[name = string("const_569_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 16, 64, 1]> var_9580_cast_fp16 = mul(x = var_9578_cast_fp16, y = const_569_promoted_to_fp16)[name = string("op_9580_cast_fp16")];
+            bool var_9582_interleave_0 = const()[name = string("op_9582_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 1]> var_9582_cast_fp16 = concat(axis = var_9460, interleave = var_9582_interleave_0, values = (var_9580_cast_fp16, var_9572_cast_fp16))[name = string("op_9582_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> var_9583_cast_fp16 = mul(x = var_9582_cast_fp16, y = sin_1_cast_fp16)[name = string("op_9583_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> mh_q_147_cast_fp16 = add(x = var_9567_cast_fp16, y = var_9583_cast_fp16)[name = string("mh_q_147_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> var_9585_cast_fp16 = mul(x = mh_k_97_cast_fp16, y = cos_1_cast_fp16)[name = string("op_9585_cast_fp16")];
+            tensor<int32, [4]> var_9590_begin_0 = const()[name = string("op_9590_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_9590_end_0 = const()[name = string("op_9590_end_0"), val = tensor<int32, [4]>([1, 8, 64, 1])];
+            tensor<bool, [4]> var_9590_end_mask_0 = const()[name = string("op_9590_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 8, 64, 1]> var_9590_cast_fp16 = slice_by_index(begin = var_9590_begin_0, end = var_9590_end_0, end_mask = var_9590_end_mask_0, x = mh_k_97_cast_fp16)[name = string("op_9590_cast_fp16")];
+            tensor<int32, [4]> var_9596_begin_0 = const()[name = string("op_9596_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_9596_end_0 = const()[name = string("op_9596_end_0"), val = tensor<int32, [4]>([1, 8, 128, 1])];
+            tensor<bool, [4]> var_9596_end_mask_0 = const()[name = string("op_9596_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 8, 64, 1]> var_9596_cast_fp16 = slice_by_index(begin = var_9596_begin_0, end = var_9596_end_0, end_mask = var_9596_end_mask_0, x = mh_k_97_cast_fp16)[name = string("op_9596_cast_fp16")];
+            fp16 const_572_promoted_to_fp16 = const()[name = string("const_572_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 64, 1]> var_9598_cast_fp16 = mul(x = var_9596_cast_fp16, y = const_572_promoted_to_fp16)[name = string("op_9598_cast_fp16")];
+            bool var_9600_interleave_0 = const()[name = string("op_9600_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 128, 1]> var_9600_cast_fp16 = concat(axis = var_9460, interleave = var_9600_interleave_0, values = (var_9598_cast_fp16, var_9590_cast_fp16))[name = string("op_9600_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> var_9601_cast_fp16 = mul(x = var_9600_cast_fp16, y = sin_1_cast_fp16)[name = string("op_9601_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> mh_k_99_cast_fp16 = add(x = var_9585_cast_fp16, y = var_9601_cast_fp16)[name = string("mh_k_99_cast_fp16")];
+            tensor<int32, [4]> var_9605 = const()[name = string("op_9605"), val = tensor<int32, [4]>([1, 1024, 1, 1])];
+            tensor<fp16, [1, 1024, 1, 1]> current_key_99_cast_fp16 = reshape(shape = var_9605, x = mh_k_99_cast_fp16)[name = string("current_key_99_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_9612_cast_fp16 = mul(x = var_101_cast_fp16_24, y = var_323_cast_fp16)[name = string("op_9612_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_9613_cast_fp16 = mul(x = current_key_99_cast_fp16, y = var_321_cast_fp16)[name = string("op_9613_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> key_147_cast_fp16 = add(x = var_9612_cast_fp16, y = var_9613_cast_fp16)[name = string("key_147_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_9616_cast_fp16 = mul(x = var_132_cast_fp16_24, y = var_323_cast_fp16)[name = string("op_9616_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_9617_cast_fp16 = mul(x = current_value_49_cast_fp16, y = var_321_cast_fp16)[name = string("op_9617_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> value_97_cast_fp16 = add(x = var_9616_cast_fp16, y = var_9617_cast_fp16)[name = string("value_97_cast_fp16")];
+            tensor<int32, [4]> var_9621 = const()[name = string("op_9621"), val = tensor<int32, [4]>([1, 8, 128, 256])];
+            tensor<fp16, [1, 8, 128, 256]> key_heads_97_cast_fp16 = reshape(shape = var_9621, x = key_147_cast_fp16)[name = string("key_heads_97_cast_fp16")];
+            tensor<int32, [4]> var_9623 = const()[name = string("op_9623"), val = tensor<int32, [4]>([1, 8, 128, 256])];
+            tensor<fp16, [1, 8, 128, 256]> value_heads_97_cast_fp16 = reshape(shape = var_9623, x = value_97_cast_fp16)[name = string("value_heads_97_cast_fp16")];
+            tensor<int32, [4]> var_9626_begin_0 = const()[name = string("op_9626_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_9626_end_0 = const()[name = string("op_9626_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_9626_end_mask_0 = const()[name = string("op_9626_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_9626_cast_fp16 = slice_by_index(begin = var_9626_begin_0, end = var_9626_end_0, end_mask = var_9626_end_mask_0, x = key_heads_97_cast_fp16)[name = string("op_9626_cast_fp16")];
+            tensor<int32, [4]> var_9630_begin_0 = const()[name = string("op_9630_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_9630_end_0 = const()[name = string("op_9630_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_9630_end_mask_0 = const()[name = string("op_9630_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_9630_cast_fp16 = slice_by_index(begin = var_9630_begin_0, end = var_9630_end_0, end_mask = var_9630_end_mask_0, x = value_heads_97_cast_fp16)[name = string("op_9630_cast_fp16")];
+            tensor<int32, [4]> var_9642_begin_0 = const()[name = string("op_9642_begin_0"), val = tensor<int32, [4]>([0, 1, 0, 0])];
+            tensor<int32, [4]> var_9642_end_0 = const()[name = string("op_9642_end_0"), val = tensor<int32, [4]>([1, 2, 128, 256])];
+            tensor<bool, [4]> var_9642_end_mask_0 = const()[name = string("op_9642_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_9642_cast_fp16 = slice_by_index(begin = var_9642_begin_0, end = var_9642_end_0, end_mask = var_9642_end_mask_0, x = key_heads_97_cast_fp16)[name = string("op_9642_cast_fp16")];
+            tensor<int32, [4]> var_9646_begin_0 = const()[name = string("op_9646_begin_0"), val = tensor<int32, [4]>([0, 1, 0, 0])];
+            tensor<int32, [4]> var_9646_end_0 = const()[name = string("op_9646_end_0"), val = tensor<int32, [4]>([1, 2, 128, 256])];
+            tensor<bool, [4]> var_9646_end_mask_0 = const()[name = string("op_9646_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_9646_cast_fp16 = slice_by_index(begin = var_9646_begin_0, end = var_9646_end_0, end_mask = var_9646_end_mask_0, x = value_heads_97_cast_fp16)[name = string("op_9646_cast_fp16")];
+            tensor<int32, [4]> var_9658_begin_0 = const()[name = string("op_9658_begin_0"), val = tensor<int32, [4]>([0, 2, 0, 0])];
+            tensor<int32, [4]> var_9658_end_0 = const()[name = string("op_9658_end_0"), val = tensor<int32, [4]>([1, 3, 128, 256])];
+            tensor<bool, [4]> var_9658_end_mask_0 = const()[name = string("op_9658_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_9658_cast_fp16 = slice_by_index(begin = var_9658_begin_0, end = var_9658_end_0, end_mask = var_9658_end_mask_0, x = key_heads_97_cast_fp16)[name = string("op_9658_cast_fp16")];
+            tensor<int32, [4]> var_9662_begin_0 = const()[name = string("op_9662_begin_0"), val = tensor<int32, [4]>([0, 2, 0, 0])];
+            tensor<int32, [4]> var_9662_end_0 = const()[name = string("op_9662_end_0"), val = tensor<int32, [4]>([1, 3, 128, 256])];
+            tensor<bool, [4]> var_9662_end_mask_0 = const()[name = string("op_9662_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_9662_cast_fp16 = slice_by_index(begin = var_9662_begin_0, end = var_9662_end_0, end_mask = var_9662_end_mask_0, x = value_heads_97_cast_fp16)[name = string("op_9662_cast_fp16")];
+            tensor<int32, [4]> var_9674_begin_0 = const()[name = string("op_9674_begin_0"), val = tensor<int32, [4]>([0, 3, 0, 0])];
+            tensor<int32, [4]> var_9674_end_0 = const()[name = string("op_9674_end_0"), val = tensor<int32, [4]>([1, 4, 128, 256])];
+            tensor<bool, [4]> var_9674_end_mask_0 = const()[name = string("op_9674_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_9674_cast_fp16 = slice_by_index(begin = var_9674_begin_0, end = var_9674_end_0, end_mask = var_9674_end_mask_0, x = key_heads_97_cast_fp16)[name = string("op_9674_cast_fp16")];
+            tensor<int32, [4]> var_9678_begin_0 = const()[name = string("op_9678_begin_0"), val = tensor<int32, [4]>([0, 3, 0, 0])];
+            tensor<int32, [4]> var_9678_end_0 = const()[name = string("op_9678_end_0"), val = tensor<int32, [4]>([1, 4, 128, 256])];
+            tensor<bool, [4]> var_9678_end_mask_0 = const()[name = string("op_9678_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_9678_cast_fp16 = slice_by_index(begin = var_9678_begin_0, end = var_9678_end_0, end_mask = var_9678_end_mask_0, x = value_heads_97_cast_fp16)[name = string("op_9678_cast_fp16")];
+            tensor<int32, [4]> var_9690_begin_0 = const()[name = string("op_9690_begin_0"), val = tensor<int32, [4]>([0, 4, 0, 0])];
+            tensor<int32, [4]> var_9690_end_0 = const()[name = string("op_9690_end_0"), val = tensor<int32, [4]>([1, 5, 128, 256])];
+            tensor<bool, [4]> var_9690_end_mask_0 = const()[name = string("op_9690_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_9690_cast_fp16 = slice_by_index(begin = var_9690_begin_0, end = var_9690_end_0, end_mask = var_9690_end_mask_0, x = key_heads_97_cast_fp16)[name = string("op_9690_cast_fp16")];
+            tensor<int32, [4]> var_9694_begin_0 = const()[name = string("op_9694_begin_0"), val = tensor<int32, [4]>([0, 4, 0, 0])];
+            tensor<int32, [4]> var_9694_end_0 = const()[name = string("op_9694_end_0"), val = tensor<int32, [4]>([1, 5, 128, 256])];
+            tensor<bool, [4]> var_9694_end_mask_0 = const()[name = string("op_9694_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_9694_cast_fp16 = slice_by_index(begin = var_9694_begin_0, end = var_9694_end_0, end_mask = var_9694_end_mask_0, x = value_heads_97_cast_fp16)[name = string("op_9694_cast_fp16")];
+            tensor<int32, [4]> var_9706_begin_0 = const()[name = string("op_9706_begin_0"), val = tensor<int32, [4]>([0, 5, 0, 0])];
+            tensor<int32, [4]> var_9706_end_0 = const()[name = string("op_9706_end_0"), val = tensor<int32, [4]>([1, 6, 128, 256])];
+            tensor<bool, [4]> var_9706_end_mask_0 = const()[name = string("op_9706_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_9706_cast_fp16 = slice_by_index(begin = var_9706_begin_0, end = var_9706_end_0, end_mask = var_9706_end_mask_0, x = key_heads_97_cast_fp16)[name = string("op_9706_cast_fp16")];
+            tensor<int32, [4]> var_9710_begin_0 = const()[name = string("op_9710_begin_0"), val = tensor<int32, [4]>([0, 5, 0, 0])];
+            tensor<int32, [4]> var_9710_end_0 = const()[name = string("op_9710_end_0"), val = tensor<int32, [4]>([1, 6, 128, 256])];
+            tensor<bool, [4]> var_9710_end_mask_0 = const()[name = string("op_9710_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_9710_cast_fp16 = slice_by_index(begin = var_9710_begin_0, end = var_9710_end_0, end_mask = var_9710_end_mask_0, x = value_heads_97_cast_fp16)[name = string("op_9710_cast_fp16")];
+            tensor<int32, [4]> var_9722_begin_0 = const()[name = string("op_9722_begin_0"), val = tensor<int32, [4]>([0, 6, 0, 0])];
+            tensor<int32, [4]> var_9722_end_0 = const()[name = string("op_9722_end_0"), val = tensor<int32, [4]>([1, 7, 128, 256])];
+            tensor<bool, [4]> var_9722_end_mask_0 = const()[name = string("op_9722_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_9722_cast_fp16 = slice_by_index(begin = var_9722_begin_0, end = var_9722_end_0, end_mask = var_9722_end_mask_0, x = key_heads_97_cast_fp16)[name = string("op_9722_cast_fp16")];
+            tensor<int32, [4]> var_9726_begin_0 = const()[name = string("op_9726_begin_0"), val = tensor<int32, [4]>([0, 6, 0, 0])];
+            tensor<int32, [4]> var_9726_end_0 = const()[name = string("op_9726_end_0"), val = tensor<int32, [4]>([1, 7, 128, 256])];
+            tensor<bool, [4]> var_9726_end_mask_0 = const()[name = string("op_9726_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_9726_cast_fp16 = slice_by_index(begin = var_9726_begin_0, end = var_9726_end_0, end_mask = var_9726_end_mask_0, x = value_heads_97_cast_fp16)[name = string("op_9726_cast_fp16")];
+            tensor<int32, [4]> var_9738_begin_0 = const()[name = string("op_9738_begin_0"), val = tensor<int32, [4]>([0, 7, 0, 0])];
+            tensor<int32, [4]> var_9738_end_0 = const()[name = string("op_9738_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_9738_end_mask_0 = const()[name = string("op_9738_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_9738_cast_fp16 = slice_by_index(begin = var_9738_begin_0, end = var_9738_end_0, end_mask = var_9738_end_mask_0, x = key_heads_97_cast_fp16)[name = string("op_9738_cast_fp16")];
+            tensor<int32, [4]> var_9742_begin_0 = const()[name = string("op_9742_begin_0"), val = tensor<int32, [4]>([0, 7, 0, 0])];
+            tensor<int32, [4]> var_9742_end_0 = const()[name = string("op_9742_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_9742_end_mask_0 = const()[name = string("op_9742_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_9742_cast_fp16 = slice_by_index(begin = var_9742_begin_0, end = var_9742_end_0, end_mask = var_9742_end_mask_0, x = value_heads_97_cast_fp16)[name = string("op_9742_cast_fp16")];
+            bool key_heads_99_interleave_0 = const()[name = string("key_heads_99_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 256]> key_heads_99_cast_fp16 = concat(axis = var_9468, interleave = key_heads_99_interleave_0, values = (var_9626_cast_fp16, var_9626_cast_fp16, var_9642_cast_fp16, var_9642_cast_fp16, var_9658_cast_fp16, var_9658_cast_fp16, var_9674_cast_fp16, var_9674_cast_fp16, var_9690_cast_fp16, var_9690_cast_fp16, var_9706_cast_fp16, var_9706_cast_fp16, var_9722_cast_fp16, var_9722_cast_fp16, var_9738_cast_fp16, var_9738_cast_fp16))[name = string("key_heads_99_cast_fp16")];
+            bool value_heads_99_interleave_0 = const()[name = string("value_heads_99_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 256]> value_heads_99_cast_fp16 = concat(axis = var_9468, interleave = value_heads_99_interleave_0, values = (var_9630_cast_fp16, var_9630_cast_fp16, var_9646_cast_fp16, var_9646_cast_fp16, var_9662_cast_fp16, var_9662_cast_fp16, var_9678_cast_fp16, var_9678_cast_fp16, var_9694_cast_fp16, var_9694_cast_fp16, var_9710_cast_fp16, var_9710_cast_fp16, var_9726_cast_fp16, var_9726_cast_fp16, var_9742_cast_fp16, var_9742_cast_fp16))[name = string("value_heads_99_cast_fp16")];
+            fp16 var_9765_to_fp16 = const()[name = string("op_9765_to_fp16"), val = fp16(0x1.6ap-4)];
+            tensor<fp16, [1, 16, 128, 1]> var_9766_cast_fp16 = mul(x = mh_q_147_cast_fp16, y = var_9765_to_fp16)[name = string("op_9766_cast_fp16")];
+            bool mh_w_97_transpose_x_0 = const()[name = string("mh_w_97_transpose_x_0"), val = bool(true)];
+            bool mh_w_97_transpose_y_0 = const()[name = string("mh_w_97_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 1, 256]> mh_w_97_cast_fp16 = matmul(transpose_x = mh_w_97_transpose_x_0, transpose_y = mh_w_97_transpose_y_0, x = var_9766_cast_fp16, y = key_heads_99_cast_fp16)[name = string("mh_w_97_cast_fp16")];
+            tensor<fp16, [1, 16, 1, 256]> mh_w_99_cast_fp16 = add(x = mh_w_97_cast_fp16, y = var_487_cast_fp16)[name = string("mh_w_99_cast_fp16")];
+            tensor<fp16, [1, 16, 1, 256]> var_9778_cast_fp16 = softmax(axis = var_9450, x = mh_w_99_cast_fp16)[name = string("op_9778_cast_fp16")];
+            bool attn_49_transpose_x_0 = const()[name = string("attn_49_transpose_x_0"), val = bool(false)];
+            bool attn_49_transpose_y_0 = const()[name = string("attn_49_transpose_y_0"), val = bool(true)];
+            tensor<fp16, [1, 16, 128, 1]> attn_49_cast_fp16 = matmul(transpose_x = attn_49_transpose_x_0, transpose_y = attn_49_transpose_y_0, x = value_heads_99_cast_fp16, y = var_9778_cast_fp16)[name = string("attn_49_cast_fp16")];
+            tensor<int32, [4]> var_9783 = const()[name = string("op_9783"), val = tensor<int32, [4]>([1, -1, 1, 1])];
+            tensor<fp16, [1, 2048, 1, 1]> input_193_cast_fp16 = reshape(shape = var_9783, x = attn_49_cast_fp16)[name = string("input_193_cast_fp16")];
+            string obj_203_pad_type_0 = const()[name = string("obj_203_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> obj_203_strides_0 = const()[name = string("obj_203_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> obj_203_pad_0 = const()[name = string("obj_203_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> obj_203_dilations_0 = const()[name = string("obj_203_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 obj_203_groups_0 = const()[name = string("obj_203_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 2048, 1, 1]> layers_24_self_attn_o_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(381984128))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(384081344))))[name = string("layers_24_self_attn_o_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> obj_203_cast_fp16 = conv(bias = layers_0_self_attn_v_proj_bias_to_fp16, dilations = obj_203_dilations_0, groups = obj_203_groups_0, pad = obj_203_pad_0, pad_type = obj_203_pad_type_0, strides = obj_203_strides_0, weight = layers_24_self_attn_o_proj_weight_to_fp16_palettized, x = input_193_cast_fp16)[name = string("obj_203_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> inputs_197_cast_fp16 = add(x = inputs_191_cast_fp16, y = obj_203_cast_fp16)[name = string("inputs_197_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> inputs_sq_199_cast_fp16 = mul(x = inputs_197_cast_fp16, y = inputs_197_cast_fp16)[name = string("inputs_sq_199_cast_fp16")];
+            tensor<int32, [1]> variance_199_axes_0 = const()[name = string("variance_199_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_199_keep_dims_0 = const()[name = string("variance_199_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 1, 1, 1]> variance_199_cast_fp16 = reduce_mean(axes = variance_199_axes_0, keep_dims = variance_199_keep_dims_0, x = inputs_sq_199_cast_fp16)[name = string("variance_199_cast_fp16")];
+            fp16 var_9801_to_fp16 = const()[name = string("op_9801_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 1, 1]> var_9802_cast_fp16 = add(x = variance_199_cast_fp16, y = var_9801_to_fp16)[name = string("op_9802_cast_fp16")];
+            fp32 var_9803_epsilon_0 = const()[name = string("op_9803_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 1, 1, 1]> var_9803_cast_fp16 = rsqrt(epsilon = var_9803_epsilon_0, x = var_9802_cast_fp16)[name = string("op_9803_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> hidden_states_247_cast_fp16 = mul(x = inputs_197_cast_fp16, y = var_9803_cast_fp16)[name = string("hidden_states_247_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> w_199_to_fp16 = const()[name = string("w_199_to_fp16"), val = tensor<fp16, [1, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(384081920)))];
+            tensor<fp16, [1, 1024, 1, 1]> input_195_cast_fp16 = mul(x = w_199_to_fp16, y = hidden_states_247_cast_fp16)[name = string("input_195_cast_fp16")];
+            string input_197_pad_type_0 = const()[name = string("input_197_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> input_197_strides_0 = const()[name = string("input_197_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> input_197_pad_0 = const()[name = string("input_197_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> input_197_dilations_0 = const()[name = string("input_197_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 input_197_groups_0 = const()[name = string("input_197_groups_0"), val = int32(1)];
+            tensor<fp16, [3072, 1024, 1, 1]> layers_24_mlp_gate_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [3072, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(384084032))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(387229824))))[name = string("layers_24_mlp_gate_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 3072, 1, 1]> input_197_cast_fp16 = conv(dilations = input_197_dilations_0, groups = input_197_groups_0, pad = input_197_pad_0, pad_type = input_197_pad_type_0, strides = input_197_strides_0, weight = layers_24_mlp_gate_proj_weight_to_fp16_palettized, x = input_195_cast_fp16)[name = string("input_197_cast_fp16")];
+            tensor<fp16, [1, 3072, 1, 1]> var_9817_cast_fp16 = silu(x = input_197_cast_fp16)[name = string("op_9817_cast_fp16")];
+            string var_9823_pad_type_0 = const()[name = string("op_9823_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_9823_strides_0 = const()[name = string("op_9823_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_9823_pad_0 = const()[name = string("op_9823_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_9823_dilations_0 = const()[name = string("op_9823_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_9823_groups_0 = const()[name = string("op_9823_groups_0"), val = int32(1)];
+            tensor<fp16, [3072, 1024, 1, 1]> layers_24_mlp_up_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [3072, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(387230400))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(390376192))))[name = string("layers_24_mlp_up_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 3072, 1, 1]> var_9823_cast_fp16 = conv(dilations = var_9823_dilations_0, groups = var_9823_groups_0, pad = var_9823_pad_0, pad_type = var_9823_pad_type_0, strides = var_9823_strides_0, weight = layers_24_mlp_up_proj_weight_to_fp16_palettized, x = input_195_cast_fp16)[name = string("op_9823_cast_fp16")];
+            tensor<fp16, [1, 3072, 1, 1]> input_199_cast_fp16 = mul(x = var_9817_cast_fp16, y = var_9823_cast_fp16)[name = string("input_199_cast_fp16")];
+            string hidden_states_249_pad_type_0 = const()[name = string("hidden_states_249_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> hidden_states_249_strides_0 = const()[name = string("hidden_states_249_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> hidden_states_249_pad_0 = const()[name = string("hidden_states_249_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> hidden_states_249_dilations_0 = const()[name = string("hidden_states_249_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 hidden_states_249_groups_0 = const()[name = string("hidden_states_249_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 3072, 1, 1]> layers_24_mlp_down_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 3072, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(390376768))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(393522560))))[name = string("layers_24_mlp_down_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> hidden_states_249_cast_fp16 = conv(dilations = hidden_states_249_dilations_0, groups = hidden_states_249_groups_0, pad = hidden_states_249_pad_0, pad_type = hidden_states_249_pad_type_0, strides = hidden_states_249_strides_0, weight = layers_24_mlp_down_proj_weight_to_fp16_palettized, x = input_199_cast_fp16)[name = string("hidden_states_249_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> inputs_199_cast_fp16 = add(x = inputs_197_cast_fp16, y = hidden_states_249_cast_fp16)[name = string("inputs_199_cast_fp16")];
+            int32 var_9837 = const()[name = string("op_9837"), val = int32(3)];
+            int32 var_9847 = const()[name = string("op_9847"), val = int32(-2)];
+            int32 var_9855 = const()[name = string("op_9855"), val = int32(1)];
+            tensor<fp16, [1, 1024, 1, 1]> inputs_sq_201_cast_fp16 = mul(x = inputs_199_cast_fp16, y = inputs_199_cast_fp16)[name = string("inputs_sq_201_cast_fp16")];
+            tensor<int32, [1]> variance_201_axes_0 = const()[name = string("variance_201_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_201_keep_dims_0 = const()[name = string("variance_201_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 1, 1, 1]> variance_201_cast_fp16 = reduce_mean(axes = variance_201_axes_0, keep_dims = variance_201_keep_dims_0, x = inputs_sq_201_cast_fp16)[name = string("variance_201_cast_fp16")];
+            fp16 var_9867_to_fp16 = const()[name = string("op_9867_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 1, 1]> var_9868_cast_fp16 = add(x = variance_201_cast_fp16, y = var_9867_to_fp16)[name = string("op_9868_cast_fp16")];
+            fp32 var_9869_epsilon_0 = const()[name = string("op_9869_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 1, 1, 1]> var_9869_cast_fp16 = rsqrt(epsilon = var_9869_epsilon_0, x = var_9868_cast_fp16)[name = string("op_9869_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> hidden_states_251_cast_fp16 = mul(x = inputs_199_cast_fp16, y = var_9869_cast_fp16)[name = string("hidden_states_251_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> w_201_to_fp16 = const()[name = string("w_201_to_fp16"), val = tensor<fp16, [1, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(393523136)))];
+            tensor<fp16, [1, 1024, 1, 1]> obj_205_cast_fp16 = mul(x = w_201_to_fp16, y = hidden_states_251_cast_fp16)[name = string("obj_205_cast_fp16")];
+            string query_151_pad_type_0 = const()[name = string("query_151_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> query_151_strides_0 = const()[name = string("query_151_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> query_151_pad_0 = const()[name = string("query_151_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> query_151_dilations_0 = const()[name = string("query_151_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 query_151_groups_0 = const()[name = string("query_151_groups_0"), val = int32(1)];
+            tensor<fp16, [2048, 1024, 1, 1]> layers_25_self_attn_q_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [2048, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(393525248))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(395622464))))[name = string("layers_25_self_attn_q_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1, 1]> query_151_cast_fp16 = conv(bias = layers_0_self_attn_q_proj_bias_to_fp16, dilations = query_151_dilations_0, groups = query_151_groups_0, pad = query_151_pad_0, pad_type = query_151_pad_type_0, strides = query_151_strides_0, weight = layers_25_self_attn_q_proj_weight_to_fp16_palettized, x = obj_205_cast_fp16)[name = string("query_151_cast_fp16")];
+            string current_key_101_pad_type_0 = const()[name = string("current_key_101_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> current_key_101_strides_0 = const()[name = string("current_key_101_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> current_key_101_pad_0 = const()[name = string("current_key_101_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> current_key_101_dilations_0 = const()[name = string("current_key_101_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 current_key_101_groups_0 = const()[name = string("current_key_101_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_25_self_attn_k_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(395623040))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(396671680))))[name = string("layers_25_self_attn_k_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> current_key_101_cast_fp16 = conv(dilations = current_key_101_dilations_0, groups = current_key_101_groups_0, pad = current_key_101_pad_0, pad_type = current_key_101_pad_type_0, strides = current_key_101_strides_0, weight = layers_25_self_attn_k_proj_weight_to_fp16_palettized, x = obj_205_cast_fp16)[name = string("current_key_101_cast_fp16")];
+            string current_value_51_pad_type_0 = const()[name = string("current_value_51_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> current_value_51_strides_0 = const()[name = string("current_value_51_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> current_value_51_pad_0 = const()[name = string("current_value_51_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> current_value_51_dilations_0 = const()[name = string("current_value_51_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 current_value_51_groups_0 = const()[name = string("current_value_51_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_25_self_attn_v_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(396672256))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(397720896))))[name = string("layers_25_self_attn_v_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> current_value_51_cast_fp16 = conv(bias = layers_0_self_attn_v_proj_bias_to_fp16, dilations = current_value_51_dilations_0, groups = current_value_51_groups_0, pad = current_value_51_pad_0, pad_type = current_value_51_pad_type_0, strides = current_value_51_strides_0, weight = layers_25_self_attn_v_proj_weight_to_fp16_palettized, x = obj_205_cast_fp16)[name = string("current_value_51_cast_fp16")];
+            tensor<int32, [4]> var_9906 = const()[name = string("op_9906"), val = tensor<int32, [4]>([16, 128, 1, 1])];
+            tensor<fp16, [16, 128, 1, 1]> inputs_201_cast_fp16 = reshape(shape = var_9906, x = query_151_cast_fp16)[name = string("inputs_201_cast_fp16")];
+            tensor<fp16, [16, 128, 1, 1]> inputs_sq_203_cast_fp16 = mul(x = inputs_201_cast_fp16, y = inputs_201_cast_fp16)[name = string("inputs_sq_203_cast_fp16")];
+            tensor<int32, [1]> variance_203_axes_0 = const()[name = string("variance_203_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_203_keep_dims_0 = const()[name = string("variance_203_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [16, 1, 1, 1]> variance_203_cast_fp16 = reduce_mean(axes = variance_203_axes_0, keep_dims = variance_203_keep_dims_0, x = inputs_sq_203_cast_fp16)[name = string("variance_203_cast_fp16")];
+            fp16 var_9912_to_fp16 = const()[name = string("op_9912_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [16, 1, 1, 1]> var_9913_cast_fp16 = add(x = variance_203_cast_fp16, y = var_9912_to_fp16)[name = string("op_9913_cast_fp16")];
+            fp32 var_9914_epsilon_0 = const()[name = string("op_9914_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [16, 1, 1, 1]> var_9914_cast_fp16 = rsqrt(epsilon = var_9914_epsilon_0, x = var_9913_cast_fp16)[name = string("op_9914_cast_fp16")];
+            tensor<fp16, [16, 128, 1, 1]> hidden_states_253_cast_fp16 = mul(x = inputs_201_cast_fp16, y = var_9914_cast_fp16)[name = string("hidden_states_253_cast_fp16")];
+            tensor<fp16, [1, 128, 1, 1]> w_203_to_fp16 = const()[name = string("w_203_to_fp16"), val = tensor<fp16, [1, 128, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(397721472)))];
+            tensor<fp16, [16, 128, 1, 1]> query_normed_51_cast_fp16 = mul(x = w_203_to_fp16, y = hidden_states_253_cast_fp16)[name = string("query_normed_51_cast_fp16")];
+            tensor<int32, [4]> var_9922 = const()[name = string("op_9922"), val = tensor<int32, [4]>([8, 128, 1, 1])];
+            tensor<fp16, [8, 128, 1, 1]> inputs_203_cast_fp16 = reshape(shape = var_9922, x = current_key_101_cast_fp16)[name = string("inputs_203_cast_fp16")];
+            tensor<fp16, [8, 128, 1, 1]> inputs_sq_205_cast_fp16 = mul(x = inputs_203_cast_fp16, y = inputs_203_cast_fp16)[name = string("inputs_sq_205_cast_fp16")];
+            tensor<int32, [1]> variance_205_axes_0 = const()[name = string("variance_205_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_205_keep_dims_0 = const()[name = string("variance_205_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [8, 1, 1, 1]> variance_205_cast_fp16 = reduce_mean(axes = variance_205_axes_0, keep_dims = variance_205_keep_dims_0, x = inputs_sq_205_cast_fp16)[name = string("variance_205_cast_fp16")];
+            fp16 var_9928_to_fp16 = const()[name = string("op_9928_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [8, 1, 1, 1]> var_9929_cast_fp16 = add(x = variance_205_cast_fp16, y = var_9928_to_fp16)[name = string("op_9929_cast_fp16")];
+            fp32 var_9930_epsilon_0 = const()[name = string("op_9930_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [8, 1, 1, 1]> var_9930_cast_fp16 = rsqrt(epsilon = var_9930_epsilon_0, x = var_9929_cast_fp16)[name = string("op_9930_cast_fp16")];
+            tensor<fp16, [8, 128, 1, 1]> hidden_states_255_cast_fp16 = mul(x = inputs_203_cast_fp16, y = var_9930_cast_fp16)[name = string("hidden_states_255_cast_fp16")];
+            tensor<fp16, [1, 128, 1, 1]> w_205_to_fp16 = const()[name = string("w_205_to_fp16"), val = tensor<fp16, [1, 128, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(397721792)))];
+            tensor<fp16, [8, 128, 1, 1]> current_key_normed_51_cast_fp16 = mul(x = w_205_to_fp16, y = hidden_states_255_cast_fp16)[name = string("current_key_normed_51_cast_fp16")];
+            tensor<int32, [4]> var_9948 = const()[name = string("op_9948"), val = tensor<int32, [4]>([1, 16, 128, -1])];
+            tensor<fp16, [1, 16, 128, 1]> mh_q_151_cast_fp16 = reshape(shape = var_9948, x = query_normed_51_cast_fp16)[name = string("mh_q_151_cast_fp16")];
+            tensor<int32, [4]> var_9950 = const()[name = string("op_9950"), val = tensor<int32, [4]>([1, 8, 128, -1])];
+            tensor<fp16, [1, 8, 128, 1]> mh_k_101_cast_fp16 = reshape(shape = var_9950, x = current_key_normed_51_cast_fp16)[name = string("mh_k_101_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> var_9954_cast_fp16 = mul(x = mh_q_151_cast_fp16, y = cos_1_cast_fp16)[name = string("op_9954_cast_fp16")];
+            tensor<int32, [4]> var_9959_begin_0 = const()[name = string("op_9959_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_9959_end_0 = const()[name = string("op_9959_end_0"), val = tensor<int32, [4]>([1, 16, 64, 1])];
+            tensor<bool, [4]> var_9959_end_mask_0 = const()[name = string("op_9959_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 16, 64, 1]> var_9959_cast_fp16 = slice_by_index(begin = var_9959_begin_0, end = var_9959_end_0, end_mask = var_9959_end_mask_0, x = mh_q_151_cast_fp16)[name = string("op_9959_cast_fp16")];
+            tensor<int32, [4]> var_9965_begin_0 = const()[name = string("op_9965_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_9965_end_0 = const()[name = string("op_9965_end_0"), val = tensor<int32, [4]>([1, 16, 128, 1])];
+            tensor<bool, [4]> var_9965_end_mask_0 = const()[name = string("op_9965_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 16, 64, 1]> var_9965_cast_fp16 = slice_by_index(begin = var_9965_begin_0, end = var_9965_end_0, end_mask = var_9965_end_mask_0, x = mh_q_151_cast_fp16)[name = string("op_9965_cast_fp16")];
+            fp16 const_592_promoted_to_fp16 = const()[name = string("const_592_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 16, 64, 1]> var_9967_cast_fp16 = mul(x = var_9965_cast_fp16, y = const_592_promoted_to_fp16)[name = string("op_9967_cast_fp16")];
+            bool var_9969_interleave_0 = const()[name = string("op_9969_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 1]> var_9969_cast_fp16 = concat(axis = var_9847, interleave = var_9969_interleave_0, values = (var_9967_cast_fp16, var_9959_cast_fp16))[name = string("op_9969_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> var_9970_cast_fp16 = mul(x = var_9969_cast_fp16, y = sin_1_cast_fp16)[name = string("op_9970_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> mh_q_153_cast_fp16 = add(x = var_9954_cast_fp16, y = var_9970_cast_fp16)[name = string("mh_q_153_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> var_9972_cast_fp16 = mul(x = mh_k_101_cast_fp16, y = cos_1_cast_fp16)[name = string("op_9972_cast_fp16")];
+            tensor<int32, [4]> var_9977_begin_0 = const()[name = string("op_9977_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_9977_end_0 = const()[name = string("op_9977_end_0"), val = tensor<int32, [4]>([1, 8, 64, 1])];
+            tensor<bool, [4]> var_9977_end_mask_0 = const()[name = string("op_9977_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 8, 64, 1]> var_9977_cast_fp16 = slice_by_index(begin = var_9977_begin_0, end = var_9977_end_0, end_mask = var_9977_end_mask_0, x = mh_k_101_cast_fp16)[name = string("op_9977_cast_fp16")];
+            tensor<int32, [4]> var_9983_begin_0 = const()[name = string("op_9983_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_9983_end_0 = const()[name = string("op_9983_end_0"), val = tensor<int32, [4]>([1, 8, 128, 1])];
+            tensor<bool, [4]> var_9983_end_mask_0 = const()[name = string("op_9983_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 8, 64, 1]> var_9983_cast_fp16 = slice_by_index(begin = var_9983_begin_0, end = var_9983_end_0, end_mask = var_9983_end_mask_0, x = mh_k_101_cast_fp16)[name = string("op_9983_cast_fp16")];
+            fp16 const_595_promoted_to_fp16 = const()[name = string("const_595_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 64, 1]> var_9985_cast_fp16 = mul(x = var_9983_cast_fp16, y = const_595_promoted_to_fp16)[name = string("op_9985_cast_fp16")];
+            bool var_9987_interleave_0 = const()[name = string("op_9987_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 128, 1]> var_9987_cast_fp16 = concat(axis = var_9847, interleave = var_9987_interleave_0, values = (var_9985_cast_fp16, var_9977_cast_fp16))[name = string("op_9987_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> var_9988_cast_fp16 = mul(x = var_9987_cast_fp16, y = sin_1_cast_fp16)[name = string("op_9988_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> mh_k_103_cast_fp16 = add(x = var_9972_cast_fp16, y = var_9988_cast_fp16)[name = string("mh_k_103_cast_fp16")];
+            tensor<int32, [4]> var_9992 = const()[name = string("op_9992"), val = tensor<int32, [4]>([1, 1024, 1, 1])];
+            tensor<fp16, [1, 1024, 1, 1]> current_key_103_cast_fp16 = reshape(shape = var_9992, x = mh_k_103_cast_fp16)[name = string("current_key_103_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_9999_cast_fp16 = mul(x = var_101_cast_fp16_25, y = var_323_cast_fp16)[name = string("op_9999_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_10000_cast_fp16 = mul(x = current_key_103_cast_fp16, y = var_321_cast_fp16)[name = string("op_10000_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> key_153_cast_fp16 = add(x = var_9999_cast_fp16, y = var_10000_cast_fp16)[name = string("key_153_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_10003_cast_fp16 = mul(x = var_132_cast_fp16_25, y = var_323_cast_fp16)[name = string("op_10003_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_10004_cast_fp16 = mul(x = current_value_51_cast_fp16, y = var_321_cast_fp16)[name = string("op_10004_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> value_101_cast_fp16 = add(x = var_10003_cast_fp16, y = var_10004_cast_fp16)[name = string("value_101_cast_fp16")];
+            tensor<int32, [4]> var_10008 = const()[name = string("op_10008"), val = tensor<int32, [4]>([1, 8, 128, 256])];
+            tensor<fp16, [1, 8, 128, 256]> key_heads_101_cast_fp16 = reshape(shape = var_10008, x = key_153_cast_fp16)[name = string("key_heads_101_cast_fp16")];
+            tensor<int32, [4]> var_10010 = const()[name = string("op_10010"), val = tensor<int32, [4]>([1, 8, 128, 256])];
+            tensor<fp16, [1, 8, 128, 256]> value_heads_101_cast_fp16 = reshape(shape = var_10010, x = value_101_cast_fp16)[name = string("value_heads_101_cast_fp16")];
+            tensor<int32, [4]> var_10013_begin_0 = const()[name = string("op_10013_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_10013_end_0 = const()[name = string("op_10013_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_10013_end_mask_0 = const()[name = string("op_10013_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_10013_cast_fp16 = slice_by_index(begin = var_10013_begin_0, end = var_10013_end_0, end_mask = var_10013_end_mask_0, x = key_heads_101_cast_fp16)[name = string("op_10013_cast_fp16")];
+            tensor<int32, [4]> var_10017_begin_0 = const()[name = string("op_10017_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_10017_end_0 = const()[name = string("op_10017_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_10017_end_mask_0 = const()[name = string("op_10017_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_10017_cast_fp16 = slice_by_index(begin = var_10017_begin_0, end = var_10017_end_0, end_mask = var_10017_end_mask_0, x = value_heads_101_cast_fp16)[name = string("op_10017_cast_fp16")];
+            tensor<int32, [4]> var_10029_begin_0 = const()[name = string("op_10029_begin_0"), val = tensor<int32, [4]>([0, 1, 0, 0])];
+            tensor<int32, [4]> var_10029_end_0 = const()[name = string("op_10029_end_0"), val = tensor<int32, [4]>([1, 2, 128, 256])];
+            tensor<bool, [4]> var_10029_end_mask_0 = const()[name = string("op_10029_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_10029_cast_fp16 = slice_by_index(begin = var_10029_begin_0, end = var_10029_end_0, end_mask = var_10029_end_mask_0, x = key_heads_101_cast_fp16)[name = string("op_10029_cast_fp16")];
+            tensor<int32, [4]> var_10033_begin_0 = const()[name = string("op_10033_begin_0"), val = tensor<int32, [4]>([0, 1, 0, 0])];
+            tensor<int32, [4]> var_10033_end_0 = const()[name = string("op_10033_end_0"), val = tensor<int32, [4]>([1, 2, 128, 256])];
+            tensor<bool, [4]> var_10033_end_mask_0 = const()[name = string("op_10033_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_10033_cast_fp16 = slice_by_index(begin = var_10033_begin_0, end = var_10033_end_0, end_mask = var_10033_end_mask_0, x = value_heads_101_cast_fp16)[name = string("op_10033_cast_fp16")];
+            tensor<int32, [4]> var_10045_begin_0 = const()[name = string("op_10045_begin_0"), val = tensor<int32, [4]>([0, 2, 0, 0])];
+            tensor<int32, [4]> var_10045_end_0 = const()[name = string("op_10045_end_0"), val = tensor<int32, [4]>([1, 3, 128, 256])];
+            tensor<bool, [4]> var_10045_end_mask_0 = const()[name = string("op_10045_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_10045_cast_fp16 = slice_by_index(begin = var_10045_begin_0, end = var_10045_end_0, end_mask = var_10045_end_mask_0, x = key_heads_101_cast_fp16)[name = string("op_10045_cast_fp16")];
+            tensor<int32, [4]> var_10049_begin_0 = const()[name = string("op_10049_begin_0"), val = tensor<int32, [4]>([0, 2, 0, 0])];
+            tensor<int32, [4]> var_10049_end_0 = const()[name = string("op_10049_end_0"), val = tensor<int32, [4]>([1, 3, 128, 256])];
+            tensor<bool, [4]> var_10049_end_mask_0 = const()[name = string("op_10049_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_10049_cast_fp16 = slice_by_index(begin = var_10049_begin_0, end = var_10049_end_0, end_mask = var_10049_end_mask_0, x = value_heads_101_cast_fp16)[name = string("op_10049_cast_fp16")];
+            tensor<int32, [4]> var_10061_begin_0 = const()[name = string("op_10061_begin_0"), val = tensor<int32, [4]>([0, 3, 0, 0])];
+            tensor<int32, [4]> var_10061_end_0 = const()[name = string("op_10061_end_0"), val = tensor<int32, [4]>([1, 4, 128, 256])];
+            tensor<bool, [4]> var_10061_end_mask_0 = const()[name = string("op_10061_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_10061_cast_fp16 = slice_by_index(begin = var_10061_begin_0, end = var_10061_end_0, end_mask = var_10061_end_mask_0, x = key_heads_101_cast_fp16)[name = string("op_10061_cast_fp16")];
+            tensor<int32, [4]> var_10065_begin_0 = const()[name = string("op_10065_begin_0"), val = tensor<int32, [4]>([0, 3, 0, 0])];
+            tensor<int32, [4]> var_10065_end_0 = const()[name = string("op_10065_end_0"), val = tensor<int32, [4]>([1, 4, 128, 256])];
+            tensor<bool, [4]> var_10065_end_mask_0 = const()[name = string("op_10065_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_10065_cast_fp16 = slice_by_index(begin = var_10065_begin_0, end = var_10065_end_0, end_mask = var_10065_end_mask_0, x = value_heads_101_cast_fp16)[name = string("op_10065_cast_fp16")];
+            tensor<int32, [4]> var_10077_begin_0 = const()[name = string("op_10077_begin_0"), val = tensor<int32, [4]>([0, 4, 0, 0])];
+            tensor<int32, [4]> var_10077_end_0 = const()[name = string("op_10077_end_0"), val = tensor<int32, [4]>([1, 5, 128, 256])];
+            tensor<bool, [4]> var_10077_end_mask_0 = const()[name = string("op_10077_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_10077_cast_fp16 = slice_by_index(begin = var_10077_begin_0, end = var_10077_end_0, end_mask = var_10077_end_mask_0, x = key_heads_101_cast_fp16)[name = string("op_10077_cast_fp16")];
+            tensor<int32, [4]> var_10081_begin_0 = const()[name = string("op_10081_begin_0"), val = tensor<int32, [4]>([0, 4, 0, 0])];
+            tensor<int32, [4]> var_10081_end_0 = const()[name = string("op_10081_end_0"), val = tensor<int32, [4]>([1, 5, 128, 256])];
+            tensor<bool, [4]> var_10081_end_mask_0 = const()[name = string("op_10081_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_10081_cast_fp16 = slice_by_index(begin = var_10081_begin_0, end = var_10081_end_0, end_mask = var_10081_end_mask_0, x = value_heads_101_cast_fp16)[name = string("op_10081_cast_fp16")];
+            tensor<int32, [4]> var_10093_begin_0 = const()[name = string("op_10093_begin_0"), val = tensor<int32, [4]>([0, 5, 0, 0])];
+            tensor<int32, [4]> var_10093_end_0 = const()[name = string("op_10093_end_0"), val = tensor<int32, [4]>([1, 6, 128, 256])];
+            tensor<bool, [4]> var_10093_end_mask_0 = const()[name = string("op_10093_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_10093_cast_fp16 = slice_by_index(begin = var_10093_begin_0, end = var_10093_end_0, end_mask = var_10093_end_mask_0, x = key_heads_101_cast_fp16)[name = string("op_10093_cast_fp16")];
+            tensor<int32, [4]> var_10097_begin_0 = const()[name = string("op_10097_begin_0"), val = tensor<int32, [4]>([0, 5, 0, 0])];
+            tensor<int32, [4]> var_10097_end_0 = const()[name = string("op_10097_end_0"), val = tensor<int32, [4]>([1, 6, 128, 256])];
+            tensor<bool, [4]> var_10097_end_mask_0 = const()[name = string("op_10097_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_10097_cast_fp16 = slice_by_index(begin = var_10097_begin_0, end = var_10097_end_0, end_mask = var_10097_end_mask_0, x = value_heads_101_cast_fp16)[name = string("op_10097_cast_fp16")];
+            tensor<int32, [4]> var_10109_begin_0 = const()[name = string("op_10109_begin_0"), val = tensor<int32, [4]>([0, 6, 0, 0])];
+            tensor<int32, [4]> var_10109_end_0 = const()[name = string("op_10109_end_0"), val = tensor<int32, [4]>([1, 7, 128, 256])];
+            tensor<bool, [4]> var_10109_end_mask_0 = const()[name = string("op_10109_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_10109_cast_fp16 = slice_by_index(begin = var_10109_begin_0, end = var_10109_end_0, end_mask = var_10109_end_mask_0, x = key_heads_101_cast_fp16)[name = string("op_10109_cast_fp16")];
+            tensor<int32, [4]> var_10113_begin_0 = const()[name = string("op_10113_begin_0"), val = tensor<int32, [4]>([0, 6, 0, 0])];
+            tensor<int32, [4]> var_10113_end_0 = const()[name = string("op_10113_end_0"), val = tensor<int32, [4]>([1, 7, 128, 256])];
+            tensor<bool, [4]> var_10113_end_mask_0 = const()[name = string("op_10113_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_10113_cast_fp16 = slice_by_index(begin = var_10113_begin_0, end = var_10113_end_0, end_mask = var_10113_end_mask_0, x = value_heads_101_cast_fp16)[name = string("op_10113_cast_fp16")];
+            tensor<int32, [4]> var_10125_begin_0 = const()[name = string("op_10125_begin_0"), val = tensor<int32, [4]>([0, 7, 0, 0])];
+            tensor<int32, [4]> var_10125_end_0 = const()[name = string("op_10125_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_10125_end_mask_0 = const()[name = string("op_10125_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_10125_cast_fp16 = slice_by_index(begin = var_10125_begin_0, end = var_10125_end_0, end_mask = var_10125_end_mask_0, x = key_heads_101_cast_fp16)[name = string("op_10125_cast_fp16")];
+            tensor<int32, [4]> var_10129_begin_0 = const()[name = string("op_10129_begin_0"), val = tensor<int32, [4]>([0, 7, 0, 0])];
+            tensor<int32, [4]> var_10129_end_0 = const()[name = string("op_10129_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_10129_end_mask_0 = const()[name = string("op_10129_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_10129_cast_fp16 = slice_by_index(begin = var_10129_begin_0, end = var_10129_end_0, end_mask = var_10129_end_mask_0, x = value_heads_101_cast_fp16)[name = string("op_10129_cast_fp16")];
+            bool key_heads_103_interleave_0 = const()[name = string("key_heads_103_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 256]> key_heads_103_cast_fp16 = concat(axis = var_9855, interleave = key_heads_103_interleave_0, values = (var_10013_cast_fp16, var_10013_cast_fp16, var_10029_cast_fp16, var_10029_cast_fp16, var_10045_cast_fp16, var_10045_cast_fp16, var_10061_cast_fp16, var_10061_cast_fp16, var_10077_cast_fp16, var_10077_cast_fp16, var_10093_cast_fp16, var_10093_cast_fp16, var_10109_cast_fp16, var_10109_cast_fp16, var_10125_cast_fp16, var_10125_cast_fp16))[name = string("key_heads_103_cast_fp16")];
+            bool value_heads_103_interleave_0 = const()[name = string("value_heads_103_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 256]> value_heads_103_cast_fp16 = concat(axis = var_9855, interleave = value_heads_103_interleave_0, values = (var_10017_cast_fp16, var_10017_cast_fp16, var_10033_cast_fp16, var_10033_cast_fp16, var_10049_cast_fp16, var_10049_cast_fp16, var_10065_cast_fp16, var_10065_cast_fp16, var_10081_cast_fp16, var_10081_cast_fp16, var_10097_cast_fp16, var_10097_cast_fp16, var_10113_cast_fp16, var_10113_cast_fp16, var_10129_cast_fp16, var_10129_cast_fp16))[name = string("value_heads_103_cast_fp16")];
+            fp16 var_10152_to_fp16 = const()[name = string("op_10152_to_fp16"), val = fp16(0x1.6ap-4)];
+            tensor<fp16, [1, 16, 128, 1]> var_10153_cast_fp16 = mul(x = mh_q_153_cast_fp16, y = var_10152_to_fp16)[name = string("op_10153_cast_fp16")];
+            bool mh_w_101_transpose_x_0 = const()[name = string("mh_w_101_transpose_x_0"), val = bool(true)];
+            bool mh_w_101_transpose_y_0 = const()[name = string("mh_w_101_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 1, 256]> mh_w_101_cast_fp16 = matmul(transpose_x = mh_w_101_transpose_x_0, transpose_y = mh_w_101_transpose_y_0, x = var_10153_cast_fp16, y = key_heads_103_cast_fp16)[name = string("mh_w_101_cast_fp16")];
+            tensor<fp16, [1, 16, 1, 256]> mh_w_103_cast_fp16 = add(x = mh_w_101_cast_fp16, y = var_487_cast_fp16)[name = string("mh_w_103_cast_fp16")];
+            tensor<fp16, [1, 16, 1, 256]> var_10165_cast_fp16 = softmax(axis = var_9837, x = mh_w_103_cast_fp16)[name = string("op_10165_cast_fp16")];
+            bool attn_51_transpose_x_0 = const()[name = string("attn_51_transpose_x_0"), val = bool(false)];
+            bool attn_51_transpose_y_0 = const()[name = string("attn_51_transpose_y_0"), val = bool(true)];
+            tensor<fp16, [1, 16, 128, 1]> attn_51_cast_fp16 = matmul(transpose_x = attn_51_transpose_x_0, transpose_y = attn_51_transpose_y_0, x = value_heads_103_cast_fp16, y = var_10165_cast_fp16)[name = string("attn_51_cast_fp16")];
+            tensor<int32, [4]> var_10170 = const()[name = string("op_10170"), val = tensor<int32, [4]>([1, -1, 1, 1])];
+            tensor<fp16, [1, 2048, 1, 1]> input_201_cast_fp16 = reshape(shape = var_10170, x = attn_51_cast_fp16)[name = string("input_201_cast_fp16")];
+            string obj_211_pad_type_0 = const()[name = string("obj_211_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> obj_211_strides_0 = const()[name = string("obj_211_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> obj_211_pad_0 = const()[name = string("obj_211_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> obj_211_dilations_0 = const()[name = string("obj_211_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 obj_211_groups_0 = const()[name = string("obj_211_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 2048, 1, 1]> layers_25_self_attn_o_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(397722112))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(399819328))))[name = string("layers_25_self_attn_o_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> obj_211_cast_fp16 = conv(bias = layers_0_self_attn_v_proj_bias_to_fp16, dilations = obj_211_dilations_0, groups = obj_211_groups_0, pad = obj_211_pad_0, pad_type = obj_211_pad_type_0, strides = obj_211_strides_0, weight = layers_25_self_attn_o_proj_weight_to_fp16_palettized, x = input_201_cast_fp16)[name = string("obj_211_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> inputs_205_cast_fp16 = add(x = inputs_199_cast_fp16, y = obj_211_cast_fp16)[name = string("inputs_205_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> inputs_sq_207_cast_fp16 = mul(x = inputs_205_cast_fp16, y = inputs_205_cast_fp16)[name = string("inputs_sq_207_cast_fp16")];
+            tensor<int32, [1]> variance_207_axes_0 = const()[name = string("variance_207_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_207_keep_dims_0 = const()[name = string("variance_207_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 1, 1, 1]> variance_207_cast_fp16 = reduce_mean(axes = variance_207_axes_0, keep_dims = variance_207_keep_dims_0, x = inputs_sq_207_cast_fp16)[name = string("variance_207_cast_fp16")];
+            fp16 var_10188_to_fp16 = const()[name = string("op_10188_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 1, 1]> var_10189_cast_fp16 = add(x = variance_207_cast_fp16, y = var_10188_to_fp16)[name = string("op_10189_cast_fp16")];
+            fp32 var_10190_epsilon_0 = const()[name = string("op_10190_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 1, 1, 1]> var_10190_cast_fp16 = rsqrt(epsilon = var_10190_epsilon_0, x = var_10189_cast_fp16)[name = string("op_10190_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> hidden_states_257_cast_fp16 = mul(x = inputs_205_cast_fp16, y = var_10190_cast_fp16)[name = string("hidden_states_257_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> w_207_to_fp16 = const()[name = string("w_207_to_fp16"), val = tensor<fp16, [1, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(399819904)))];
+            tensor<fp16, [1, 1024, 1, 1]> input_203_cast_fp16 = mul(x = w_207_to_fp16, y = hidden_states_257_cast_fp16)[name = string("input_203_cast_fp16")];
+            string input_205_pad_type_0 = const()[name = string("input_205_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> input_205_strides_0 = const()[name = string("input_205_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> input_205_pad_0 = const()[name = string("input_205_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> input_205_dilations_0 = const()[name = string("input_205_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 input_205_groups_0 = const()[name = string("input_205_groups_0"), val = int32(1)];
+            tensor<fp16, [3072, 1024, 1, 1]> layers_25_mlp_gate_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [3072, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(399822016))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(402967808))))[name = string("layers_25_mlp_gate_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 3072, 1, 1]> input_205_cast_fp16 = conv(dilations = input_205_dilations_0, groups = input_205_groups_0, pad = input_205_pad_0, pad_type = input_205_pad_type_0, strides = input_205_strides_0, weight = layers_25_mlp_gate_proj_weight_to_fp16_palettized, x = input_203_cast_fp16)[name = string("input_205_cast_fp16")];
+            tensor<fp16, [1, 3072, 1, 1]> var_10204_cast_fp16 = silu(x = input_205_cast_fp16)[name = string("op_10204_cast_fp16")];
+            string var_10210_pad_type_0 = const()[name = string("op_10210_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_10210_strides_0 = const()[name = string("op_10210_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_10210_pad_0 = const()[name = string("op_10210_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_10210_dilations_0 = const()[name = string("op_10210_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_10210_groups_0 = const()[name = string("op_10210_groups_0"), val = int32(1)];
+            tensor<fp16, [3072, 1024, 1, 1]> layers_25_mlp_up_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [3072, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(402968384))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(406114176))))[name = string("layers_25_mlp_up_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 3072, 1, 1]> var_10210_cast_fp16 = conv(dilations = var_10210_dilations_0, groups = var_10210_groups_0, pad = var_10210_pad_0, pad_type = var_10210_pad_type_0, strides = var_10210_strides_0, weight = layers_25_mlp_up_proj_weight_to_fp16_palettized, x = input_203_cast_fp16)[name = string("op_10210_cast_fp16")];
+            tensor<fp16, [1, 3072, 1, 1]> input_207_cast_fp16 = mul(x = var_10204_cast_fp16, y = var_10210_cast_fp16)[name = string("input_207_cast_fp16")];
+            string hidden_states_259_pad_type_0 = const()[name = string("hidden_states_259_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> hidden_states_259_strides_0 = const()[name = string("hidden_states_259_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> hidden_states_259_pad_0 = const()[name = string("hidden_states_259_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> hidden_states_259_dilations_0 = const()[name = string("hidden_states_259_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 hidden_states_259_groups_0 = const()[name = string("hidden_states_259_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 3072, 1, 1]> layers_25_mlp_down_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 3072, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(406114752))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(409260544))))[name = string("layers_25_mlp_down_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> hidden_states_259_cast_fp16 = conv(dilations = hidden_states_259_dilations_0, groups = hidden_states_259_groups_0, pad = hidden_states_259_pad_0, pad_type = hidden_states_259_pad_type_0, strides = hidden_states_259_strides_0, weight = layers_25_mlp_down_proj_weight_to_fp16_palettized, x = input_207_cast_fp16)[name = string("hidden_states_259_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> inputs_207_cast_fp16 = add(x = inputs_205_cast_fp16, y = hidden_states_259_cast_fp16)[name = string("inputs_207_cast_fp16")];
+            int32 var_10224 = const()[name = string("op_10224"), val = int32(3)];
+            int32 var_10234 = const()[name = string("op_10234"), val = int32(-2)];
+            int32 var_10242 = const()[name = string("op_10242"), val = int32(1)];
+            tensor<fp16, [1, 1024, 1, 1]> inputs_sq_209_cast_fp16 = mul(x = inputs_207_cast_fp16, y = inputs_207_cast_fp16)[name = string("inputs_sq_209_cast_fp16")];
+            tensor<int32, [1]> variance_209_axes_0 = const()[name = string("variance_209_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_209_keep_dims_0 = const()[name = string("variance_209_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 1, 1, 1]> variance_209_cast_fp16 = reduce_mean(axes = variance_209_axes_0, keep_dims = variance_209_keep_dims_0, x = inputs_sq_209_cast_fp16)[name = string("variance_209_cast_fp16")];
+            fp16 var_10254_to_fp16 = const()[name = string("op_10254_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 1, 1]> var_10255_cast_fp16 = add(x = variance_209_cast_fp16, y = var_10254_to_fp16)[name = string("op_10255_cast_fp16")];
+            fp32 var_10256_epsilon_0 = const()[name = string("op_10256_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 1, 1, 1]> var_10256_cast_fp16 = rsqrt(epsilon = var_10256_epsilon_0, x = var_10255_cast_fp16)[name = string("op_10256_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> hidden_states_261_cast_fp16 = mul(x = inputs_207_cast_fp16, y = var_10256_cast_fp16)[name = string("hidden_states_261_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> w_209_to_fp16 = const()[name = string("w_209_to_fp16"), val = tensor<fp16, [1, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(409261120)))];
+            tensor<fp16, [1, 1024, 1, 1]> obj_213_cast_fp16 = mul(x = w_209_to_fp16, y = hidden_states_261_cast_fp16)[name = string("obj_213_cast_fp16")];
+            string query_157_pad_type_0 = const()[name = string("query_157_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> query_157_strides_0 = const()[name = string("query_157_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> query_157_pad_0 = const()[name = string("query_157_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> query_157_dilations_0 = const()[name = string("query_157_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 query_157_groups_0 = const()[name = string("query_157_groups_0"), val = int32(1)];
+            tensor<fp16, [2048, 1024, 1, 1]> layers_26_self_attn_q_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [2048, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(409263232))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(411360448))))[name = string("layers_26_self_attn_q_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1, 1]> query_157_cast_fp16 = conv(bias = layers_0_self_attn_q_proj_bias_to_fp16, dilations = query_157_dilations_0, groups = query_157_groups_0, pad = query_157_pad_0, pad_type = query_157_pad_type_0, strides = query_157_strides_0, weight = layers_26_self_attn_q_proj_weight_to_fp16_palettized, x = obj_213_cast_fp16)[name = string("query_157_cast_fp16")];
+            string current_key_105_pad_type_0 = const()[name = string("current_key_105_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> current_key_105_strides_0 = const()[name = string("current_key_105_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> current_key_105_pad_0 = const()[name = string("current_key_105_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> current_key_105_dilations_0 = const()[name = string("current_key_105_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 current_key_105_groups_0 = const()[name = string("current_key_105_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_26_self_attn_k_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(411361024))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(412409664))))[name = string("layers_26_self_attn_k_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> current_key_105_cast_fp16 = conv(dilations = current_key_105_dilations_0, groups = current_key_105_groups_0, pad = current_key_105_pad_0, pad_type = current_key_105_pad_type_0, strides = current_key_105_strides_0, weight = layers_26_self_attn_k_proj_weight_to_fp16_palettized, x = obj_213_cast_fp16)[name = string("current_key_105_cast_fp16")];
+            string current_value_53_pad_type_0 = const()[name = string("current_value_53_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> current_value_53_strides_0 = const()[name = string("current_value_53_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> current_value_53_pad_0 = const()[name = string("current_value_53_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> current_value_53_dilations_0 = const()[name = string("current_value_53_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 current_value_53_groups_0 = const()[name = string("current_value_53_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_26_self_attn_v_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(412410240))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(413458880))))[name = string("layers_26_self_attn_v_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> current_value_53_cast_fp16 = conv(bias = layers_0_self_attn_v_proj_bias_to_fp16, dilations = current_value_53_dilations_0, groups = current_value_53_groups_0, pad = current_value_53_pad_0, pad_type = current_value_53_pad_type_0, strides = current_value_53_strides_0, weight = layers_26_self_attn_v_proj_weight_to_fp16_palettized, x = obj_213_cast_fp16)[name = string("current_value_53_cast_fp16")];
+            tensor<int32, [4]> var_10293 = const()[name = string("op_10293"), val = tensor<int32, [4]>([16, 128, 1, 1])];
+            tensor<fp16, [16, 128, 1, 1]> inputs_209_cast_fp16 = reshape(shape = var_10293, x = query_157_cast_fp16)[name = string("inputs_209_cast_fp16")];
+            tensor<fp16, [16, 128, 1, 1]> inputs_sq_211_cast_fp16 = mul(x = inputs_209_cast_fp16, y = inputs_209_cast_fp16)[name = string("inputs_sq_211_cast_fp16")];
+            tensor<int32, [1]> variance_211_axes_0 = const()[name = string("variance_211_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_211_keep_dims_0 = const()[name = string("variance_211_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [16, 1, 1, 1]> variance_211_cast_fp16 = reduce_mean(axes = variance_211_axes_0, keep_dims = variance_211_keep_dims_0, x = inputs_sq_211_cast_fp16)[name = string("variance_211_cast_fp16")];
+            fp16 var_10299_to_fp16 = const()[name = string("op_10299_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [16, 1, 1, 1]> var_10300_cast_fp16 = add(x = variance_211_cast_fp16, y = var_10299_to_fp16)[name = string("op_10300_cast_fp16")];
+            fp32 var_10301_epsilon_0 = const()[name = string("op_10301_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [16, 1, 1, 1]> var_10301_cast_fp16 = rsqrt(epsilon = var_10301_epsilon_0, x = var_10300_cast_fp16)[name = string("op_10301_cast_fp16")];
+            tensor<fp16, [16, 128, 1, 1]> hidden_states_263_cast_fp16 = mul(x = inputs_209_cast_fp16, y = var_10301_cast_fp16)[name = string("hidden_states_263_cast_fp16")];
+            tensor<fp16, [1, 128, 1, 1]> w_211_to_fp16 = const()[name = string("w_211_to_fp16"), val = tensor<fp16, [1, 128, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(413459456)))];
+            tensor<fp16, [16, 128, 1, 1]> query_normed_53_cast_fp16 = mul(x = w_211_to_fp16, y = hidden_states_263_cast_fp16)[name = string("query_normed_53_cast_fp16")];
+            tensor<int32, [4]> var_10309 = const()[name = string("op_10309"), val = tensor<int32, [4]>([8, 128, 1, 1])];
+            tensor<fp16, [8, 128, 1, 1]> inputs_211_cast_fp16 = reshape(shape = var_10309, x = current_key_105_cast_fp16)[name = string("inputs_211_cast_fp16")];
+            tensor<fp16, [8, 128, 1, 1]> inputs_sq_213_cast_fp16 = mul(x = inputs_211_cast_fp16, y = inputs_211_cast_fp16)[name = string("inputs_sq_213_cast_fp16")];
+            tensor<int32, [1]> variance_213_axes_0 = const()[name = string("variance_213_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_213_keep_dims_0 = const()[name = string("variance_213_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [8, 1, 1, 1]> variance_213_cast_fp16 = reduce_mean(axes = variance_213_axes_0, keep_dims = variance_213_keep_dims_0, x = inputs_sq_213_cast_fp16)[name = string("variance_213_cast_fp16")];
+            fp16 var_10315_to_fp16 = const()[name = string("op_10315_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [8, 1, 1, 1]> var_10316_cast_fp16 = add(x = variance_213_cast_fp16, y = var_10315_to_fp16)[name = string("op_10316_cast_fp16")];
+            fp32 var_10317_epsilon_0 = const()[name = string("op_10317_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [8, 1, 1, 1]> var_10317_cast_fp16 = rsqrt(epsilon = var_10317_epsilon_0, x = var_10316_cast_fp16)[name = string("op_10317_cast_fp16")];
+            tensor<fp16, [8, 128, 1, 1]> hidden_states_265_cast_fp16 = mul(x = inputs_211_cast_fp16, y = var_10317_cast_fp16)[name = string("hidden_states_265_cast_fp16")];
+            tensor<fp16, [1, 128, 1, 1]> w_213_to_fp16 = const()[name = string("w_213_to_fp16"), val = tensor<fp16, [1, 128, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(413459776)))];
+            tensor<fp16, [8, 128, 1, 1]> current_key_normed_53_cast_fp16 = mul(x = w_213_to_fp16, y = hidden_states_265_cast_fp16)[name = string("current_key_normed_53_cast_fp16")];
+            tensor<int32, [4]> var_10335 = const()[name = string("op_10335"), val = tensor<int32, [4]>([1, 16, 128, -1])];
+            tensor<fp16, [1, 16, 128, 1]> mh_q_157_cast_fp16 = reshape(shape = var_10335, x = query_normed_53_cast_fp16)[name = string("mh_q_157_cast_fp16")];
+            tensor<int32, [4]> var_10337 = const()[name = string("op_10337"), val = tensor<int32, [4]>([1, 8, 128, -1])];
+            tensor<fp16, [1, 8, 128, 1]> mh_k_105_cast_fp16 = reshape(shape = var_10337, x = current_key_normed_53_cast_fp16)[name = string("mh_k_105_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> var_10341_cast_fp16 = mul(x = mh_q_157_cast_fp16, y = cos_1_cast_fp16)[name = string("op_10341_cast_fp16")];
+            tensor<int32, [4]> var_10346_begin_0 = const()[name = string("op_10346_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_10346_end_0 = const()[name = string("op_10346_end_0"), val = tensor<int32, [4]>([1, 16, 64, 1])];
+            tensor<bool, [4]> var_10346_end_mask_0 = const()[name = string("op_10346_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 16, 64, 1]> var_10346_cast_fp16 = slice_by_index(begin = var_10346_begin_0, end = var_10346_end_0, end_mask = var_10346_end_mask_0, x = mh_q_157_cast_fp16)[name = string("op_10346_cast_fp16")];
+            tensor<int32, [4]> var_10352_begin_0 = const()[name = string("op_10352_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_10352_end_0 = const()[name = string("op_10352_end_0"), val = tensor<int32, [4]>([1, 16, 128, 1])];
+            tensor<bool, [4]> var_10352_end_mask_0 = const()[name = string("op_10352_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 16, 64, 1]> var_10352_cast_fp16 = slice_by_index(begin = var_10352_begin_0, end = var_10352_end_0, end_mask = var_10352_end_mask_0, x = mh_q_157_cast_fp16)[name = string("op_10352_cast_fp16")];
+            fp16 const_615_promoted_to_fp16 = const()[name = string("const_615_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 16, 64, 1]> var_10354_cast_fp16 = mul(x = var_10352_cast_fp16, y = const_615_promoted_to_fp16)[name = string("op_10354_cast_fp16")];
+            bool var_10356_interleave_0 = const()[name = string("op_10356_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 1]> var_10356_cast_fp16 = concat(axis = var_10234, interleave = var_10356_interleave_0, values = (var_10354_cast_fp16, var_10346_cast_fp16))[name = string("op_10356_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> var_10357_cast_fp16 = mul(x = var_10356_cast_fp16, y = sin_1_cast_fp16)[name = string("op_10357_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> mh_q_159_cast_fp16 = add(x = var_10341_cast_fp16, y = var_10357_cast_fp16)[name = string("mh_q_159_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> var_10359_cast_fp16 = mul(x = mh_k_105_cast_fp16, y = cos_1_cast_fp16)[name = string("op_10359_cast_fp16")];
+            tensor<int32, [4]> var_10364_begin_0 = const()[name = string("op_10364_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_10364_end_0 = const()[name = string("op_10364_end_0"), val = tensor<int32, [4]>([1, 8, 64, 1])];
+            tensor<bool, [4]> var_10364_end_mask_0 = const()[name = string("op_10364_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 8, 64, 1]> var_10364_cast_fp16 = slice_by_index(begin = var_10364_begin_0, end = var_10364_end_0, end_mask = var_10364_end_mask_0, x = mh_k_105_cast_fp16)[name = string("op_10364_cast_fp16")];
+            tensor<int32, [4]> var_10370_begin_0 = const()[name = string("op_10370_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_10370_end_0 = const()[name = string("op_10370_end_0"), val = tensor<int32, [4]>([1, 8, 128, 1])];
+            tensor<bool, [4]> var_10370_end_mask_0 = const()[name = string("op_10370_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 8, 64, 1]> var_10370_cast_fp16 = slice_by_index(begin = var_10370_begin_0, end = var_10370_end_0, end_mask = var_10370_end_mask_0, x = mh_k_105_cast_fp16)[name = string("op_10370_cast_fp16")];
+            fp16 const_618_promoted_to_fp16 = const()[name = string("const_618_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 64, 1]> var_10372_cast_fp16 = mul(x = var_10370_cast_fp16, y = const_618_promoted_to_fp16)[name = string("op_10372_cast_fp16")];
+            bool var_10374_interleave_0 = const()[name = string("op_10374_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 128, 1]> var_10374_cast_fp16 = concat(axis = var_10234, interleave = var_10374_interleave_0, values = (var_10372_cast_fp16, var_10364_cast_fp16))[name = string("op_10374_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> var_10375_cast_fp16 = mul(x = var_10374_cast_fp16, y = sin_1_cast_fp16)[name = string("op_10375_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> mh_k_107_cast_fp16 = add(x = var_10359_cast_fp16, y = var_10375_cast_fp16)[name = string("mh_k_107_cast_fp16")];
+            tensor<int32, [4]> var_10379 = const()[name = string("op_10379"), val = tensor<int32, [4]>([1, 1024, 1, 1])];
+            tensor<fp16, [1, 1024, 1, 1]> current_key_107_cast_fp16 = reshape(shape = var_10379, x = mh_k_107_cast_fp16)[name = string("current_key_107_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_10386_cast_fp16 = mul(x = var_101_cast_fp16_26, y = var_323_cast_fp16)[name = string("op_10386_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_10387_cast_fp16 = mul(x = current_key_107_cast_fp16, y = var_321_cast_fp16)[name = string("op_10387_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> key_159_cast_fp16 = add(x = var_10386_cast_fp16, y = var_10387_cast_fp16)[name = string("key_159_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_10390_cast_fp16 = mul(x = var_132_cast_fp16_26, y = var_323_cast_fp16)[name = string("op_10390_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_10391_cast_fp16 = mul(x = current_value_53_cast_fp16, y = var_321_cast_fp16)[name = string("op_10391_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> value_105_cast_fp16 = add(x = var_10390_cast_fp16, y = var_10391_cast_fp16)[name = string("value_105_cast_fp16")];
+            tensor<int32, [4]> var_10395 = const()[name = string("op_10395"), val = tensor<int32, [4]>([1, 8, 128, 256])];
+            tensor<fp16, [1, 8, 128, 256]> key_heads_105_cast_fp16 = reshape(shape = var_10395, x = key_159_cast_fp16)[name = string("key_heads_105_cast_fp16")];
+            tensor<int32, [4]> var_10397 = const()[name = string("op_10397"), val = tensor<int32, [4]>([1, 8, 128, 256])];
+            tensor<fp16, [1, 8, 128, 256]> value_heads_105_cast_fp16 = reshape(shape = var_10397, x = value_105_cast_fp16)[name = string("value_heads_105_cast_fp16")];
+            tensor<int32, [4]> var_10400_begin_0 = const()[name = string("op_10400_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_10400_end_0 = const()[name = string("op_10400_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_10400_end_mask_0 = const()[name = string("op_10400_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_10400_cast_fp16 = slice_by_index(begin = var_10400_begin_0, end = var_10400_end_0, end_mask = var_10400_end_mask_0, x = key_heads_105_cast_fp16)[name = string("op_10400_cast_fp16")];
+            tensor<int32, [4]> var_10404_begin_0 = const()[name = string("op_10404_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_10404_end_0 = const()[name = string("op_10404_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_10404_end_mask_0 = const()[name = string("op_10404_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_10404_cast_fp16 = slice_by_index(begin = var_10404_begin_0, end = var_10404_end_0, end_mask = var_10404_end_mask_0, x = value_heads_105_cast_fp16)[name = string("op_10404_cast_fp16")];
+            tensor<int32, [4]> var_10416_begin_0 = const()[name = string("op_10416_begin_0"), val = tensor<int32, [4]>([0, 1, 0, 0])];
+            tensor<int32, [4]> var_10416_end_0 = const()[name = string("op_10416_end_0"), val = tensor<int32, [4]>([1, 2, 128, 256])];
+            tensor<bool, [4]> var_10416_end_mask_0 = const()[name = string("op_10416_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_10416_cast_fp16 = slice_by_index(begin = var_10416_begin_0, end = var_10416_end_0, end_mask = var_10416_end_mask_0, x = key_heads_105_cast_fp16)[name = string("op_10416_cast_fp16")];
+            tensor<int32, [4]> var_10420_begin_0 = const()[name = string("op_10420_begin_0"), val = tensor<int32, [4]>([0, 1, 0, 0])];
+            tensor<int32, [4]> var_10420_end_0 = const()[name = string("op_10420_end_0"), val = tensor<int32, [4]>([1, 2, 128, 256])];
+            tensor<bool, [4]> var_10420_end_mask_0 = const()[name = string("op_10420_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_10420_cast_fp16 = slice_by_index(begin = var_10420_begin_0, end = var_10420_end_0, end_mask = var_10420_end_mask_0, x = value_heads_105_cast_fp16)[name = string("op_10420_cast_fp16")];
+            tensor<int32, [4]> var_10432_begin_0 = const()[name = string("op_10432_begin_0"), val = tensor<int32, [4]>([0, 2, 0, 0])];
+            tensor<int32, [4]> var_10432_end_0 = const()[name = string("op_10432_end_0"), val = tensor<int32, [4]>([1, 3, 128, 256])];
+            tensor<bool, [4]> var_10432_end_mask_0 = const()[name = string("op_10432_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_10432_cast_fp16 = slice_by_index(begin = var_10432_begin_0, end = var_10432_end_0, end_mask = var_10432_end_mask_0, x = key_heads_105_cast_fp16)[name = string("op_10432_cast_fp16")];
+            tensor<int32, [4]> var_10436_begin_0 = const()[name = string("op_10436_begin_0"), val = tensor<int32, [4]>([0, 2, 0, 0])];
+            tensor<int32, [4]> var_10436_end_0 = const()[name = string("op_10436_end_0"), val = tensor<int32, [4]>([1, 3, 128, 256])];
+            tensor<bool, [4]> var_10436_end_mask_0 = const()[name = string("op_10436_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_10436_cast_fp16 = slice_by_index(begin = var_10436_begin_0, end = var_10436_end_0, end_mask = var_10436_end_mask_0, x = value_heads_105_cast_fp16)[name = string("op_10436_cast_fp16")];
+            tensor<int32, [4]> var_10448_begin_0 = const()[name = string("op_10448_begin_0"), val = tensor<int32, [4]>([0, 3, 0, 0])];
+            tensor<int32, [4]> var_10448_end_0 = const()[name = string("op_10448_end_0"), val = tensor<int32, [4]>([1, 4, 128, 256])];
+            tensor<bool, [4]> var_10448_end_mask_0 = const()[name = string("op_10448_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_10448_cast_fp16 = slice_by_index(begin = var_10448_begin_0, end = var_10448_end_0, end_mask = var_10448_end_mask_0, x = key_heads_105_cast_fp16)[name = string("op_10448_cast_fp16")];
+            tensor<int32, [4]> var_10452_begin_0 = const()[name = string("op_10452_begin_0"), val = tensor<int32, [4]>([0, 3, 0, 0])];
+            tensor<int32, [4]> var_10452_end_0 = const()[name = string("op_10452_end_0"), val = tensor<int32, [4]>([1, 4, 128, 256])];
+            tensor<bool, [4]> var_10452_end_mask_0 = const()[name = string("op_10452_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_10452_cast_fp16 = slice_by_index(begin = var_10452_begin_0, end = var_10452_end_0, end_mask = var_10452_end_mask_0, x = value_heads_105_cast_fp16)[name = string("op_10452_cast_fp16")];
+            tensor<int32, [4]> var_10464_begin_0 = const()[name = string("op_10464_begin_0"), val = tensor<int32, [4]>([0, 4, 0, 0])];
+            tensor<int32, [4]> var_10464_end_0 = const()[name = string("op_10464_end_0"), val = tensor<int32, [4]>([1, 5, 128, 256])];
+            tensor<bool, [4]> var_10464_end_mask_0 = const()[name = string("op_10464_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_10464_cast_fp16 = slice_by_index(begin = var_10464_begin_0, end = var_10464_end_0, end_mask = var_10464_end_mask_0, x = key_heads_105_cast_fp16)[name = string("op_10464_cast_fp16")];
+            tensor<int32, [4]> var_10468_begin_0 = const()[name = string("op_10468_begin_0"), val = tensor<int32, [4]>([0, 4, 0, 0])];
+            tensor<int32, [4]> var_10468_end_0 = const()[name = string("op_10468_end_0"), val = tensor<int32, [4]>([1, 5, 128, 256])];
+            tensor<bool, [4]> var_10468_end_mask_0 = const()[name = string("op_10468_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_10468_cast_fp16 = slice_by_index(begin = var_10468_begin_0, end = var_10468_end_0, end_mask = var_10468_end_mask_0, x = value_heads_105_cast_fp16)[name = string("op_10468_cast_fp16")];
+            tensor<int32, [4]> var_10480_begin_0 = const()[name = string("op_10480_begin_0"), val = tensor<int32, [4]>([0, 5, 0, 0])];
+            tensor<int32, [4]> var_10480_end_0 = const()[name = string("op_10480_end_0"), val = tensor<int32, [4]>([1, 6, 128, 256])];
+            tensor<bool, [4]> var_10480_end_mask_0 = const()[name = string("op_10480_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_10480_cast_fp16 = slice_by_index(begin = var_10480_begin_0, end = var_10480_end_0, end_mask = var_10480_end_mask_0, x = key_heads_105_cast_fp16)[name = string("op_10480_cast_fp16")];
+            tensor<int32, [4]> var_10484_begin_0 = const()[name = string("op_10484_begin_0"), val = tensor<int32, [4]>([0, 5, 0, 0])];
+            tensor<int32, [4]> var_10484_end_0 = const()[name = string("op_10484_end_0"), val = tensor<int32, [4]>([1, 6, 128, 256])];
+            tensor<bool, [4]> var_10484_end_mask_0 = const()[name = string("op_10484_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_10484_cast_fp16 = slice_by_index(begin = var_10484_begin_0, end = var_10484_end_0, end_mask = var_10484_end_mask_0, x = value_heads_105_cast_fp16)[name = string("op_10484_cast_fp16")];
+            tensor<int32, [4]> var_10496_begin_0 = const()[name = string("op_10496_begin_0"), val = tensor<int32, [4]>([0, 6, 0, 0])];
+            tensor<int32, [4]> var_10496_end_0 = const()[name = string("op_10496_end_0"), val = tensor<int32, [4]>([1, 7, 128, 256])];
+            tensor<bool, [4]> var_10496_end_mask_0 = const()[name = string("op_10496_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_10496_cast_fp16 = slice_by_index(begin = var_10496_begin_0, end = var_10496_end_0, end_mask = var_10496_end_mask_0, x = key_heads_105_cast_fp16)[name = string("op_10496_cast_fp16")];
+            tensor<int32, [4]> var_10500_begin_0 = const()[name = string("op_10500_begin_0"), val = tensor<int32, [4]>([0, 6, 0, 0])];
+            tensor<int32, [4]> var_10500_end_0 = const()[name = string("op_10500_end_0"), val = tensor<int32, [4]>([1, 7, 128, 256])];
+            tensor<bool, [4]> var_10500_end_mask_0 = const()[name = string("op_10500_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_10500_cast_fp16 = slice_by_index(begin = var_10500_begin_0, end = var_10500_end_0, end_mask = var_10500_end_mask_0, x = value_heads_105_cast_fp16)[name = string("op_10500_cast_fp16")];
+            tensor<int32, [4]> var_10512_begin_0 = const()[name = string("op_10512_begin_0"), val = tensor<int32, [4]>([0, 7, 0, 0])];
+            tensor<int32, [4]> var_10512_end_0 = const()[name = string("op_10512_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_10512_end_mask_0 = const()[name = string("op_10512_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_10512_cast_fp16 = slice_by_index(begin = var_10512_begin_0, end = var_10512_end_0, end_mask = var_10512_end_mask_0, x = key_heads_105_cast_fp16)[name = string("op_10512_cast_fp16")];
+            tensor<int32, [4]> var_10516_begin_0 = const()[name = string("op_10516_begin_0"), val = tensor<int32, [4]>([0, 7, 0, 0])];
+            tensor<int32, [4]> var_10516_end_0 = const()[name = string("op_10516_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_10516_end_mask_0 = const()[name = string("op_10516_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_10516_cast_fp16 = slice_by_index(begin = var_10516_begin_0, end = var_10516_end_0, end_mask = var_10516_end_mask_0, x = value_heads_105_cast_fp16)[name = string("op_10516_cast_fp16")];
+            bool key_heads_107_interleave_0 = const()[name = string("key_heads_107_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 256]> key_heads_107_cast_fp16 = concat(axis = var_10242, interleave = key_heads_107_interleave_0, values = (var_10400_cast_fp16, var_10400_cast_fp16, var_10416_cast_fp16, var_10416_cast_fp16, var_10432_cast_fp16, var_10432_cast_fp16, var_10448_cast_fp16, var_10448_cast_fp16, var_10464_cast_fp16, var_10464_cast_fp16, var_10480_cast_fp16, var_10480_cast_fp16, var_10496_cast_fp16, var_10496_cast_fp16, var_10512_cast_fp16, var_10512_cast_fp16))[name = string("key_heads_107_cast_fp16")];
+            bool value_heads_107_interleave_0 = const()[name = string("value_heads_107_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 256]> value_heads_107_cast_fp16 = concat(axis = var_10242, interleave = value_heads_107_interleave_0, values = (var_10404_cast_fp16, var_10404_cast_fp16, var_10420_cast_fp16, var_10420_cast_fp16, var_10436_cast_fp16, var_10436_cast_fp16, var_10452_cast_fp16, var_10452_cast_fp16, var_10468_cast_fp16, var_10468_cast_fp16, var_10484_cast_fp16, var_10484_cast_fp16, var_10500_cast_fp16, var_10500_cast_fp16, var_10516_cast_fp16, var_10516_cast_fp16))[name = string("value_heads_107_cast_fp16")];
+            fp16 var_10539_to_fp16 = const()[name = string("op_10539_to_fp16"), val = fp16(0x1.6ap-4)];
+            tensor<fp16, [1, 16, 128, 1]> var_10540_cast_fp16 = mul(x = mh_q_159_cast_fp16, y = var_10539_to_fp16)[name = string("op_10540_cast_fp16")];
+            bool mh_w_105_transpose_x_0 = const()[name = string("mh_w_105_transpose_x_0"), val = bool(true)];
+            bool mh_w_105_transpose_y_0 = const()[name = string("mh_w_105_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 1, 256]> mh_w_105_cast_fp16 = matmul(transpose_x = mh_w_105_transpose_x_0, transpose_y = mh_w_105_transpose_y_0, x = var_10540_cast_fp16, y = key_heads_107_cast_fp16)[name = string("mh_w_105_cast_fp16")];
+            tensor<fp16, [1, 16, 1, 256]> mh_w_107_cast_fp16 = add(x = mh_w_105_cast_fp16, y = var_487_cast_fp16)[name = string("mh_w_107_cast_fp16")];
+            tensor<fp16, [1, 16, 1, 256]> var_10552_cast_fp16 = softmax(axis = var_10224, x = mh_w_107_cast_fp16)[name = string("op_10552_cast_fp16")];
+            bool attn_53_transpose_x_0 = const()[name = string("attn_53_transpose_x_0"), val = bool(false)];
+            bool attn_53_transpose_y_0 = const()[name = string("attn_53_transpose_y_0"), val = bool(true)];
+            tensor<fp16, [1, 16, 128, 1]> attn_53_cast_fp16 = matmul(transpose_x = attn_53_transpose_x_0, transpose_y = attn_53_transpose_y_0, x = value_heads_107_cast_fp16, y = var_10552_cast_fp16)[name = string("attn_53_cast_fp16")];
+            tensor<int32, [4]> var_10557 = const()[name = string("op_10557"), val = tensor<int32, [4]>([1, -1, 1, 1])];
+            tensor<fp16, [1, 2048, 1, 1]> input_209_cast_fp16 = reshape(shape = var_10557, x = attn_53_cast_fp16)[name = string("input_209_cast_fp16")];
+            string obj_219_pad_type_0 = const()[name = string("obj_219_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> obj_219_strides_0 = const()[name = string("obj_219_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> obj_219_pad_0 = const()[name = string("obj_219_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> obj_219_dilations_0 = const()[name = string("obj_219_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 obj_219_groups_0 = const()[name = string("obj_219_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 2048, 1, 1]> layers_26_self_attn_o_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(413460096))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(415557312))))[name = string("layers_26_self_attn_o_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> obj_219_cast_fp16 = conv(bias = layers_0_self_attn_v_proj_bias_to_fp16, dilations = obj_219_dilations_0, groups = obj_219_groups_0, pad = obj_219_pad_0, pad_type = obj_219_pad_type_0, strides = obj_219_strides_0, weight = layers_26_self_attn_o_proj_weight_to_fp16_palettized, x = input_209_cast_fp16)[name = string("obj_219_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> inputs_213_cast_fp16 = add(x = inputs_207_cast_fp16, y = obj_219_cast_fp16)[name = string("inputs_213_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> inputs_sq_215_cast_fp16 = mul(x = inputs_213_cast_fp16, y = inputs_213_cast_fp16)[name = string("inputs_sq_215_cast_fp16")];
+            tensor<int32, [1]> variance_215_axes_0 = const()[name = string("variance_215_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_215_keep_dims_0 = const()[name = string("variance_215_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 1, 1, 1]> variance_215_cast_fp16 = reduce_mean(axes = variance_215_axes_0, keep_dims = variance_215_keep_dims_0, x = inputs_sq_215_cast_fp16)[name = string("variance_215_cast_fp16")];
+            fp16 var_10575_to_fp16 = const()[name = string("op_10575_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 1, 1]> var_10576_cast_fp16 = add(x = variance_215_cast_fp16, y = var_10575_to_fp16)[name = string("op_10576_cast_fp16")];
+            fp32 var_10577_epsilon_0 = const()[name = string("op_10577_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 1, 1, 1]> var_10577_cast_fp16 = rsqrt(epsilon = var_10577_epsilon_0, x = var_10576_cast_fp16)[name = string("op_10577_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> hidden_states_267_cast_fp16 = mul(x = inputs_213_cast_fp16, y = var_10577_cast_fp16)[name = string("hidden_states_267_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> w_215_to_fp16 = const()[name = string("w_215_to_fp16"), val = tensor<fp16, [1, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(415557888)))];
+            tensor<fp16, [1, 1024, 1, 1]> input_211_cast_fp16 = mul(x = w_215_to_fp16, y = hidden_states_267_cast_fp16)[name = string("input_211_cast_fp16")];
+            string input_213_pad_type_0 = const()[name = string("input_213_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> input_213_strides_0 = const()[name = string("input_213_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> input_213_pad_0 = const()[name = string("input_213_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> input_213_dilations_0 = const()[name = string("input_213_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 input_213_groups_0 = const()[name = string("input_213_groups_0"), val = int32(1)];
+            tensor<fp16, [3072, 1024, 1, 1]> layers_26_mlp_gate_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [3072, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(415560000))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(418705792))))[name = string("layers_26_mlp_gate_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 3072, 1, 1]> input_213_cast_fp16 = conv(dilations = input_213_dilations_0, groups = input_213_groups_0, pad = input_213_pad_0, pad_type = input_213_pad_type_0, strides = input_213_strides_0, weight = layers_26_mlp_gate_proj_weight_to_fp16_palettized, x = input_211_cast_fp16)[name = string("input_213_cast_fp16")];
+            tensor<fp16, [1, 3072, 1, 1]> var_10591_cast_fp16 = silu(x = input_213_cast_fp16)[name = string("op_10591_cast_fp16")];
+            string var_10597_pad_type_0 = const()[name = string("op_10597_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_10597_strides_0 = const()[name = string("op_10597_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_10597_pad_0 = const()[name = string("op_10597_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_10597_dilations_0 = const()[name = string("op_10597_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_10597_groups_0 = const()[name = string("op_10597_groups_0"), val = int32(1)];
+            tensor<fp16, [3072, 1024, 1, 1]> layers_26_mlp_up_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [3072, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(418706368))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(421852160))))[name = string("layers_26_mlp_up_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 3072, 1, 1]> var_10597_cast_fp16 = conv(dilations = var_10597_dilations_0, groups = var_10597_groups_0, pad = var_10597_pad_0, pad_type = var_10597_pad_type_0, strides = var_10597_strides_0, weight = layers_26_mlp_up_proj_weight_to_fp16_palettized, x = input_211_cast_fp16)[name = string("op_10597_cast_fp16")];
+            tensor<fp16, [1, 3072, 1, 1]> input_215_cast_fp16 = mul(x = var_10591_cast_fp16, y = var_10597_cast_fp16)[name = string("input_215_cast_fp16")];
+            string hidden_states_269_pad_type_0 = const()[name = string("hidden_states_269_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> hidden_states_269_strides_0 = const()[name = string("hidden_states_269_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> hidden_states_269_pad_0 = const()[name = string("hidden_states_269_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> hidden_states_269_dilations_0 = const()[name = string("hidden_states_269_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 hidden_states_269_groups_0 = const()[name = string("hidden_states_269_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 3072, 1, 1]> layers_26_mlp_down_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 3072, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(421852736))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(424998528))))[name = string("layers_26_mlp_down_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> hidden_states_269_cast_fp16 = conv(dilations = hidden_states_269_dilations_0, groups = hidden_states_269_groups_0, pad = hidden_states_269_pad_0, pad_type = hidden_states_269_pad_type_0, strides = hidden_states_269_strides_0, weight = layers_26_mlp_down_proj_weight_to_fp16_palettized, x = input_215_cast_fp16)[name = string("hidden_states_269_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> inputs_215_cast_fp16 = add(x = inputs_213_cast_fp16, y = hidden_states_269_cast_fp16)[name = string("inputs_215_cast_fp16")];
+            int32 var_10611 = const()[name = string("op_10611"), val = int32(3)];
+            int32 var_10621 = const()[name = string("op_10621"), val = int32(-2)];
+            int32 var_10629 = const()[name = string("op_10629"), val = int32(1)];
+            tensor<fp16, [1, 1024, 1, 1]> inputs_sq_217_cast_fp16 = mul(x = inputs_215_cast_fp16, y = inputs_215_cast_fp16)[name = string("inputs_sq_217_cast_fp16")];
+            tensor<int32, [1]> variance_217_axes_0 = const()[name = string("variance_217_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_217_keep_dims_0 = const()[name = string("variance_217_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 1, 1, 1]> variance_217_cast_fp16 = reduce_mean(axes = variance_217_axes_0, keep_dims = variance_217_keep_dims_0, x = inputs_sq_217_cast_fp16)[name = string("variance_217_cast_fp16")];
+            fp16 var_10641_to_fp16 = const()[name = string("op_10641_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 1, 1]> var_10642_cast_fp16 = add(x = variance_217_cast_fp16, y = var_10641_to_fp16)[name = string("op_10642_cast_fp16")];
+            fp32 var_10643_epsilon_0 = const()[name = string("op_10643_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 1, 1, 1]> var_10643_cast_fp16 = rsqrt(epsilon = var_10643_epsilon_0, x = var_10642_cast_fp16)[name = string("op_10643_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> hidden_states_271_cast_fp16 = mul(x = inputs_215_cast_fp16, y = var_10643_cast_fp16)[name = string("hidden_states_271_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> w_217_to_fp16 = const()[name = string("w_217_to_fp16"), val = tensor<fp16, [1, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(424999104)))];
+            tensor<fp16, [1, 1024, 1, 1]> obj_221_cast_fp16 = mul(x = w_217_to_fp16, y = hidden_states_271_cast_fp16)[name = string("obj_221_cast_fp16")];
+            string query_163_pad_type_0 = const()[name = string("query_163_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> query_163_strides_0 = const()[name = string("query_163_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> query_163_pad_0 = const()[name = string("query_163_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> query_163_dilations_0 = const()[name = string("query_163_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 query_163_groups_0 = const()[name = string("query_163_groups_0"), val = int32(1)];
+            tensor<fp16, [2048, 1024, 1, 1]> layers_27_self_attn_q_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [2048, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(425001216))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(427098432))))[name = string("layers_27_self_attn_q_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1, 1]> query_163_cast_fp16 = conv(bias = layers_0_self_attn_q_proj_bias_to_fp16, dilations = query_163_dilations_0, groups = query_163_groups_0, pad = query_163_pad_0, pad_type = query_163_pad_type_0, strides = query_163_strides_0, weight = layers_27_self_attn_q_proj_weight_to_fp16_palettized, x = obj_221_cast_fp16)[name = string("query_163_cast_fp16")];
+            string current_key_109_pad_type_0 = const()[name = string("current_key_109_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> current_key_109_strides_0 = const()[name = string("current_key_109_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> current_key_109_pad_0 = const()[name = string("current_key_109_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> current_key_109_dilations_0 = const()[name = string("current_key_109_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 current_key_109_groups_0 = const()[name = string("current_key_109_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_27_self_attn_k_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(427099008))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(428147648))))[name = string("layers_27_self_attn_k_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> current_key_109_cast_fp16 = conv(dilations = current_key_109_dilations_0, groups = current_key_109_groups_0, pad = current_key_109_pad_0, pad_type = current_key_109_pad_type_0, strides = current_key_109_strides_0, weight = layers_27_self_attn_k_proj_weight_to_fp16_palettized, x = obj_221_cast_fp16)[name = string("current_key_109_cast_fp16")];
+            string current_value_pad_type_0 = const()[name = string("current_value_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> current_value_strides_0 = const()[name = string("current_value_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> current_value_pad_0 = const()[name = string("current_value_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> current_value_dilations_0 = const()[name = string("current_value_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 current_value_groups_0 = const()[name = string("current_value_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_27_self_attn_v_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(428148224))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(429196864))))[name = string("layers_27_self_attn_v_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> current_value_cast_fp16 = conv(bias = layers_0_self_attn_v_proj_bias_to_fp16, dilations = current_value_dilations_0, groups = current_value_groups_0, pad = current_value_pad_0, pad_type = current_value_pad_type_0, strides = current_value_strides_0, weight = layers_27_self_attn_v_proj_weight_to_fp16_palettized, x = obj_221_cast_fp16)[name = string("current_value_cast_fp16")];
+            tensor<int32, [4]> var_10680 = const()[name = string("op_10680"), val = tensor<int32, [4]>([16, 128, 1, 1])];
+            tensor<fp16, [16, 128, 1, 1]> inputs_217_cast_fp16 = reshape(shape = var_10680, x = query_163_cast_fp16)[name = string("inputs_217_cast_fp16")];
+            tensor<fp16, [16, 128, 1, 1]> inputs_sq_219_cast_fp16 = mul(x = inputs_217_cast_fp16, y = inputs_217_cast_fp16)[name = string("inputs_sq_219_cast_fp16")];
+            tensor<int32, [1]> variance_219_axes_0 = const()[name = string("variance_219_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_219_keep_dims_0 = const()[name = string("variance_219_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [16, 1, 1, 1]> variance_219_cast_fp16 = reduce_mean(axes = variance_219_axes_0, keep_dims = variance_219_keep_dims_0, x = inputs_sq_219_cast_fp16)[name = string("variance_219_cast_fp16")];
+            fp16 var_10686_to_fp16 = const()[name = string("op_10686_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [16, 1, 1, 1]> var_10687_cast_fp16 = add(x = variance_219_cast_fp16, y = var_10686_to_fp16)[name = string("op_10687_cast_fp16")];
+            fp32 var_10688_epsilon_0 = const()[name = string("op_10688_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [16, 1, 1, 1]> var_10688_cast_fp16 = rsqrt(epsilon = var_10688_epsilon_0, x = var_10687_cast_fp16)[name = string("op_10688_cast_fp16")];
+            tensor<fp16, [16, 128, 1, 1]> hidden_states_273_cast_fp16 = mul(x = inputs_217_cast_fp16, y = var_10688_cast_fp16)[name = string("hidden_states_273_cast_fp16")];
+            tensor<fp16, [1, 128, 1, 1]> w_219_to_fp16 = const()[name = string("w_219_to_fp16"), val = tensor<fp16, [1, 128, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(429197440)))];
+            tensor<fp16, [16, 128, 1, 1]> query_normed_cast_fp16 = mul(x = w_219_to_fp16, y = hidden_states_273_cast_fp16)[name = string("query_normed_cast_fp16")];
+            tensor<int32, [4]> var_10696 = const()[name = string("op_10696"), val = tensor<int32, [4]>([8, 128, 1, 1])];
+            tensor<fp16, [8, 128, 1, 1]> inputs_219_cast_fp16 = reshape(shape = var_10696, x = current_key_109_cast_fp16)[name = string("inputs_219_cast_fp16")];
+            tensor<fp16, [8, 128, 1, 1]> inputs_sq_221_cast_fp16 = mul(x = inputs_219_cast_fp16, y = inputs_219_cast_fp16)[name = string("inputs_sq_221_cast_fp16")];
+            tensor<int32, [1]> variance_221_axes_0 = const()[name = string("variance_221_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_221_keep_dims_0 = const()[name = string("variance_221_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [8, 1, 1, 1]> variance_221_cast_fp16 = reduce_mean(axes = variance_221_axes_0, keep_dims = variance_221_keep_dims_0, x = inputs_sq_221_cast_fp16)[name = string("variance_221_cast_fp16")];
+            fp16 var_10702_to_fp16 = const()[name = string("op_10702_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [8, 1, 1, 1]> var_10703_cast_fp16 = add(x = variance_221_cast_fp16, y = var_10702_to_fp16)[name = string("op_10703_cast_fp16")];
+            fp32 var_10704_epsilon_0 = const()[name = string("op_10704_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [8, 1, 1, 1]> var_10704_cast_fp16 = rsqrt(epsilon = var_10704_epsilon_0, x = var_10703_cast_fp16)[name = string("op_10704_cast_fp16")];
+            tensor<fp16, [8, 128, 1, 1]> hidden_states_275_cast_fp16 = mul(x = inputs_219_cast_fp16, y = var_10704_cast_fp16)[name = string("hidden_states_275_cast_fp16")];
+            tensor<fp16, [1, 128, 1, 1]> w_221_to_fp16 = const()[name = string("w_221_to_fp16"), val = tensor<fp16, [1, 128, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(429197760)))];
+            tensor<fp16, [8, 128, 1, 1]> current_key_normed_cast_fp16 = mul(x = w_221_to_fp16, y = hidden_states_275_cast_fp16)[name = string("current_key_normed_cast_fp16")];
+            tensor<int32, [4]> var_10722 = const()[name = string("op_10722"), val = tensor<int32, [4]>([1, 16, 128, -1])];
+            tensor<fp16, [1, 16, 128, 1]> mh_q_163_cast_fp16 = reshape(shape = var_10722, x = query_normed_cast_fp16)[name = string("mh_q_163_cast_fp16")];
+            tensor<int32, [4]> var_10724 = const()[name = string("op_10724"), val = tensor<int32, [4]>([1, 8, 128, -1])];
+            tensor<fp16, [1, 8, 128, 1]> mh_k_109_cast_fp16 = reshape(shape = var_10724, x = current_key_normed_cast_fp16)[name = string("mh_k_109_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> var_10728_cast_fp16 = mul(x = mh_q_163_cast_fp16, y = cos_1_cast_fp16)[name = string("op_10728_cast_fp16")];
+            tensor<int32, [4]> var_10733_begin_0 = const()[name = string("op_10733_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_10733_end_0 = const()[name = string("op_10733_end_0"), val = tensor<int32, [4]>([1, 16, 64, 1])];
+            tensor<bool, [4]> var_10733_end_mask_0 = const()[name = string("op_10733_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 16, 64, 1]> var_10733_cast_fp16 = slice_by_index(begin = var_10733_begin_0, end = var_10733_end_0, end_mask = var_10733_end_mask_0, x = mh_q_163_cast_fp16)[name = string("op_10733_cast_fp16")];
+            tensor<int32, [4]> var_10739_begin_0 = const()[name = string("op_10739_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_10739_end_0 = const()[name = string("op_10739_end_0"), val = tensor<int32, [4]>([1, 16, 128, 1])];
+            tensor<bool, [4]> var_10739_end_mask_0 = const()[name = string("op_10739_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 16, 64, 1]> var_10739_cast_fp16 = slice_by_index(begin = var_10739_begin_0, end = var_10739_end_0, end_mask = var_10739_end_mask_0, x = mh_q_163_cast_fp16)[name = string("op_10739_cast_fp16")];
+            fp16 const_638_promoted_to_fp16 = const()[name = string("const_638_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 16, 64, 1]> var_10741_cast_fp16 = mul(x = var_10739_cast_fp16, y = const_638_promoted_to_fp16)[name = string("op_10741_cast_fp16")];
+            bool var_10743_interleave_0 = const()[name = string("op_10743_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 1]> var_10743_cast_fp16 = concat(axis = var_10621, interleave = var_10743_interleave_0, values = (var_10741_cast_fp16, var_10733_cast_fp16))[name = string("op_10743_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> var_10744_cast_fp16 = mul(x = var_10743_cast_fp16, y = sin_1_cast_fp16)[name = string("op_10744_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> mh_q_165_cast_fp16 = add(x = var_10728_cast_fp16, y = var_10744_cast_fp16)[name = string("mh_q_165_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> var_10746_cast_fp16 = mul(x = mh_k_109_cast_fp16, y = cos_1_cast_fp16)[name = string("op_10746_cast_fp16")];
+            tensor<int32, [4]> var_10751_begin_0 = const()[name = string("op_10751_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_10751_end_0 = const()[name = string("op_10751_end_0"), val = tensor<int32, [4]>([1, 8, 64, 1])];
+            tensor<bool, [4]> var_10751_end_mask_0 = const()[name = string("op_10751_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 8, 64, 1]> var_10751_cast_fp16 = slice_by_index(begin = var_10751_begin_0, end = var_10751_end_0, end_mask = var_10751_end_mask_0, x = mh_k_109_cast_fp16)[name = string("op_10751_cast_fp16")];
+            tensor<int32, [4]> var_10757_begin_0 = const()[name = string("op_10757_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_10757_end_0 = const()[name = string("op_10757_end_0"), val = tensor<int32, [4]>([1, 8, 128, 1])];
+            tensor<bool, [4]> var_10757_end_mask_0 = const()[name = string("op_10757_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 8, 64, 1]> var_10757_cast_fp16 = slice_by_index(begin = var_10757_begin_0, end = var_10757_end_0, end_mask = var_10757_end_mask_0, x = mh_k_109_cast_fp16)[name = string("op_10757_cast_fp16")];
+            fp16 const_641_promoted_to_fp16 = const()[name = string("const_641_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 64, 1]> var_10759_cast_fp16 = mul(x = var_10757_cast_fp16, y = const_641_promoted_to_fp16)[name = string("op_10759_cast_fp16")];
+            bool var_10761_interleave_0 = const()[name = string("op_10761_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 128, 1]> var_10761_cast_fp16 = concat(axis = var_10621, interleave = var_10761_interleave_0, values = (var_10759_cast_fp16, var_10751_cast_fp16))[name = string("op_10761_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> var_10762_cast_fp16 = mul(x = var_10761_cast_fp16, y = sin_1_cast_fp16)[name = string("op_10762_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> mh_k_cast_fp16 = add(x = var_10746_cast_fp16, y = var_10762_cast_fp16)[name = string("mh_k_cast_fp16")];
+            tensor<int32, [4]> var_10766 = const()[name = string("op_10766"), val = tensor<int32, [4]>([1, 1024, 1, 1])];
+            tensor<fp16, [1, 1024, 1, 1]> current_key_cast_fp16 = reshape(shape = var_10766, x = mh_k_cast_fp16)[name = string("current_key_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_10773_cast_fp16 = mul(x = var_101_cast_fp16_27, y = var_323_cast_fp16)[name = string("op_10773_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_10774_cast_fp16 = mul(x = current_key_cast_fp16, y = var_321_cast_fp16)[name = string("op_10774_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> key_165_cast_fp16 = add(x = var_10773_cast_fp16, y = var_10774_cast_fp16)[name = string("key_165_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_10777_cast_fp16 = mul(x = var_132_cast_fp16_27, y = var_323_cast_fp16)[name = string("op_10777_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_10778_cast_fp16 = mul(x = current_value_cast_fp16, y = var_321_cast_fp16)[name = string("op_10778_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> value_109_cast_fp16 = add(x = var_10777_cast_fp16, y = var_10778_cast_fp16)[name = string("value_109_cast_fp16")];
+            tensor<int32, [4]> var_10782 = const()[name = string("op_10782"), val = tensor<int32, [4]>([1, 8, 128, 256])];
+            tensor<fp16, [1, 8, 128, 256]> key_heads_109_cast_fp16 = reshape(shape = var_10782, x = key_165_cast_fp16)[name = string("key_heads_109_cast_fp16")];
+            tensor<int32, [4]> var_10784 = const()[name = string("op_10784"), val = tensor<int32, [4]>([1, 8, 128, 256])];
+            tensor<fp16, [1, 8, 128, 256]> value_heads_109_cast_fp16 = reshape(shape = var_10784, x = value_109_cast_fp16)[name = string("value_heads_109_cast_fp16")];
+            tensor<int32, [4]> var_10787_begin_0 = const()[name = string("op_10787_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_10787_end_0 = const()[name = string("op_10787_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_10787_end_mask_0 = const()[name = string("op_10787_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_10787_cast_fp16 = slice_by_index(begin = var_10787_begin_0, end = var_10787_end_0, end_mask = var_10787_end_mask_0, x = key_heads_109_cast_fp16)[name = string("op_10787_cast_fp16")];
+            tensor<int32, [4]> var_10791_begin_0 = const()[name = string("op_10791_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_10791_end_0 = const()[name = string("op_10791_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_10791_end_mask_0 = const()[name = string("op_10791_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_10791_cast_fp16 = slice_by_index(begin = var_10791_begin_0, end = var_10791_end_0, end_mask = var_10791_end_mask_0, x = value_heads_109_cast_fp16)[name = string("op_10791_cast_fp16")];
+            tensor<int32, [4]> var_10803_begin_0 = const()[name = string("op_10803_begin_0"), val = tensor<int32, [4]>([0, 1, 0, 0])];
+            tensor<int32, [4]> var_10803_end_0 = const()[name = string("op_10803_end_0"), val = tensor<int32, [4]>([1, 2, 128, 256])];
+            tensor<bool, [4]> var_10803_end_mask_0 = const()[name = string("op_10803_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_10803_cast_fp16 = slice_by_index(begin = var_10803_begin_0, end = var_10803_end_0, end_mask = var_10803_end_mask_0, x = key_heads_109_cast_fp16)[name = string("op_10803_cast_fp16")];
+            tensor<int32, [4]> var_10807_begin_0 = const()[name = string("op_10807_begin_0"), val = tensor<int32, [4]>([0, 1, 0, 0])];
+            tensor<int32, [4]> var_10807_end_0 = const()[name = string("op_10807_end_0"), val = tensor<int32, [4]>([1, 2, 128, 256])];
+            tensor<bool, [4]> var_10807_end_mask_0 = const()[name = string("op_10807_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_10807_cast_fp16 = slice_by_index(begin = var_10807_begin_0, end = var_10807_end_0, end_mask = var_10807_end_mask_0, x = value_heads_109_cast_fp16)[name = string("op_10807_cast_fp16")];
+            tensor<int32, [4]> var_10819_begin_0 = const()[name = string("op_10819_begin_0"), val = tensor<int32, [4]>([0, 2, 0, 0])];
+            tensor<int32, [4]> var_10819_end_0 = const()[name = string("op_10819_end_0"), val = tensor<int32, [4]>([1, 3, 128, 256])];
+            tensor<bool, [4]> var_10819_end_mask_0 = const()[name = string("op_10819_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_10819_cast_fp16 = slice_by_index(begin = var_10819_begin_0, end = var_10819_end_0, end_mask = var_10819_end_mask_0, x = key_heads_109_cast_fp16)[name = string("op_10819_cast_fp16")];
+            tensor<int32, [4]> var_10823_begin_0 = const()[name = string("op_10823_begin_0"), val = tensor<int32, [4]>([0, 2, 0, 0])];
+            tensor<int32, [4]> var_10823_end_0 = const()[name = string("op_10823_end_0"), val = tensor<int32, [4]>([1, 3, 128, 256])];
+            tensor<bool, [4]> var_10823_end_mask_0 = const()[name = string("op_10823_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_10823_cast_fp16 = slice_by_index(begin = var_10823_begin_0, end = var_10823_end_0, end_mask = var_10823_end_mask_0, x = value_heads_109_cast_fp16)[name = string("op_10823_cast_fp16")];
+            tensor<int32, [4]> var_10835_begin_0 = const()[name = string("op_10835_begin_0"), val = tensor<int32, [4]>([0, 3, 0, 0])];
+            tensor<int32, [4]> var_10835_end_0 = const()[name = string("op_10835_end_0"), val = tensor<int32, [4]>([1, 4, 128, 256])];
+            tensor<bool, [4]> var_10835_end_mask_0 = const()[name = string("op_10835_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_10835_cast_fp16 = slice_by_index(begin = var_10835_begin_0, end = var_10835_end_0, end_mask = var_10835_end_mask_0, x = key_heads_109_cast_fp16)[name = string("op_10835_cast_fp16")];
+            tensor<int32, [4]> var_10839_begin_0 = const()[name = string("op_10839_begin_0"), val = tensor<int32, [4]>([0, 3, 0, 0])];
+            tensor<int32, [4]> var_10839_end_0 = const()[name = string("op_10839_end_0"), val = tensor<int32, [4]>([1, 4, 128, 256])];
+            tensor<bool, [4]> var_10839_end_mask_0 = const()[name = string("op_10839_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_10839_cast_fp16 = slice_by_index(begin = var_10839_begin_0, end = var_10839_end_0, end_mask = var_10839_end_mask_0, x = value_heads_109_cast_fp16)[name = string("op_10839_cast_fp16")];
+            tensor<int32, [4]> var_10851_begin_0 = const()[name = string("op_10851_begin_0"), val = tensor<int32, [4]>([0, 4, 0, 0])];
+            tensor<int32, [4]> var_10851_end_0 = const()[name = string("op_10851_end_0"), val = tensor<int32, [4]>([1, 5, 128, 256])];
+            tensor<bool, [4]> var_10851_end_mask_0 = const()[name = string("op_10851_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_10851_cast_fp16 = slice_by_index(begin = var_10851_begin_0, end = var_10851_end_0, end_mask = var_10851_end_mask_0, x = key_heads_109_cast_fp16)[name = string("op_10851_cast_fp16")];
+            tensor<int32, [4]> var_10855_begin_0 = const()[name = string("op_10855_begin_0"), val = tensor<int32, [4]>([0, 4, 0, 0])];
+            tensor<int32, [4]> var_10855_end_0 = const()[name = string("op_10855_end_0"), val = tensor<int32, [4]>([1, 5, 128, 256])];
+            tensor<bool, [4]> var_10855_end_mask_0 = const()[name = string("op_10855_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_10855_cast_fp16 = slice_by_index(begin = var_10855_begin_0, end = var_10855_end_0, end_mask = var_10855_end_mask_0, x = value_heads_109_cast_fp16)[name = string("op_10855_cast_fp16")];
+            tensor<int32, [4]> var_10867_begin_0 = const()[name = string("op_10867_begin_0"), val = tensor<int32, [4]>([0, 5, 0, 0])];
+            tensor<int32, [4]> var_10867_end_0 = const()[name = string("op_10867_end_0"), val = tensor<int32, [4]>([1, 6, 128, 256])];
+            tensor<bool, [4]> var_10867_end_mask_0 = const()[name = string("op_10867_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_10867_cast_fp16 = slice_by_index(begin = var_10867_begin_0, end = var_10867_end_0, end_mask = var_10867_end_mask_0, x = key_heads_109_cast_fp16)[name = string("op_10867_cast_fp16")];
+            tensor<int32, [4]> var_10871_begin_0 = const()[name = string("op_10871_begin_0"), val = tensor<int32, [4]>([0, 5, 0, 0])];
+            tensor<int32, [4]> var_10871_end_0 = const()[name = string("op_10871_end_0"), val = tensor<int32, [4]>([1, 6, 128, 256])];
+            tensor<bool, [4]> var_10871_end_mask_0 = const()[name = string("op_10871_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_10871_cast_fp16 = slice_by_index(begin = var_10871_begin_0, end = var_10871_end_0, end_mask = var_10871_end_mask_0, x = value_heads_109_cast_fp16)[name = string("op_10871_cast_fp16")];
+            tensor<int32, [4]> var_10883_begin_0 = const()[name = string("op_10883_begin_0"), val = tensor<int32, [4]>([0, 6, 0, 0])];
+            tensor<int32, [4]> var_10883_end_0 = const()[name = string("op_10883_end_0"), val = tensor<int32, [4]>([1, 7, 128, 256])];
+            tensor<bool, [4]> var_10883_end_mask_0 = const()[name = string("op_10883_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_10883_cast_fp16 = slice_by_index(begin = var_10883_begin_0, end = var_10883_end_0, end_mask = var_10883_end_mask_0, x = key_heads_109_cast_fp16)[name = string("op_10883_cast_fp16")];
+            tensor<int32, [4]> var_10887_begin_0 = const()[name = string("op_10887_begin_0"), val = tensor<int32, [4]>([0, 6, 0, 0])];
+            tensor<int32, [4]> var_10887_end_0 = const()[name = string("op_10887_end_0"), val = tensor<int32, [4]>([1, 7, 128, 256])];
+            tensor<bool, [4]> var_10887_end_mask_0 = const()[name = string("op_10887_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_10887_cast_fp16 = slice_by_index(begin = var_10887_begin_0, end = var_10887_end_0, end_mask = var_10887_end_mask_0, x = value_heads_109_cast_fp16)[name = string("op_10887_cast_fp16")];
+            tensor<int32, [4]> var_10899_begin_0 = const()[name = string("op_10899_begin_0"), val = tensor<int32, [4]>([0, 7, 0, 0])];
+            tensor<int32, [4]> var_10899_end_0 = const()[name = string("op_10899_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_10899_end_mask_0 = const()[name = string("op_10899_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_10899_cast_fp16 = slice_by_index(begin = var_10899_begin_0, end = var_10899_end_0, end_mask = var_10899_end_mask_0, x = key_heads_109_cast_fp16)[name = string("op_10899_cast_fp16")];
+            tensor<int32, [4]> var_10903_begin_0 = const()[name = string("op_10903_begin_0"), val = tensor<int32, [4]>([0, 7, 0, 0])];
+            tensor<int32, [4]> var_10903_end_0 = const()[name = string("op_10903_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_10903_end_mask_0 = const()[name = string("op_10903_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_10903_cast_fp16 = slice_by_index(begin = var_10903_begin_0, end = var_10903_end_0, end_mask = var_10903_end_mask_0, x = value_heads_109_cast_fp16)[name = string("op_10903_cast_fp16")];
+            bool key_heads_interleave_0 = const()[name = string("key_heads_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 256]> key_heads_cast_fp16 = concat(axis = var_10629, interleave = key_heads_interleave_0, values = (var_10787_cast_fp16, var_10787_cast_fp16, var_10803_cast_fp16, var_10803_cast_fp16, var_10819_cast_fp16, var_10819_cast_fp16, var_10835_cast_fp16, var_10835_cast_fp16, var_10851_cast_fp16, var_10851_cast_fp16, var_10867_cast_fp16, var_10867_cast_fp16, var_10883_cast_fp16, var_10883_cast_fp16, var_10899_cast_fp16, var_10899_cast_fp16))[name = string("key_heads_cast_fp16")];
+            bool value_heads_interleave_0 = const()[name = string("value_heads_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 256]> value_heads_cast_fp16 = concat(axis = var_10629, interleave = value_heads_interleave_0, values = (var_10791_cast_fp16, var_10791_cast_fp16, var_10807_cast_fp16, var_10807_cast_fp16, var_10823_cast_fp16, var_10823_cast_fp16, var_10839_cast_fp16, var_10839_cast_fp16, var_10855_cast_fp16, var_10855_cast_fp16, var_10871_cast_fp16, var_10871_cast_fp16, var_10887_cast_fp16, var_10887_cast_fp16, var_10903_cast_fp16, var_10903_cast_fp16))[name = string("value_heads_cast_fp16")];
+            fp16 var_10926_to_fp16 = const()[name = string("op_10926_to_fp16"), val = fp16(0x1.6ap-4)];
+            tensor<fp16, [1, 16, 128, 1]> var_10927_cast_fp16 = mul(x = mh_q_165_cast_fp16, y = var_10926_to_fp16)[name = string("op_10927_cast_fp16")];
+            bool mh_w_109_transpose_x_0 = const()[name = string("mh_w_109_transpose_x_0"), val = bool(true)];
+            bool mh_w_109_transpose_y_0 = const()[name = string("mh_w_109_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 1, 256]> mh_w_109_cast_fp16 = matmul(transpose_x = mh_w_109_transpose_x_0, transpose_y = mh_w_109_transpose_y_0, x = var_10927_cast_fp16, y = key_heads_cast_fp16)[name = string("mh_w_109_cast_fp16")];
+            tensor<fp16, [1, 16, 1, 256]> mh_w_cast_fp16 = add(x = mh_w_109_cast_fp16, y = var_487_cast_fp16)[name = string("mh_w_cast_fp16")];
+            tensor<fp16, [1, 16, 1, 256]> var_10939_cast_fp16 = softmax(axis = var_10611, x = mh_w_cast_fp16)[name = string("op_10939_cast_fp16")];
+            bool attn_transpose_x_0 = const()[name = string("attn_transpose_x_0"), val = bool(false)];
+            bool attn_transpose_y_0 = const()[name = string("attn_transpose_y_0"), val = bool(true)];
+            tensor<fp16, [1, 16, 128, 1]> attn_cast_fp16 = matmul(transpose_x = attn_transpose_x_0, transpose_y = attn_transpose_y_0, x = value_heads_cast_fp16, y = var_10939_cast_fp16)[name = string("attn_cast_fp16")];
+            tensor<int32, [4]> var_10944 = const()[name = string("op_10944"), val = tensor<int32, [4]>([1, -1, 1, 1])];
+            tensor<fp16, [1, 2048, 1, 1]> input_217_cast_fp16 = reshape(shape = var_10944, x = attn_cast_fp16)[name = string("input_217_cast_fp16")];
+            string obj_pad_type_0 = const()[name = string("obj_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> obj_strides_0 = const()[name = string("obj_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> obj_pad_0 = const()[name = string("obj_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> obj_dilations_0 = const()[name = string("obj_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 obj_groups_0 = const()[name = string("obj_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 2048, 1, 1]> layers_27_self_attn_o_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(429198080))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(431295296))))[name = string("layers_27_self_attn_o_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> obj_cast_fp16 = conv(bias = layers_0_self_attn_v_proj_bias_to_fp16, dilations = obj_dilations_0, groups = obj_groups_0, pad = obj_pad_0, pad_type = obj_pad_type_0, strides = obj_strides_0, weight = layers_27_self_attn_o_proj_weight_to_fp16_palettized, x = input_217_cast_fp16)[name = string("obj_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> inputs_221_cast_fp16 = add(x = inputs_215_cast_fp16, y = obj_cast_fp16)[name = string("inputs_221_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> inputs_sq_223_cast_fp16 = mul(x = inputs_221_cast_fp16, y = inputs_221_cast_fp16)[name = string("inputs_sq_223_cast_fp16")];
+            tensor<int32, [1]> variance_223_axes_0 = const()[name = string("variance_223_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_223_keep_dims_0 = const()[name = string("variance_223_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 1, 1, 1]> variance_223_cast_fp16 = reduce_mean(axes = variance_223_axes_0, keep_dims = variance_223_keep_dims_0, x = inputs_sq_223_cast_fp16)[name = string("variance_223_cast_fp16")];
+            fp16 var_10962_to_fp16 = const()[name = string("op_10962_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 1, 1]> var_10963_cast_fp16 = add(x = variance_223_cast_fp16, y = var_10962_to_fp16)[name = string("op_10963_cast_fp16")];
+            fp32 var_10964_epsilon_0 = const()[name = string("op_10964_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 1, 1, 1]> var_10964_cast_fp16 = rsqrt(epsilon = var_10964_epsilon_0, x = var_10963_cast_fp16)[name = string("op_10964_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> hidden_states_277_cast_fp16 = mul(x = inputs_221_cast_fp16, y = var_10964_cast_fp16)[name = string("hidden_states_277_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> w_223_to_fp16 = const()[name = string("w_223_to_fp16"), val = tensor<fp16, [1, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(431295872)))];
+            tensor<fp16, [1, 1024, 1, 1]> input_219_cast_fp16 = mul(x = w_223_to_fp16, y = hidden_states_277_cast_fp16)[name = string("input_219_cast_fp16")];
+            string input_221_pad_type_0 = const()[name = string("input_221_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> input_221_strides_0 = const()[name = string("input_221_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> input_221_pad_0 = const()[name = string("input_221_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> input_221_dilations_0 = const()[name = string("input_221_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 input_221_groups_0 = const()[name = string("input_221_groups_0"), val = int32(1)];
+            tensor<fp16, [3072, 1024, 1, 1]> layers_27_mlp_gate_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [3072, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(431297984))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(434443776))))[name = string("layers_27_mlp_gate_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 3072, 1, 1]> input_221_cast_fp16 = conv(dilations = input_221_dilations_0, groups = input_221_groups_0, pad = input_221_pad_0, pad_type = input_221_pad_type_0, strides = input_221_strides_0, weight = layers_27_mlp_gate_proj_weight_to_fp16_palettized, x = input_219_cast_fp16)[name = string("input_221_cast_fp16")];
+            tensor<fp16, [1, 3072, 1, 1]> var_10978_cast_fp16 = silu(x = input_221_cast_fp16)[name = string("op_10978_cast_fp16")];
+            string var_10984_pad_type_0 = const()[name = string("op_10984_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_10984_strides_0 = const()[name = string("op_10984_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_10984_pad_0 = const()[name = string("op_10984_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_10984_dilations_0 = const()[name = string("op_10984_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_10984_groups_0 = const()[name = string("op_10984_groups_0"), val = int32(1)];
+            tensor<fp16, [3072, 1024, 1, 1]> layers_27_mlp_up_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [3072, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(434444352))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(437590144))))[name = string("layers_27_mlp_up_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 3072, 1, 1]> var_10984_cast_fp16 = conv(dilations = var_10984_dilations_0, groups = var_10984_groups_0, pad = var_10984_pad_0, pad_type = var_10984_pad_type_0, strides = var_10984_strides_0, weight = layers_27_mlp_up_proj_weight_to_fp16_palettized, x = input_219_cast_fp16)[name = string("op_10984_cast_fp16")];
+            tensor<fp16, [1, 3072, 1, 1]> input_223_cast_fp16 = mul(x = var_10978_cast_fp16, y = var_10984_cast_fp16)[name = string("input_223_cast_fp16")];
+            string hidden_states_279_pad_type_0 = const()[name = string("hidden_states_279_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> hidden_states_279_strides_0 = const()[name = string("hidden_states_279_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> hidden_states_279_pad_0 = const()[name = string("hidden_states_279_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> hidden_states_279_dilations_0 = const()[name = string("hidden_states_279_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 hidden_states_279_groups_0 = const()[name = string("hidden_states_279_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 3072, 1, 1]> layers_27_mlp_down_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 3072, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(437590720))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(440736512))))[name = string("layers_27_mlp_down_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> hidden_states_279_cast_fp16 = conv(dilations = hidden_states_279_dilations_0, groups = hidden_states_279_groups_0, pad = hidden_states_279_pad_0, pad_type = hidden_states_279_pad_type_0, strides = hidden_states_279_strides_0, weight = layers_27_mlp_down_proj_weight_to_fp16_palettized, x = input_223_cast_fp16)[name = string("hidden_states_279_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> inputs_cast_fp16 = add(x = inputs_221_cast_fp16, y = hidden_states_279_cast_fp16)[name = string("inputs_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> inputs_sq_cast_fp16 = mul(x = inputs_cast_fp16, y = inputs_cast_fp16)[name = string("inputs_sq_cast_fp16")];
+            tensor<int32, [1]> variance_axes_0 = const()[name = string("variance_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_keep_dims_0 = const()[name = string("variance_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 1, 1, 1]> variance_cast_fp16 = reduce_mean(axes = variance_axes_0, keep_dims = variance_keep_dims_0, x = inputs_sq_cast_fp16)[name = string("variance_cast_fp16")];
+            fp16 var_11005_to_fp16 = const()[name = string("op_11005_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 1, 1]> var_11006_cast_fp16 = add(x = variance_cast_fp16, y = var_11005_to_fp16)[name = string("op_11006_cast_fp16")];
+            fp32 var_11007_epsilon_0 = const()[name = string("op_11007_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 1, 1, 1]> var_11007_cast_fp16 = rsqrt(epsilon = var_11007_epsilon_0, x = var_11006_cast_fp16)[name = string("op_11007_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> hidden_states_cast_fp16 = mul(x = inputs_cast_fp16, y = var_11007_cast_fp16)[name = string("hidden_states_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> w_to_fp16 = const()[name = string("w_to_fp16"), val = tensor<fp16, [1, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(440737088)))];
+            tensor<fp16, [1, 1024, 1, 1]> hidden_states = mul(x = w_to_fp16, y = hidden_states_cast_fp16)[name = string("input_cast_fp16")];
+            string logits_pad_type_0 = const()[name = string("logits_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> logits_strides_0 = const()[name = string("logits_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> logits_pad_0 = const()[name = string("logits_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> logits_dilations_0 = const()[name = string("logits_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 logits_groups_0 = const()[name = string("logits_groups_0"), val = int32(1)];
+            tensor<fp16, [3072, 1024, 1, 1]> codec_head_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [3072, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(440739200))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(443884992))))[name = string("codec_head_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 3072, 1, 1]> logits_cast_fp16 = conv(dilations = logits_dilations_0, groups = logits_groups_0, pad = logits_pad_0, pad_type = logits_pad_type_0, strides = logits_strides_0, weight = codec_head_weight_to_fp16_palettized, x = hidden_states)[name = string("logits_cast_fp16")];
+            tensor<int32, [1]> var_11024_axes_0 = const()[name = string("op_11024_axes_0"), val = tensor<int32, [1]>([3])];
+            tensor<fp16, [1, 3072, 1]> var_11024_cast_fp16 = squeeze(axes = var_11024_axes_0, x = logits_cast_fp16)[name = string("op_11024_cast_fp16")];
+            tensor<int32, [3]> var_11027_perm_0 = const()[name = string("op_11027_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_11029 = const()[name = string("op_11029"), val = int32(1)];
+            bool var_11030_interleave_0 = const()[name = string("op_11030_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 28672, 1, 1]> key_cache_updates = concat(axis = var_11029, interleave = var_11030_interleave_0, values = (current_key_3_cast_fp16, current_key_7_cast_fp16, current_key_11_cast_fp16, current_key_15_cast_fp16, current_key_19_cast_fp16, current_key_23_cast_fp16, current_key_27_cast_fp16, current_key_31_cast_fp16, current_key_35_cast_fp16, current_key_39_cast_fp16, current_key_43_cast_fp16, current_key_47_cast_fp16, current_key_51_cast_fp16, current_key_55_cast_fp16, current_key_59_cast_fp16, current_key_63_cast_fp16, current_key_67_cast_fp16, current_key_71_cast_fp16, current_key_75_cast_fp16, current_key_79_cast_fp16, current_key_83_cast_fp16, current_key_87_cast_fp16, current_key_91_cast_fp16, current_key_95_cast_fp16, current_key_99_cast_fp16, current_key_103_cast_fp16, current_key_107_cast_fp16, current_key_cast_fp16))[name = string("op_11030_cast_fp16")];
+            int32 var_11032 = const()[name = string("op_11032"), val = int32(1)];
+            bool var_11033_interleave_0 = const()[name = string("op_11033_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 28672, 1, 1]> value_cache_updates = concat(axis = var_11032, interleave = var_11033_interleave_0, values = (current_value_1_cast_fp16, current_value_3_cast_fp16, current_value_5_cast_fp16, current_value_7_cast_fp16, current_value_9_cast_fp16, current_value_11_cast_fp16, current_value_13_cast_fp16, current_value_15_cast_fp16, current_value_17_cast_fp16, current_value_19_cast_fp16, current_value_21_cast_fp16, current_value_23_cast_fp16, current_value_25_cast_fp16, current_value_27_cast_fp16, current_value_29_cast_fp16, current_value_31_cast_fp16, current_value_33_cast_fp16, current_value_35_cast_fp16, current_value_37_cast_fp16, current_value_39_cast_fp16, current_value_41_cast_fp16, current_value_43_cast_fp16, current_value_45_cast_fp16, current_value_47_cast_fp16, current_value_49_cast_fp16, current_value_51_cast_fp16, current_value_53_cast_fp16, current_value_cast_fp16))[name = string("op_11033_cast_fp16")];
+            tensor<fp16, [1, 1, 3072]> logits = transpose(perm = var_11027_perm_0, x = var_11024_cast_fp16)[name = string("transpose_0")];
+        } -> (logits, hidden_states, key_cache_updates, value_cache_updates);
+}
\ No newline at end of file
diff --git a/qwen3_tts/code_decoder/12hz-0.6b-customvoice/W8A16-stateful/CodeDecoder.mlmodelc/weights/weight.bin b/qwen3_tts/code_decoder/12hz-0.6b-customvoice/W8A16-stateful/CodeDecoder.mlmodelc/weights/weight.bin
new file mode 100644
index 0000000000000000000000000000000000000000..12166c1637b29359e7e9662711e0b35ed84bccd4
--- /dev/null
+++ b/qwen3_tts/code_decoder/12hz-0.6b-customvoice/W8A16-stateful/CodeDecoder.mlmodelc/weights/weight.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c9618fdb2a96db80646b257659ec4cf01240283c211ea7fc3ad0d986d35304fa
+size 443885568
diff --git a/qwen3_tts/code_decoder/12hz-1.7b-customvoice/W8A16-stateful/CodeDecoder.mlmodelc/analytics/coremldata.bin b/qwen3_tts/code_decoder/12hz-1.7b-customvoice/W8A16-stateful/CodeDecoder.mlmodelc/analytics/coremldata.bin
new file mode 100644
index 0000000000000000000000000000000000000000..ddfb50aca5f173a66894a8f482971a369f6b2428
--- /dev/null
+++ b/qwen3_tts/code_decoder/12hz-1.7b-customvoice/W8A16-stateful/CodeDecoder.mlmodelc/analytics/coremldata.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2ae0466e386343e0079070977b713c72abd29e9653af179fbac1a0c656a7408b
+size 243
diff --git a/qwen3_tts/code_decoder/12hz-1.7b-customvoice/W8A16-stateful/CodeDecoder.mlmodelc/coremldata.bin b/qwen3_tts/code_decoder/12hz-1.7b-customvoice/W8A16-stateful/CodeDecoder.mlmodelc/coremldata.bin
new file mode 100644
index 0000000000000000000000000000000000000000..25afffaba132320d6188e7fb1555f33de4833590
--- /dev/null
+++ b/qwen3_tts/code_decoder/12hz-1.7b-customvoice/W8A16-stateful/CodeDecoder.mlmodelc/coremldata.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bac65aa923aecda6c84e1439769c47c440ab8f8afcdfd47b8d100b992dc7be1a
+size 671
diff --git a/qwen3_tts/code_decoder/12hz-1.7b-customvoice/W8A16-stateful/CodeDecoder.mlmodelc/metadata.json b/qwen3_tts/code_decoder/12hz-1.7b-customvoice/W8A16-stateful/CodeDecoder.mlmodelc/metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..f2cc48501987b32351fbe8496bdd24e1711f1bd2
--- /dev/null
+++ b/qwen3_tts/code_decoder/12hz-1.7b-customvoice/W8A16-stateful/CodeDecoder.mlmodelc/metadata.json
@@ -0,0 +1,159 @@
+[
+  {
+    "metadataOutputVersion" : "3.0",
+    "storagePrecision" : "Mixed (Float16, Int32, Palettized (8 bits), UInt8)",
+    "outputSchema" : [
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 1 × 3072)",
+        "shortDescription" : "",
+        "shape" : "[1, 1, 3072]",
+        "name" : "logits",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 2048 × 1 × 1)",
+        "shortDescription" : "",
+        "shape" : "[1, 2048, 1, 1]",
+        "name" : "hidden_states",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 28672 × 1 × 1)",
+        "shortDescription" : "",
+        "shape" : "[1, 28672, 1, 1]",
+        "name" : "key_cache_updates",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 28672 × 1 × 1)",
+        "shortDescription" : "",
+        "shape" : "[1, 28672, 1, 1]",
+        "name" : "value_cache_updates",
+        "type" : "MultiArray"
+      }
+    ],
+    "modelParameters" : [
+
+    ],
+    "specificationVersion" : 9,
+    "mlProgramOperationTypeHistogram" : {
+      "Ios18.expandDims" : 8,
+      "Ios18.softmax" : 28,
+      "Ios18.mul" : 675,
+      "Ios18.matmul" : 56,
+      "Ios18.rsqrt" : 113,
+      "Ios16.reduceMean" : 113,
+      "Split" : 2,
+      "Ios18.greaterEqual" : 2,
+      "Select" : 2,
+      "Ios18.readState" : 2,
+      "Ios18.gather" : 2,
+      "Ios18.add" : 311,
+      "Ios18.reshape" : 224,
+      "Ios18.constexprLutToDense" : 199,
+      "Ios18.conv" : 197,
+      "Ios18.concat" : 114,
+      "Ios18.cast" : 5,
+      "Ios18.sub" : 1,
+      "Ios18.silu" : 28,
+      "Ios18.transpose" : 1,
+      "Ios18.sliceByIndex" : 560,
+      "Ios18.squeeze" : 1
+    },
+    "computePrecision" : "Mixed (Float16, Float32, Int16, Int32, UInt16)",
+    "isUpdatable" : "0",
+    "stateSchema" : [
+      {
+        "dataType" : "Float16",
+        "isOptional" : "0",
+        "formattedType" : "State (Float16 1 × 28672 × 1 × 256)",
+        "shortDescription" : "",
+        "shape" : "[1, 28672, 1, 256]",
+        "name" : "self_attn_key_cache",
+        "type" : "State"
+      },
+      {
+        "dataType" : "Float16",
+        "isOptional" : "0",
+        "formattedType" : "State (Float16 1 × 28672 × 1 × 256)",
+        "shortDescription" : "",
+        "shape" : "[1, 28672, 1, 256]",
+        "name" : "self_attn_value_cache",
+        "type" : "State"
+      }
+    ],
+    "availability" : {
+      "macOS" : "15.0",
+      "tvOS" : "18.0",
+      "visionOS" : "2.0",
+      "watchOS" : "11.0",
+      "iOS" : "18.0",
+      "macCatalyst" : "18.0"
+    },
+    "modelType" : {
+      "name" : "MLModelType_mlProgram"
+    },
+    "userDefinedMetadata" : {
+      "com.github.apple.coremltools.conversion_date" : "2026-02-17",
+      "com.github.apple.coremltools.source" : "torch==2.8.0",
+      "com.github.apple.coremltools.version" : "9.0",
+      "com.github.apple.coremltools.source_dialect" : "TorchScript"
+    },
+    "inputSchema" : [
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 2048 × 1 × 1)",
+        "shortDescription" : "",
+        "shape" : "[1, 2048, 1, 1]",
+        "name" : "input_embeds",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Int32",
+        "formattedType" : "MultiArray (Int32 1)",
+        "shortDescription" : "",
+        "shape" : "[1]",
+        "name" : "cache_length",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 256)",
+        "shortDescription" : "",
+        "shape" : "[1, 256]",
+        "name" : "kv_cache_update_mask",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 256)",
+        "shortDescription" : "",
+        "shape" : "[1, 256]",
+        "name" : "key_padding_mask",
+        "type" : "MultiArray"
+      }
+    ],
+    "generatedClassName" : "CodeDecoderWithStatefulSelfAttention_8_bit",
+    "method" : "predict"
+  }
+]
\ No newline at end of file
diff --git a/qwen3_tts/code_decoder/12hz-1.7b-customvoice/W8A16-stateful/CodeDecoder.mlmodelc/model.mil b/qwen3_tts/code_decoder/12hz-1.7b-customvoice/W8A16-stateful/CodeDecoder.mlmodelc/model.mil
new file mode 100644
index 0000000000000000000000000000000000000000..baa942b415d2505ab98d30b88c03159a4175a222
--- /dev/null
+++ b/qwen3_tts/code_decoder/12hz-1.7b-customvoice/W8A16-stateful/CodeDecoder.mlmodelc/model.mil
@@ -0,0 +1,6532 @@
+program(1.3)
+[buildInfo = dict<string, string>({{"coremlc-component-MIL", "3510.2.1"}, {"coremlc-version", "3500.32.1"}})]
+{
+    func main<ios18>(tensor<int32, [1]> cache_length, tensor<fp16, [1, 2048, 1, 1]> input_embeds, tensor<fp16, [1, 256]> key_padding_mask, tensor<fp16, [1, 256]> kv_cache_update_mask, state<tensor<fp16, [1, 28672, 1, 256]>> self_attn_key_cache, state<tensor<fp16, [1, 28672, 1, 256]>> self_attn_value_cache) {
+            int32 pos_cos_batch_dims_0 = const()[name = string("pos_cos_batch_dims_0"), val = int32(0)];
+            bool pos_cos_validate_indices_0 = const()[name = string("pos_cos_validate_indices_0"), val = bool(false)];
+            tensor<fp16, [256, 128]> position_embeddings_cos_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [256, 128]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(64))), lut = tensor<fp16, [1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(32896))))[name = string("position_embeddings_cos_weight_to_fp16_palettized")];
+            string cache_length_to_int16_dtype_0 = const()[name = string("cache_length_to_int16_dtype_0"), val = string("int16")];
+            string cast_572_dtype_0 = const()[name = string("cast_572_dtype_0"), val = string("int32")];
+            int32 greater_equal_0_y_0 = const()[name = string("greater_equal_0_y_0"), val = int32(0)];
+            tensor<int16, [1]> cache_length_to_int16 = cast(dtype = cache_length_to_int16_dtype_0, x = cache_length)[name = string("cast_5")];
+            tensor<int32, [1]> cast_572 = cast(dtype = cast_572_dtype_0, x = cache_length_to_int16)[name = string("cast_4")];
+            tensor<bool, [1]> greater_equal_0 = greater_equal(x = cast_572, y = greater_equal_0_y_0)[name = string("greater_equal_0")];
+            int32 slice_by_index_0 = const()[name = string("slice_by_index_0"), val = int32(256)];
+            tensor<int32, [1]> add_0 = add(x = cast_572, y = slice_by_index_0)[name = string("add_0")];
+            tensor<int32, [1]> select_0 = select(a = cast_572, b = add_0, cond = greater_equal_0)[name = string("select_0")];
+            string select_0_to_int16_dtype_0 = const()[name = string("select_0_to_int16_dtype_0"), val = string("int16")];
+            string cast_0_dtype_0 = const()[name = string("cast_0_dtype_0"), val = string("int32")];
+            int32 greater_equal_0_y_0_1 = const()[name = string("greater_equal_0_y_0_1"), val = int32(0)];
+            tensor<int16, [1]> select_0_to_int16 = cast(dtype = select_0_to_int16_dtype_0, x = select_0)[name = string("cast_3")];
+            tensor<int32, [1]> cast_0 = cast(dtype = cast_0_dtype_0, x = select_0_to_int16)[name = string("cast_2")];
+            tensor<bool, [1]> greater_equal_0_1 = greater_equal(x = cast_0, y = greater_equal_0_y_0_1)[name = string("greater_equal_0_1")];
+            int32 slice_by_index_0_1 = const()[name = string("slice_by_index_0_1"), val = int32(256)];
+            tensor<int32, [1]> add_0_1 = add(x = cast_0, y = slice_by_index_0_1)[name = string("add_0_1")];
+            tensor<int32, [1]> select_0_1 = select(a = cast_0, b = add_0_1, cond = greater_equal_0_1)[name = string("select_0_1")];
+            int32 pos_cos_cast_fp16_cast_uint16_cast_uint16_axis_0 = const()[name = string("pos_cos_cast_fp16_cast_uint16_cast_uint16_axis_0"), val = int32(0)];
+            tensor<fp16, [1, 128]> pos_cos_cast_fp16_cast_uint16_cast_uint16 = gather(axis = pos_cos_cast_fp16_cast_uint16_cast_uint16_axis_0, batch_dims = pos_cos_batch_dims_0, indices = select_0_1, validate_indices = pos_cos_validate_indices_0, x = position_embeddings_cos_weight_to_fp16_palettized)[name = string("pos_cos_cast_fp16_cast_uint16_cast_uint16")];
+            tensor<int32, [1]> obj_7_axes_0 = const()[name = string("obj_7_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 128, 1]> obj_7_cast_fp16 = expand_dims(axes = obj_7_axes_0, x = pos_cos_cast_fp16_cast_uint16_cast_uint16)[name = string("obj_7_cast_fp16")];
+            int32 pos_sin_axis_0 = const()[name = string("pos_sin_axis_0"), val = int32(0)];
+            int32 pos_sin_batch_dims_0 = const()[name = string("pos_sin_batch_dims_0"), val = int32(0)];
+            bool pos_sin_validate_indices_0 = const()[name = string("pos_sin_validate_indices_0"), val = bool(false)];
+            tensor<fp16, [256, 128]> position_embeddings_sin_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [256, 128]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(33472))), lut = tensor<fp16, [1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(66304))))[name = string("position_embeddings_sin_weight_to_fp16_palettized")];
+            string cache_length_to_uint16_dtype_0 = const()[name = string("cache_length_to_uint16_dtype_0"), val = string("uint16")];
+            tensor<uint16, [1]> cache_length_to_uint16 = cast(dtype = cache_length_to_uint16_dtype_0, x = cache_length)[name = string("cast_1")];
+            tensor<fp16, [1, 128]> pos_sin_cast_fp16_cast_uint16 = gather(axis = pos_sin_axis_0, batch_dims = pos_sin_batch_dims_0, indices = cache_length_to_uint16, validate_indices = pos_sin_validate_indices_0, x = position_embeddings_sin_weight_to_fp16_palettized)[name = string("pos_sin_cast_fp16_cast_uint16")];
+            tensor<int32, [1]> obj_9_axes_0 = const()[name = string("obj_9_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 128, 1]> obj_9_cast_fp16 = expand_dims(axes = obj_9_axes_0, x = pos_sin_cast_fp16_cast_uint16)[name = string("obj_9_cast_fp16")];
+            tensor<fp16, [1, 28672, 1, 256]> read_state_0 = read_state(input = self_attn_key_cache)[name = string("read_state_0")];
+            tensor<int32, [28]> tile_0 = const()[name = string("tile_0"), val = tensor<int32, [28]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(66880)))];
+            int32 var_101_axis_0 = const()[name = string("op_101_axis_0"), val = int32(1)];
+            tensor<fp16, [1, 1024, 1, 256]> var_101_cast_fp16_0, tensor<fp16, [1, 1024, 1, 256]> var_101_cast_fp16_1, tensor<fp16, [1, 1024, 1, 256]> var_101_cast_fp16_2, tensor<fp16, [1, 1024, 1, 256]> var_101_cast_fp16_3, tensor<fp16, [1, 1024, 1, 256]> var_101_cast_fp16_4, tensor<fp16, [1, 1024, 1, 256]> var_101_cast_fp16_5, tensor<fp16, [1, 1024, 1, 256]> var_101_cast_fp16_6, tensor<fp16, [1, 1024, 1, 256]> var_101_cast_fp16_7, tensor<fp16, [1, 1024, 1, 256]> var_101_cast_fp16_8, tensor<fp16, [1, 1024, 1, 256]> var_101_cast_fp16_9, tensor<fp16, [1, 1024, 1, 256]> var_101_cast_fp16_10, tensor<fp16, [1, 1024, 1, 256]> var_101_cast_fp16_11, tensor<fp16, [1, 1024, 1, 256]> var_101_cast_fp16_12, tensor<fp16, [1, 1024, 1, 256]> var_101_cast_fp16_13, tensor<fp16, [1, 1024, 1, 256]> var_101_cast_fp16_14, tensor<fp16, [1, 1024, 1, 256]> var_101_cast_fp16_15, tensor<fp16, [1, 1024, 1, 256]> var_101_cast_fp16_16, tensor<fp16, [1, 1024, 1, 256]> var_101_cast_fp16_17, tensor<fp16, [1, 1024, 1, 256]> var_101_cast_fp16_18, tensor<fp16, [1, 1024, 1, 256]> var_101_cast_fp16_19, tensor<fp16, [1, 1024, 1, 256]> var_101_cast_fp16_20, tensor<fp16, [1, 1024, 1, 256]> var_101_cast_fp16_21, tensor<fp16, [1, 1024, 1, 256]> var_101_cast_fp16_22, tensor<fp16, [1, 1024, 1, 256]> var_101_cast_fp16_23, tensor<fp16, [1, 1024, 1, 256]> var_101_cast_fp16_24, tensor<fp16, [1, 1024, 1, 256]> var_101_cast_fp16_25, tensor<fp16, [1, 1024, 1, 256]> var_101_cast_fp16_26, tensor<fp16, [1, 1024, 1, 256]> var_101_cast_fp16_27 = split(axis = var_101_axis_0, split_sizes = tile_0, x = read_state_0)[name = string("op_101_cast_fp16")];
+            tensor<fp16, [1, 28672, 1, 256]> read_state_1 = read_state(input = self_attn_value_cache)[name = string("read_state_1")];
+            tensor<int32, [28]> tile_1 = const()[name = string("tile_1"), val = tensor<int32, [28]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(67072)))];
+            int32 var_132_axis_0 = const()[name = string("op_132_axis_0"), val = int32(1)];
+            tensor<fp16, [1, 1024, 1, 256]> var_132_cast_fp16_0, tensor<fp16, [1, 1024, 1, 256]> var_132_cast_fp16_1, tensor<fp16, [1, 1024, 1, 256]> var_132_cast_fp16_2, tensor<fp16, [1, 1024, 1, 256]> var_132_cast_fp16_3, tensor<fp16, [1, 1024, 1, 256]> var_132_cast_fp16_4, tensor<fp16, [1, 1024, 1, 256]> var_132_cast_fp16_5, tensor<fp16, [1, 1024, 1, 256]> var_132_cast_fp16_6, tensor<fp16, [1, 1024, 1, 256]> var_132_cast_fp16_7, tensor<fp16, [1, 1024, 1, 256]> var_132_cast_fp16_8, tensor<fp16, [1, 1024, 1, 256]> var_132_cast_fp16_9, tensor<fp16, [1, 1024, 1, 256]> var_132_cast_fp16_10, tensor<fp16, [1, 1024, 1, 256]> var_132_cast_fp16_11, tensor<fp16, [1, 1024, 1, 256]> var_132_cast_fp16_12, tensor<fp16, [1, 1024, 1, 256]> var_132_cast_fp16_13, tensor<fp16, [1, 1024, 1, 256]> var_132_cast_fp16_14, tensor<fp16, [1, 1024, 1, 256]> var_132_cast_fp16_15, tensor<fp16, [1, 1024, 1, 256]> var_132_cast_fp16_16, tensor<fp16, [1, 1024, 1, 256]> var_132_cast_fp16_17, tensor<fp16, [1, 1024, 1, 256]> var_132_cast_fp16_18, tensor<fp16, [1, 1024, 1, 256]> var_132_cast_fp16_19, tensor<fp16, [1, 1024, 1, 256]> var_132_cast_fp16_20, tensor<fp16, [1, 1024, 1, 256]> var_132_cast_fp16_21, tensor<fp16, [1, 1024, 1, 256]> var_132_cast_fp16_22, tensor<fp16, [1, 1024, 1, 256]> var_132_cast_fp16_23, tensor<fp16, [1, 1024, 1, 256]> var_132_cast_fp16_24, tensor<fp16, [1, 1024, 1, 256]> var_132_cast_fp16_25, tensor<fp16, [1, 1024, 1, 256]> var_132_cast_fp16_26, tensor<fp16, [1, 1024, 1, 256]> var_132_cast_fp16_27 = split(axis = var_132_axis_0, split_sizes = tile_1, x = read_state_1)[name = string("op_132_cast_fp16")];
+            int32 var_162 = const()[name = string("op_162"), val = int32(3)];
+            int32 var_172 = const()[name = string("op_172"), val = int32(-2)];
+            int32 var_180 = const()[name = string("op_180"), val = int32(1)];
+            tensor<fp16, [1, 2048, 1, 1]> inputs_sq_1_cast_fp16 = mul(x = input_embeds, y = input_embeds)[name = string("inputs_sq_1_cast_fp16")];
+            tensor<int32, [1]> variance_1_axes_0 = const()[name = string("variance_1_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_1_keep_dims_0 = const()[name = string("variance_1_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 1, 1, 1]> variance_1_cast_fp16 = reduce_mean(axes = variance_1_axes_0, keep_dims = variance_1_keep_dims_0, x = inputs_sq_1_cast_fp16)[name = string("variance_1_cast_fp16")];
+            fp16 var_192_to_fp16 = const()[name = string("op_192_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 1, 1]> var_193_cast_fp16 = add(x = variance_1_cast_fp16, y = var_192_to_fp16)[name = string("op_193_cast_fp16")];
+            fp32 var_194_epsilon_0 = const()[name = string("op_194_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 1, 1, 1]> var_194_cast_fp16 = rsqrt(epsilon = var_194_epsilon_0, x = var_193_cast_fp16)[name = string("op_194_cast_fp16")];
+            tensor<fp16, [1, 2048, 1, 1]> hidden_states_1_cast_fp16 = mul(x = input_embeds, y = var_194_cast_fp16)[name = string("hidden_states_1_cast_fp16")];
+            tensor<fp16, [1, 2048, 1, 1]> w_1_to_fp16 = const()[name = string("w_1_to_fp16"), val = tensor<fp16, [1, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(67264)))];
+            tensor<fp16, [1, 2048, 1, 1]> obj_1_cast_fp16 = mul(x = w_1_to_fp16, y = hidden_states_1_cast_fp16)[name = string("obj_1_cast_fp16")];
+            string query_1_pad_type_0 = const()[name = string("query_1_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> query_1_strides_0 = const()[name = string("query_1_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> query_1_pad_0 = const()[name = string("query_1_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> query_1_dilations_0 = const()[name = string("query_1_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 query_1_groups_0 = const()[name = string("query_1_groups_0"), val = int32(1)];
+            tensor<fp16, [2048, 2048, 1, 1]> layers_0_self_attn_q_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [2048, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(71424))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(4265792))))[name = string("layers_0_self_attn_q_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [2048]> layers_0_self_attn_q_proj_bias_to_fp16 = const()[name = string("layers_0_self_attn_q_proj_bias_to_fp16"), val = tensor<fp16, [2048]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(4266368)))];
+            tensor<fp16, [1, 2048, 1, 1]> query_1_cast_fp16 = conv(bias = layers_0_self_attn_q_proj_bias_to_fp16, dilations = query_1_dilations_0, groups = query_1_groups_0, pad = query_1_pad_0, pad_type = query_1_pad_type_0, strides = query_1_strides_0, weight = layers_0_self_attn_q_proj_weight_to_fp16_palettized, x = obj_1_cast_fp16)[name = string("query_1_cast_fp16")];
+            string current_key_1_pad_type_0 = const()[name = string("current_key_1_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> current_key_1_strides_0 = const()[name = string("current_key_1_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> current_key_1_pad_0 = const()[name = string("current_key_1_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> current_key_1_dilations_0 = const()[name = string("current_key_1_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 current_key_1_groups_0 = const()[name = string("current_key_1_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 2048, 1, 1]> layers_0_self_attn_k_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(4270528))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(6367744))))[name = string("layers_0_self_attn_k_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> current_key_1_cast_fp16 = conv(dilations = current_key_1_dilations_0, groups = current_key_1_groups_0, pad = current_key_1_pad_0, pad_type = current_key_1_pad_type_0, strides = current_key_1_strides_0, weight = layers_0_self_attn_k_proj_weight_to_fp16_palettized, x = obj_1_cast_fp16)[name = string("current_key_1_cast_fp16")];
+            string current_value_1_pad_type_0 = const()[name = string("current_value_1_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> current_value_1_strides_0 = const()[name = string("current_value_1_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> current_value_1_pad_0 = const()[name = string("current_value_1_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> current_value_1_dilations_0 = const()[name = string("current_value_1_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 current_value_1_groups_0 = const()[name = string("current_value_1_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 2048, 1, 1]> layers_0_self_attn_v_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(6368320))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(8465536))))[name = string("layers_0_self_attn_v_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1024]> layers_0_self_attn_v_proj_bias_to_fp16 = const()[name = string("layers_0_self_attn_v_proj_bias_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(8466112)))];
+            tensor<fp16, [1, 1024, 1, 1]> current_value_1_cast_fp16 = conv(bias = layers_0_self_attn_v_proj_bias_to_fp16, dilations = current_value_1_dilations_0, groups = current_value_1_groups_0, pad = current_value_1_pad_0, pad_type = current_value_1_pad_type_0, strides = current_value_1_strides_0, weight = layers_0_self_attn_v_proj_weight_to_fp16_palettized, x = obj_1_cast_fp16)[name = string("current_value_1_cast_fp16")];
+            tensor<int32, [4]> var_231 = const()[name = string("op_231"), val = tensor<int32, [4]>([16, 128, 1, 1])];
+            tensor<fp16, [16, 128, 1, 1]> inputs_1_cast_fp16 = reshape(shape = var_231, x = query_1_cast_fp16)[name = string("inputs_1_cast_fp16")];
+            tensor<fp16, [16, 128, 1, 1]> inputs_sq_3_cast_fp16 = mul(x = inputs_1_cast_fp16, y = inputs_1_cast_fp16)[name = string("inputs_sq_3_cast_fp16")];
+            tensor<int32, [1]> variance_3_axes_0 = const()[name = string("variance_3_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_3_keep_dims_0 = const()[name = string("variance_3_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [16, 1, 1, 1]> variance_3_cast_fp16 = reduce_mean(axes = variance_3_axes_0, keep_dims = variance_3_keep_dims_0, x = inputs_sq_3_cast_fp16)[name = string("variance_3_cast_fp16")];
+            fp16 var_237_to_fp16 = const()[name = string("op_237_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [16, 1, 1, 1]> var_238_cast_fp16 = add(x = variance_3_cast_fp16, y = var_237_to_fp16)[name = string("op_238_cast_fp16")];
+            fp32 var_239_epsilon_0 = const()[name = string("op_239_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [16, 1, 1, 1]> var_239_cast_fp16 = rsqrt(epsilon = var_239_epsilon_0, x = var_238_cast_fp16)[name = string("op_239_cast_fp16")];
+            tensor<fp16, [16, 128, 1, 1]> hidden_states_3_cast_fp16 = mul(x = inputs_1_cast_fp16, y = var_239_cast_fp16)[name = string("hidden_states_3_cast_fp16")];
+            tensor<fp16, [1, 128, 1, 1]> w_3_to_fp16 = const()[name = string("w_3_to_fp16"), val = tensor<fp16, [1, 128, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(8468224)))];
+            tensor<fp16, [16, 128, 1, 1]> query_normed_1_cast_fp16 = mul(x = w_3_to_fp16, y = hidden_states_3_cast_fp16)[name = string("query_normed_1_cast_fp16")];
+            tensor<int32, [4]> var_247 = const()[name = string("op_247"), val = tensor<int32, [4]>([8, 128, 1, 1])];
+            tensor<fp16, [8, 128, 1, 1]> inputs_3_cast_fp16 = reshape(shape = var_247, x = current_key_1_cast_fp16)[name = string("inputs_3_cast_fp16")];
+            tensor<fp16, [8, 128, 1, 1]> inputs_sq_5_cast_fp16 = mul(x = inputs_3_cast_fp16, y = inputs_3_cast_fp16)[name = string("inputs_sq_5_cast_fp16")];
+            tensor<int32, [1]> variance_5_axes_0 = const()[name = string("variance_5_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_5_keep_dims_0 = const()[name = string("variance_5_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [8, 1, 1, 1]> variance_5_cast_fp16 = reduce_mean(axes = variance_5_axes_0, keep_dims = variance_5_keep_dims_0, x = inputs_sq_5_cast_fp16)[name = string("variance_5_cast_fp16")];
+            fp16 var_253_to_fp16 = const()[name = string("op_253_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [8, 1, 1, 1]> var_254_cast_fp16 = add(x = variance_5_cast_fp16, y = var_253_to_fp16)[name = string("op_254_cast_fp16")];
+            fp32 var_255_epsilon_0 = const()[name = string("op_255_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [8, 1, 1, 1]> var_255_cast_fp16 = rsqrt(epsilon = var_255_epsilon_0, x = var_254_cast_fp16)[name = string("op_255_cast_fp16")];
+            tensor<fp16, [8, 128, 1, 1]> hidden_states_5_cast_fp16 = mul(x = inputs_3_cast_fp16, y = var_255_cast_fp16)[name = string("hidden_states_5_cast_fp16")];
+            tensor<fp16, [1, 128, 1, 1]> w_5_to_fp16 = const()[name = string("w_5_to_fp16"), val = tensor<fp16, [1, 128, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(8468544)))];
+            tensor<fp16, [8, 128, 1, 1]> current_key_normed_1_cast_fp16 = mul(x = w_5_to_fp16, y = hidden_states_5_cast_fp16)[name = string("current_key_normed_1_cast_fp16")];
+            tensor<int32, [4]> var_273 = const()[name = string("op_273"), val = tensor<int32, [4]>([1, 16, 128, -1])];
+            tensor<fp16, [1, 16, 128, 1]> mh_q_1_cast_fp16 = reshape(shape = var_273, x = query_normed_1_cast_fp16)[name = string("mh_q_1_cast_fp16")];
+            tensor<int32, [4]> var_275 = const()[name = string("op_275"), val = tensor<int32, [4]>([1, 8, 128, -1])];
+            tensor<fp16, [1, 8, 128, 1]> mh_k_1_cast_fp16 = reshape(shape = var_275, x = current_key_normed_1_cast_fp16)[name = string("mh_k_1_cast_fp16")];
+            tensor<int32, [1]> cos_1_axes_0 = const()[name = string("cos_1_axes_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [1, 1, 128, 1]> cos_1_cast_fp16 = expand_dims(axes = cos_1_axes_0, x = obj_7_cast_fp16)[name = string("cos_1_cast_fp16")];
+            tensor<int32, [1]> sin_1_axes_0 = const()[name = string("sin_1_axes_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [1, 1, 128, 1]> sin_1_cast_fp16 = expand_dims(axes = sin_1_axes_0, x = obj_9_cast_fp16)[name = string("sin_1_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> var_279_cast_fp16 = mul(x = mh_q_1_cast_fp16, y = cos_1_cast_fp16)[name = string("op_279_cast_fp16")];
+            tensor<int32, [4]> var_284_begin_0 = const()[name = string("op_284_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_284_end_0 = const()[name = string("op_284_end_0"), val = tensor<int32, [4]>([1, 16, 64, 1])];
+            tensor<bool, [4]> var_284_end_mask_0 = const()[name = string("op_284_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 16, 64, 1]> var_284_cast_fp16 = slice_by_index(begin = var_284_begin_0, end = var_284_end_0, end_mask = var_284_end_mask_0, x = mh_q_1_cast_fp16)[name = string("op_284_cast_fp16")];
+            tensor<int32, [4]> var_290_begin_0 = const()[name = string("op_290_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_290_end_0 = const()[name = string("op_290_end_0"), val = tensor<int32, [4]>([1, 16, 128, 1])];
+            tensor<bool, [4]> var_290_end_mask_0 = const()[name = string("op_290_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 16, 64, 1]> var_290_cast_fp16 = slice_by_index(begin = var_290_begin_0, end = var_290_end_0, end_mask = var_290_end_mask_0, x = mh_q_1_cast_fp16)[name = string("op_290_cast_fp16")];
+            fp16 const_17_promoted_to_fp16 = const()[name = string("const_17_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 16, 64, 1]> var_292_cast_fp16 = mul(x = var_290_cast_fp16, y = const_17_promoted_to_fp16)[name = string("op_292_cast_fp16")];
+            bool var_294_interleave_0 = const()[name = string("op_294_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 1]> var_294_cast_fp16 = concat(axis = var_172, interleave = var_294_interleave_0, values = (var_292_cast_fp16, var_284_cast_fp16))[name = string("op_294_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> var_295_cast_fp16 = mul(x = var_294_cast_fp16, y = sin_1_cast_fp16)[name = string("op_295_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> mh_q_3_cast_fp16 = add(x = var_279_cast_fp16, y = var_295_cast_fp16)[name = string("mh_q_3_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> var_297_cast_fp16 = mul(x = mh_k_1_cast_fp16, y = cos_1_cast_fp16)[name = string("op_297_cast_fp16")];
+            tensor<int32, [4]> var_302_begin_0 = const()[name = string("op_302_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_302_end_0 = const()[name = string("op_302_end_0"), val = tensor<int32, [4]>([1, 8, 64, 1])];
+            tensor<bool, [4]> var_302_end_mask_0 = const()[name = string("op_302_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 8, 64, 1]> var_302_cast_fp16 = slice_by_index(begin = var_302_begin_0, end = var_302_end_0, end_mask = var_302_end_mask_0, x = mh_k_1_cast_fp16)[name = string("op_302_cast_fp16")];
+            tensor<int32, [4]> var_308_begin_0 = const()[name = string("op_308_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_308_end_0 = const()[name = string("op_308_end_0"), val = tensor<int32, [4]>([1, 8, 128, 1])];
+            tensor<bool, [4]> var_308_end_mask_0 = const()[name = string("op_308_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 8, 64, 1]> var_308_cast_fp16 = slice_by_index(begin = var_308_begin_0, end = var_308_end_0, end_mask = var_308_end_mask_0, x = mh_k_1_cast_fp16)[name = string("op_308_cast_fp16")];
+            fp16 const_20_promoted_to_fp16 = const()[name = string("const_20_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 64, 1]> var_310_cast_fp16 = mul(x = var_308_cast_fp16, y = const_20_promoted_to_fp16)[name = string("op_310_cast_fp16")];
+            bool var_312_interleave_0 = const()[name = string("op_312_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 128, 1]> var_312_cast_fp16 = concat(axis = var_172, interleave = var_312_interleave_0, values = (var_310_cast_fp16, var_302_cast_fp16))[name = string("op_312_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> var_313_cast_fp16 = mul(x = var_312_cast_fp16, y = sin_1_cast_fp16)[name = string("op_313_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> mh_k_3_cast_fp16 = add(x = var_297_cast_fp16, y = var_313_cast_fp16)[name = string("mh_k_3_cast_fp16")];
+            tensor<int32, [4]> var_317 = const()[name = string("op_317"), val = tensor<int32, [4]>([1, 1024, 1, 1])];
+            tensor<fp16, [1, 1024, 1, 1]> current_key_3_cast_fp16 = reshape(shape = var_317, x = mh_k_3_cast_fp16)[name = string("current_key_3_cast_fp16")];
+            tensor<int32, [1]> var_320_axes_0 = const()[name = string("op_320_axes_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [1, 1, 256]> var_320_cast_fp16 = expand_dims(axes = var_320_axes_0, x = kv_cache_update_mask)[name = string("op_320_cast_fp16")];
+            tensor<int32, [1]> var_321_axes_0 = const()[name = string("op_321_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 1, 1, 256]> var_321_cast_fp16 = expand_dims(axes = var_321_axes_0, x = var_320_cast_fp16)[name = string("op_321_cast_fp16")];
+            fp16 var_173_to_fp16 = const()[name = string("op_173_to_fp16"), val = fp16(0x1p+0)];
+            tensor<fp16, [1, 1, 1, 256]> var_323_cast_fp16 = sub(x = var_173_to_fp16, y = var_321_cast_fp16)[name = string("op_323_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_324_cast_fp16 = mul(x = var_101_cast_fp16_0, y = var_323_cast_fp16)[name = string("op_324_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_325_cast_fp16 = mul(x = current_key_3_cast_fp16, y = var_321_cast_fp16)[name = string("op_325_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> key_3_cast_fp16 = add(x = var_324_cast_fp16, y = var_325_cast_fp16)[name = string("key_3_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_328_cast_fp16 = mul(x = var_132_cast_fp16_0, y = var_323_cast_fp16)[name = string("op_328_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_329_cast_fp16 = mul(x = current_value_1_cast_fp16, y = var_321_cast_fp16)[name = string("op_329_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> value_1_cast_fp16 = add(x = var_328_cast_fp16, y = var_329_cast_fp16)[name = string("value_1_cast_fp16")];
+            tensor<int32, [4]> var_333 = const()[name = string("op_333"), val = tensor<int32, [4]>([1, 8, 128, 256])];
+            tensor<fp16, [1, 8, 128, 256]> key_heads_1_cast_fp16 = reshape(shape = var_333, x = key_3_cast_fp16)[name = string("key_heads_1_cast_fp16")];
+            tensor<int32, [4]> var_335 = const()[name = string("op_335"), val = tensor<int32, [4]>([1, 8, 128, 256])];
+            tensor<fp16, [1, 8, 128, 256]> value_heads_1_cast_fp16 = reshape(shape = var_335, x = value_1_cast_fp16)[name = string("value_heads_1_cast_fp16")];
+            tensor<int32, [4]> var_338_begin_0 = const()[name = string("op_338_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_338_end_0 = const()[name = string("op_338_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_338_end_mask_0 = const()[name = string("op_338_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_338_cast_fp16 = slice_by_index(begin = var_338_begin_0, end = var_338_end_0, end_mask = var_338_end_mask_0, x = key_heads_1_cast_fp16)[name = string("op_338_cast_fp16")];
+            tensor<int32, [4]> var_342_begin_0 = const()[name = string("op_342_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_342_end_0 = const()[name = string("op_342_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_342_end_mask_0 = const()[name = string("op_342_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_342_cast_fp16 = slice_by_index(begin = var_342_begin_0, end = var_342_end_0, end_mask = var_342_end_mask_0, x = value_heads_1_cast_fp16)[name = string("op_342_cast_fp16")];
+            tensor<int32, [4]> var_354_begin_0 = const()[name = string("op_354_begin_0"), val = tensor<int32, [4]>([0, 1, 0, 0])];
+            tensor<int32, [4]> var_354_end_0 = const()[name = string("op_354_end_0"), val = tensor<int32, [4]>([1, 2, 128, 256])];
+            tensor<bool, [4]> var_354_end_mask_0 = const()[name = string("op_354_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_354_cast_fp16 = slice_by_index(begin = var_354_begin_0, end = var_354_end_0, end_mask = var_354_end_mask_0, x = key_heads_1_cast_fp16)[name = string("op_354_cast_fp16")];
+            tensor<int32, [4]> var_358_begin_0 = const()[name = string("op_358_begin_0"), val = tensor<int32, [4]>([0, 1, 0, 0])];
+            tensor<int32, [4]> var_358_end_0 = const()[name = string("op_358_end_0"), val = tensor<int32, [4]>([1, 2, 128, 256])];
+            tensor<bool, [4]> var_358_end_mask_0 = const()[name = string("op_358_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_358_cast_fp16 = slice_by_index(begin = var_358_begin_0, end = var_358_end_0, end_mask = var_358_end_mask_0, x = value_heads_1_cast_fp16)[name = string("op_358_cast_fp16")];
+            tensor<int32, [4]> var_370_begin_0 = const()[name = string("op_370_begin_0"), val = tensor<int32, [4]>([0, 2, 0, 0])];
+            tensor<int32, [4]> var_370_end_0 = const()[name = string("op_370_end_0"), val = tensor<int32, [4]>([1, 3, 128, 256])];
+            tensor<bool, [4]> var_370_end_mask_0 = const()[name = string("op_370_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_370_cast_fp16 = slice_by_index(begin = var_370_begin_0, end = var_370_end_0, end_mask = var_370_end_mask_0, x = key_heads_1_cast_fp16)[name = string("op_370_cast_fp16")];
+            tensor<int32, [4]> var_374_begin_0 = const()[name = string("op_374_begin_0"), val = tensor<int32, [4]>([0, 2, 0, 0])];
+            tensor<int32, [4]> var_374_end_0 = const()[name = string("op_374_end_0"), val = tensor<int32, [4]>([1, 3, 128, 256])];
+            tensor<bool, [4]> var_374_end_mask_0 = const()[name = string("op_374_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_374_cast_fp16 = slice_by_index(begin = var_374_begin_0, end = var_374_end_0, end_mask = var_374_end_mask_0, x = value_heads_1_cast_fp16)[name = string("op_374_cast_fp16")];
+            tensor<int32, [4]> var_386_begin_0 = const()[name = string("op_386_begin_0"), val = tensor<int32, [4]>([0, 3, 0, 0])];
+            tensor<int32, [4]> var_386_end_0 = const()[name = string("op_386_end_0"), val = tensor<int32, [4]>([1, 4, 128, 256])];
+            tensor<bool, [4]> var_386_end_mask_0 = const()[name = string("op_386_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_386_cast_fp16 = slice_by_index(begin = var_386_begin_0, end = var_386_end_0, end_mask = var_386_end_mask_0, x = key_heads_1_cast_fp16)[name = string("op_386_cast_fp16")];
+            tensor<int32, [4]> var_390_begin_0 = const()[name = string("op_390_begin_0"), val = tensor<int32, [4]>([0, 3, 0, 0])];
+            tensor<int32, [4]> var_390_end_0 = const()[name = string("op_390_end_0"), val = tensor<int32, [4]>([1, 4, 128, 256])];
+            tensor<bool, [4]> var_390_end_mask_0 = const()[name = string("op_390_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_390_cast_fp16 = slice_by_index(begin = var_390_begin_0, end = var_390_end_0, end_mask = var_390_end_mask_0, x = value_heads_1_cast_fp16)[name = string("op_390_cast_fp16")];
+            tensor<int32, [4]> var_402_begin_0 = const()[name = string("op_402_begin_0"), val = tensor<int32, [4]>([0, 4, 0, 0])];
+            tensor<int32, [4]> var_402_end_0 = const()[name = string("op_402_end_0"), val = tensor<int32, [4]>([1, 5, 128, 256])];
+            tensor<bool, [4]> var_402_end_mask_0 = const()[name = string("op_402_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_402_cast_fp16 = slice_by_index(begin = var_402_begin_0, end = var_402_end_0, end_mask = var_402_end_mask_0, x = key_heads_1_cast_fp16)[name = string("op_402_cast_fp16")];
+            tensor<int32, [4]> var_406_begin_0 = const()[name = string("op_406_begin_0"), val = tensor<int32, [4]>([0, 4, 0, 0])];
+            tensor<int32, [4]> var_406_end_0 = const()[name = string("op_406_end_0"), val = tensor<int32, [4]>([1, 5, 128, 256])];
+            tensor<bool, [4]> var_406_end_mask_0 = const()[name = string("op_406_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_406_cast_fp16 = slice_by_index(begin = var_406_begin_0, end = var_406_end_0, end_mask = var_406_end_mask_0, x = value_heads_1_cast_fp16)[name = string("op_406_cast_fp16")];
+            tensor<int32, [4]> var_418_begin_0 = const()[name = string("op_418_begin_0"), val = tensor<int32, [4]>([0, 5, 0, 0])];
+            tensor<int32, [4]> var_418_end_0 = const()[name = string("op_418_end_0"), val = tensor<int32, [4]>([1, 6, 128, 256])];
+            tensor<bool, [4]> var_418_end_mask_0 = const()[name = string("op_418_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_418_cast_fp16 = slice_by_index(begin = var_418_begin_0, end = var_418_end_0, end_mask = var_418_end_mask_0, x = key_heads_1_cast_fp16)[name = string("op_418_cast_fp16")];
+            tensor<int32, [4]> var_422_begin_0 = const()[name = string("op_422_begin_0"), val = tensor<int32, [4]>([0, 5, 0, 0])];
+            tensor<int32, [4]> var_422_end_0 = const()[name = string("op_422_end_0"), val = tensor<int32, [4]>([1, 6, 128, 256])];
+            tensor<bool, [4]> var_422_end_mask_0 = const()[name = string("op_422_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_422_cast_fp16 = slice_by_index(begin = var_422_begin_0, end = var_422_end_0, end_mask = var_422_end_mask_0, x = value_heads_1_cast_fp16)[name = string("op_422_cast_fp16")];
+            tensor<int32, [4]> var_434_begin_0 = const()[name = string("op_434_begin_0"), val = tensor<int32, [4]>([0, 6, 0, 0])];
+            tensor<int32, [4]> var_434_end_0 = const()[name = string("op_434_end_0"), val = tensor<int32, [4]>([1, 7, 128, 256])];
+            tensor<bool, [4]> var_434_end_mask_0 = const()[name = string("op_434_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_434_cast_fp16 = slice_by_index(begin = var_434_begin_0, end = var_434_end_0, end_mask = var_434_end_mask_0, x = key_heads_1_cast_fp16)[name = string("op_434_cast_fp16")];
+            tensor<int32, [4]> var_438_begin_0 = const()[name = string("op_438_begin_0"), val = tensor<int32, [4]>([0, 6, 0, 0])];
+            tensor<int32, [4]> var_438_end_0 = const()[name = string("op_438_end_0"), val = tensor<int32, [4]>([1, 7, 128, 256])];
+            tensor<bool, [4]> var_438_end_mask_0 = const()[name = string("op_438_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_438_cast_fp16 = slice_by_index(begin = var_438_begin_0, end = var_438_end_0, end_mask = var_438_end_mask_0, x = value_heads_1_cast_fp16)[name = string("op_438_cast_fp16")];
+            tensor<int32, [4]> var_450_begin_0 = const()[name = string("op_450_begin_0"), val = tensor<int32, [4]>([0, 7, 0, 0])];
+            tensor<int32, [4]> var_450_end_0 = const()[name = string("op_450_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_450_end_mask_0 = const()[name = string("op_450_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_450_cast_fp16 = slice_by_index(begin = var_450_begin_0, end = var_450_end_0, end_mask = var_450_end_mask_0, x = key_heads_1_cast_fp16)[name = string("op_450_cast_fp16")];
+            tensor<int32, [4]> var_454_begin_0 = const()[name = string("op_454_begin_0"), val = tensor<int32, [4]>([0, 7, 0, 0])];
+            tensor<int32, [4]> var_454_end_0 = const()[name = string("op_454_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_454_end_mask_0 = const()[name = string("op_454_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_454_cast_fp16 = slice_by_index(begin = var_454_begin_0, end = var_454_end_0, end_mask = var_454_end_mask_0, x = value_heads_1_cast_fp16)[name = string("op_454_cast_fp16")];
+            bool key_heads_3_interleave_0 = const()[name = string("key_heads_3_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 256]> key_heads_3_cast_fp16 = concat(axis = var_180, interleave = key_heads_3_interleave_0, values = (var_338_cast_fp16, var_338_cast_fp16, var_354_cast_fp16, var_354_cast_fp16, var_370_cast_fp16, var_370_cast_fp16, var_386_cast_fp16, var_386_cast_fp16, var_402_cast_fp16, var_402_cast_fp16, var_418_cast_fp16, var_418_cast_fp16, var_434_cast_fp16, var_434_cast_fp16, var_450_cast_fp16, var_450_cast_fp16))[name = string("key_heads_3_cast_fp16")];
+            bool value_heads_3_interleave_0 = const()[name = string("value_heads_3_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 256]> value_heads_3_cast_fp16 = concat(axis = var_180, interleave = value_heads_3_interleave_0, values = (var_342_cast_fp16, var_342_cast_fp16, var_358_cast_fp16, var_358_cast_fp16, var_374_cast_fp16, var_374_cast_fp16, var_390_cast_fp16, var_390_cast_fp16, var_406_cast_fp16, var_406_cast_fp16, var_422_cast_fp16, var_422_cast_fp16, var_438_cast_fp16, var_438_cast_fp16, var_454_cast_fp16, var_454_cast_fp16))[name = string("value_heads_3_cast_fp16")];
+            fp16 var_477_to_fp16 = const()[name = string("op_477_to_fp16"), val = fp16(0x1.6ap-4)];
+            tensor<fp16, [1, 16, 128, 1]> var_478_cast_fp16 = mul(x = mh_q_3_cast_fp16, y = var_477_to_fp16)[name = string("op_478_cast_fp16")];
+            bool mh_w_1_transpose_x_0 = const()[name = string("mh_w_1_transpose_x_0"), val = bool(true)];
+            bool mh_w_1_transpose_y_0 = const()[name = string("mh_w_1_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 1, 256]> mh_w_1_cast_fp16 = matmul(transpose_x = mh_w_1_transpose_x_0, transpose_y = mh_w_1_transpose_y_0, x = var_478_cast_fp16, y = key_heads_3_cast_fp16)[name = string("mh_w_1_cast_fp16")];
+            tensor<int32, [1]> var_486_axes_0 = const()[name = string("op_486_axes_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [1, 1, 256]> var_486_cast_fp16 = expand_dims(axes = var_486_axes_0, x = key_padding_mask)[name = string("op_486_cast_fp16")];
+            tensor<int32, [1]> var_487_axes_0 = const()[name = string("op_487_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 1, 1, 256]> var_487_cast_fp16 = expand_dims(axes = var_487_axes_0, x = var_486_cast_fp16)[name = string("op_487_cast_fp16")];
+            tensor<fp16, [1, 16, 1, 256]> mh_w_3_cast_fp16 = add(x = mh_w_1_cast_fp16, y = var_487_cast_fp16)[name = string("mh_w_3_cast_fp16")];
+            tensor<fp16, [1, 16, 1, 256]> var_490_cast_fp16 = softmax(axis = var_162, x = mh_w_3_cast_fp16)[name = string("op_490_cast_fp16")];
+            bool attn_1_transpose_x_0 = const()[name = string("attn_1_transpose_x_0"), val = bool(false)];
+            bool attn_1_transpose_y_0 = const()[name = string("attn_1_transpose_y_0"), val = bool(true)];
+            tensor<fp16, [1, 16, 128, 1]> attn_1_cast_fp16 = matmul(transpose_x = attn_1_transpose_x_0, transpose_y = attn_1_transpose_y_0, x = value_heads_3_cast_fp16, y = var_490_cast_fp16)[name = string("attn_1_cast_fp16")];
+            tensor<int32, [4]> var_495 = const()[name = string("op_495"), val = tensor<int32, [4]>([1, -1, 1, 1])];
+            tensor<fp16, [1, 2048, 1, 1]> input_1_cast_fp16 = reshape(shape = var_495, x = attn_1_cast_fp16)[name = string("input_1_cast_fp16")];
+            string obj_11_pad_type_0 = const()[name = string("obj_11_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> obj_11_strides_0 = const()[name = string("obj_11_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> obj_11_pad_0 = const()[name = string("obj_11_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> obj_11_dilations_0 = const()[name = string("obj_11_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 obj_11_groups_0 = const()[name = string("obj_11_groups_0"), val = int32(1)];
+            tensor<fp16, [2048, 2048, 1, 1]> layers_0_self_attn_o_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [2048, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(8468864))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(12663232))))[name = string("layers_0_self_attn_o_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1, 1]> obj_11_cast_fp16 = conv(bias = layers_0_self_attn_q_proj_bias_to_fp16, dilations = obj_11_dilations_0, groups = obj_11_groups_0, pad = obj_11_pad_0, pad_type = obj_11_pad_type_0, strides = obj_11_strides_0, weight = layers_0_self_attn_o_proj_weight_to_fp16_palettized, x = input_1_cast_fp16)[name = string("obj_11_cast_fp16")];
+            tensor<fp16, [1, 2048, 1, 1]> inputs_5_cast_fp16 = add(x = input_embeds, y = obj_11_cast_fp16)[name = string("inputs_5_cast_fp16")];
+            tensor<fp16, [1, 2048, 1, 1]> inputs_sq_7_cast_fp16 = mul(x = inputs_5_cast_fp16, y = inputs_5_cast_fp16)[name = string("inputs_sq_7_cast_fp16")];
+            tensor<int32, [1]> variance_7_axes_0 = const()[name = string("variance_7_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_7_keep_dims_0 = const()[name = string("variance_7_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 1, 1, 1]> variance_7_cast_fp16 = reduce_mean(axes = variance_7_axes_0, keep_dims = variance_7_keep_dims_0, x = inputs_sq_7_cast_fp16)[name = string("variance_7_cast_fp16")];
+            fp16 var_513_to_fp16 = const()[name = string("op_513_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 1, 1]> var_514_cast_fp16 = add(x = variance_7_cast_fp16, y = var_513_to_fp16)[name = string("op_514_cast_fp16")];
+            fp32 var_515_epsilon_0 = const()[name = string("op_515_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 1, 1, 1]> var_515_cast_fp16 = rsqrt(epsilon = var_515_epsilon_0, x = var_514_cast_fp16)[name = string("op_515_cast_fp16")];
+            tensor<fp16, [1, 2048, 1, 1]> hidden_states_7_cast_fp16 = mul(x = inputs_5_cast_fp16, y = var_515_cast_fp16)[name = string("hidden_states_7_cast_fp16")];
+            tensor<fp16, [1, 2048, 1, 1]> w_7_to_fp16 = const()[name = string("w_7_to_fp16"), val = tensor<fp16, [1, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(12663808)))];
+            tensor<fp16, [1, 2048, 1, 1]> input_3_cast_fp16 = mul(x = w_7_to_fp16, y = hidden_states_7_cast_fp16)[name = string("input_3_cast_fp16")];
+            string input_5_pad_type_0 = const()[name = string("input_5_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> input_5_strides_0 = const()[name = string("input_5_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> input_5_pad_0 = const()[name = string("input_5_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> input_5_dilations_0 = const()[name = string("input_5_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 input_5_groups_0 = const()[name = string("input_5_groups_0"), val = int32(1)];
+            tensor<fp16, [6144, 2048, 1, 1]> layers_0_mlp_gate_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [6144, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(12667968))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(25250944))))[name = string("layers_0_mlp_gate_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 6144, 1, 1]> input_5_cast_fp16 = conv(dilations = input_5_dilations_0, groups = input_5_groups_0, pad = input_5_pad_0, pad_type = input_5_pad_type_0, strides = input_5_strides_0, weight = layers_0_mlp_gate_proj_weight_to_fp16_palettized, x = input_3_cast_fp16)[name = string("input_5_cast_fp16")];
+            tensor<fp16, [1, 6144, 1, 1]> var_529_cast_fp16 = silu(x = input_5_cast_fp16)[name = string("op_529_cast_fp16")];
+            string var_535_pad_type_0 = const()[name = string("op_535_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_535_strides_0 = const()[name = string("op_535_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_535_pad_0 = const()[name = string("op_535_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_535_dilations_0 = const()[name = string("op_535_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_535_groups_0 = const()[name = string("op_535_groups_0"), val = int32(1)];
+            tensor<fp16, [6144, 2048, 1, 1]> layers_0_mlp_up_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [6144, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(25251520))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(37834496))))[name = string("layers_0_mlp_up_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 6144, 1, 1]> var_535_cast_fp16 = conv(dilations = var_535_dilations_0, groups = var_535_groups_0, pad = var_535_pad_0, pad_type = var_535_pad_type_0, strides = var_535_strides_0, weight = layers_0_mlp_up_proj_weight_to_fp16_palettized, x = input_3_cast_fp16)[name = string("op_535_cast_fp16")];
+            tensor<fp16, [1, 6144, 1, 1]> input_7_cast_fp16 = mul(x = var_529_cast_fp16, y = var_535_cast_fp16)[name = string("input_7_cast_fp16")];
+            string hidden_states_9_pad_type_0 = const()[name = string("hidden_states_9_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> hidden_states_9_strides_0 = const()[name = string("hidden_states_9_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> hidden_states_9_pad_0 = const()[name = string("hidden_states_9_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> hidden_states_9_dilations_0 = const()[name = string("hidden_states_9_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 hidden_states_9_groups_0 = const()[name = string("hidden_states_9_groups_0"), val = int32(1)];
+            tensor<fp16, [2048, 6144, 1, 1]> layers_0_mlp_down_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [2048, 6144, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(37835072))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(50418048))))[name = string("layers_0_mlp_down_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1, 1]> hidden_states_9_cast_fp16 = conv(dilations = hidden_states_9_dilations_0, groups = hidden_states_9_groups_0, pad = hidden_states_9_pad_0, pad_type = hidden_states_9_pad_type_0, strides = hidden_states_9_strides_0, weight = layers_0_mlp_down_proj_weight_to_fp16_palettized, x = input_7_cast_fp16)[name = string("hidden_states_9_cast_fp16")];
+            tensor<fp16, [1, 2048, 1, 1]> inputs_7_cast_fp16 = add(x = inputs_5_cast_fp16, y = hidden_states_9_cast_fp16)[name = string("inputs_7_cast_fp16")];
+            int32 var_549 = const()[name = string("op_549"), val = int32(3)];
+            int32 var_559 = const()[name = string("op_559"), val = int32(-2)];
+            int32 var_567 = const()[name = string("op_567"), val = int32(1)];
+            tensor<fp16, [1, 2048, 1, 1]> inputs_sq_9_cast_fp16 = mul(x = inputs_7_cast_fp16, y = inputs_7_cast_fp16)[name = string("inputs_sq_9_cast_fp16")];
+            tensor<int32, [1]> variance_9_axes_0 = const()[name = string("variance_9_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_9_keep_dims_0 = const()[name = string("variance_9_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 1, 1, 1]> variance_9_cast_fp16 = reduce_mean(axes = variance_9_axes_0, keep_dims = variance_9_keep_dims_0, x = inputs_sq_9_cast_fp16)[name = string("variance_9_cast_fp16")];
+            fp16 var_579_to_fp16 = const()[name = string("op_579_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 1, 1]> var_580_cast_fp16 = add(x = variance_9_cast_fp16, y = var_579_to_fp16)[name = string("op_580_cast_fp16")];
+            fp32 var_581_epsilon_0 = const()[name = string("op_581_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 1, 1, 1]> var_581_cast_fp16 = rsqrt(epsilon = var_581_epsilon_0, x = var_580_cast_fp16)[name = string("op_581_cast_fp16")];
+            tensor<fp16, [1, 2048, 1, 1]> hidden_states_11_cast_fp16 = mul(x = inputs_7_cast_fp16, y = var_581_cast_fp16)[name = string("hidden_states_11_cast_fp16")];
+            tensor<fp16, [1, 2048, 1, 1]> w_9_to_fp16 = const()[name = string("w_9_to_fp16"), val = tensor<fp16, [1, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(50418624)))];
+            tensor<fp16, [1, 2048, 1, 1]> obj_13_cast_fp16 = mul(x = w_9_to_fp16, y = hidden_states_11_cast_fp16)[name = string("obj_13_cast_fp16")];
+            string query_7_pad_type_0 = const()[name = string("query_7_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> query_7_strides_0 = const()[name = string("query_7_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> query_7_pad_0 = const()[name = string("query_7_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> query_7_dilations_0 = const()[name = string("query_7_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 query_7_groups_0 = const()[name = string("query_7_groups_0"), val = int32(1)];
+            tensor<fp16, [2048, 2048, 1, 1]> layers_1_self_attn_q_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [2048, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(50422784))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(54617152))))[name = string("layers_1_self_attn_q_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1, 1]> query_7_cast_fp16 = conv(bias = layers_0_self_attn_q_proj_bias_to_fp16, dilations = query_7_dilations_0, groups = query_7_groups_0, pad = query_7_pad_0, pad_type = query_7_pad_type_0, strides = query_7_strides_0, weight = layers_1_self_attn_q_proj_weight_to_fp16_palettized, x = obj_13_cast_fp16)[name = string("query_7_cast_fp16")];
+            string current_key_5_pad_type_0 = const()[name = string("current_key_5_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> current_key_5_strides_0 = const()[name = string("current_key_5_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> current_key_5_pad_0 = const()[name = string("current_key_5_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> current_key_5_dilations_0 = const()[name = string("current_key_5_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 current_key_5_groups_0 = const()[name = string("current_key_5_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 2048, 1, 1]> layers_1_self_attn_k_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(54617728))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(56714944))))[name = string("layers_1_self_attn_k_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> current_key_5_cast_fp16 = conv(dilations = current_key_5_dilations_0, groups = current_key_5_groups_0, pad = current_key_5_pad_0, pad_type = current_key_5_pad_type_0, strides = current_key_5_strides_0, weight = layers_1_self_attn_k_proj_weight_to_fp16_palettized, x = obj_13_cast_fp16)[name = string("current_key_5_cast_fp16")];
+            string current_value_3_pad_type_0 = const()[name = string("current_value_3_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> current_value_3_strides_0 = const()[name = string("current_value_3_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> current_value_3_pad_0 = const()[name = string("current_value_3_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> current_value_3_dilations_0 = const()[name = string("current_value_3_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 current_value_3_groups_0 = const()[name = string("current_value_3_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 2048, 1, 1]> layers_1_self_attn_v_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(56715520))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(58812736))))[name = string("layers_1_self_attn_v_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> current_value_3_cast_fp16 = conv(bias = layers_0_self_attn_v_proj_bias_to_fp16, dilations = current_value_3_dilations_0, groups = current_value_3_groups_0, pad = current_value_3_pad_0, pad_type = current_value_3_pad_type_0, strides = current_value_3_strides_0, weight = layers_1_self_attn_v_proj_weight_to_fp16_palettized, x = obj_13_cast_fp16)[name = string("current_value_3_cast_fp16")];
+            tensor<int32, [4]> var_618 = const()[name = string("op_618"), val = tensor<int32, [4]>([16, 128, 1, 1])];
+            tensor<fp16, [16, 128, 1, 1]> inputs_9_cast_fp16 = reshape(shape = var_618, x = query_7_cast_fp16)[name = string("inputs_9_cast_fp16")];
+            tensor<fp16, [16, 128, 1, 1]> inputs_sq_11_cast_fp16 = mul(x = inputs_9_cast_fp16, y = inputs_9_cast_fp16)[name = string("inputs_sq_11_cast_fp16")];
+            tensor<int32, [1]> variance_11_axes_0 = const()[name = string("variance_11_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_11_keep_dims_0 = const()[name = string("variance_11_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [16, 1, 1, 1]> variance_11_cast_fp16 = reduce_mean(axes = variance_11_axes_0, keep_dims = variance_11_keep_dims_0, x = inputs_sq_11_cast_fp16)[name = string("variance_11_cast_fp16")];
+            fp16 var_624_to_fp16 = const()[name = string("op_624_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [16, 1, 1, 1]> var_625_cast_fp16 = add(x = variance_11_cast_fp16, y = var_624_to_fp16)[name = string("op_625_cast_fp16")];
+            fp32 var_626_epsilon_0 = const()[name = string("op_626_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [16, 1, 1, 1]> var_626_cast_fp16 = rsqrt(epsilon = var_626_epsilon_0, x = var_625_cast_fp16)[name = string("op_626_cast_fp16")];
+            tensor<fp16, [16, 128, 1, 1]> hidden_states_13_cast_fp16 = mul(x = inputs_9_cast_fp16, y = var_626_cast_fp16)[name = string("hidden_states_13_cast_fp16")];
+            tensor<fp16, [1, 128, 1, 1]> w_11_to_fp16 = const()[name = string("w_11_to_fp16"), val = tensor<fp16, [1, 128, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(58813312)))];
+            tensor<fp16, [16, 128, 1, 1]> query_normed_3_cast_fp16 = mul(x = w_11_to_fp16, y = hidden_states_13_cast_fp16)[name = string("query_normed_3_cast_fp16")];
+            tensor<int32, [4]> var_634 = const()[name = string("op_634"), val = tensor<int32, [4]>([8, 128, 1, 1])];
+            tensor<fp16, [8, 128, 1, 1]> inputs_11_cast_fp16 = reshape(shape = var_634, x = current_key_5_cast_fp16)[name = string("inputs_11_cast_fp16")];
+            tensor<fp16, [8, 128, 1, 1]> inputs_sq_13_cast_fp16 = mul(x = inputs_11_cast_fp16, y = inputs_11_cast_fp16)[name = string("inputs_sq_13_cast_fp16")];
+            tensor<int32, [1]> variance_13_axes_0 = const()[name = string("variance_13_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_13_keep_dims_0 = const()[name = string("variance_13_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [8, 1, 1, 1]> variance_13_cast_fp16 = reduce_mean(axes = variance_13_axes_0, keep_dims = variance_13_keep_dims_0, x = inputs_sq_13_cast_fp16)[name = string("variance_13_cast_fp16")];
+            fp16 var_640_to_fp16 = const()[name = string("op_640_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [8, 1, 1, 1]> var_641_cast_fp16 = add(x = variance_13_cast_fp16, y = var_640_to_fp16)[name = string("op_641_cast_fp16")];
+            fp32 var_642_epsilon_0 = const()[name = string("op_642_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [8, 1, 1, 1]> var_642_cast_fp16 = rsqrt(epsilon = var_642_epsilon_0, x = var_641_cast_fp16)[name = string("op_642_cast_fp16")];
+            tensor<fp16, [8, 128, 1, 1]> hidden_states_15_cast_fp16 = mul(x = inputs_11_cast_fp16, y = var_642_cast_fp16)[name = string("hidden_states_15_cast_fp16")];
+            tensor<fp16, [1, 128, 1, 1]> w_13_to_fp16 = const()[name = string("w_13_to_fp16"), val = tensor<fp16, [1, 128, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(58813632)))];
+            tensor<fp16, [8, 128, 1, 1]> current_key_normed_3_cast_fp16 = mul(x = w_13_to_fp16, y = hidden_states_15_cast_fp16)[name = string("current_key_normed_3_cast_fp16")];
+            tensor<int32, [4]> var_660 = const()[name = string("op_660"), val = tensor<int32, [4]>([1, 16, 128, -1])];
+            tensor<fp16, [1, 16, 128, 1]> mh_q_7_cast_fp16 = reshape(shape = var_660, x = query_normed_3_cast_fp16)[name = string("mh_q_7_cast_fp16")];
+            tensor<int32, [4]> var_662 = const()[name = string("op_662"), val = tensor<int32, [4]>([1, 8, 128, -1])];
+            tensor<fp16, [1, 8, 128, 1]> mh_k_5_cast_fp16 = reshape(shape = var_662, x = current_key_normed_3_cast_fp16)[name = string("mh_k_5_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> var_666_cast_fp16 = mul(x = mh_q_7_cast_fp16, y = cos_1_cast_fp16)[name = string("op_666_cast_fp16")];
+            tensor<int32, [4]> var_671_begin_0 = const()[name = string("op_671_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_671_end_0 = const()[name = string("op_671_end_0"), val = tensor<int32, [4]>([1, 16, 64, 1])];
+            tensor<bool, [4]> var_671_end_mask_0 = const()[name = string("op_671_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 16, 64, 1]> var_671_cast_fp16 = slice_by_index(begin = var_671_begin_0, end = var_671_end_0, end_mask = var_671_end_mask_0, x = mh_q_7_cast_fp16)[name = string("op_671_cast_fp16")];
+            tensor<int32, [4]> var_677_begin_0 = const()[name = string("op_677_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_677_end_0 = const()[name = string("op_677_end_0"), val = tensor<int32, [4]>([1, 16, 128, 1])];
+            tensor<bool, [4]> var_677_end_mask_0 = const()[name = string("op_677_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 16, 64, 1]> var_677_cast_fp16 = slice_by_index(begin = var_677_begin_0, end = var_677_end_0, end_mask = var_677_end_mask_0, x = mh_q_7_cast_fp16)[name = string("op_677_cast_fp16")];
+            fp16 const_40_promoted_to_fp16 = const()[name = string("const_40_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 16, 64, 1]> var_679_cast_fp16 = mul(x = var_677_cast_fp16, y = const_40_promoted_to_fp16)[name = string("op_679_cast_fp16")];
+            bool var_681_interleave_0 = const()[name = string("op_681_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 1]> var_681_cast_fp16 = concat(axis = var_559, interleave = var_681_interleave_0, values = (var_679_cast_fp16, var_671_cast_fp16))[name = string("op_681_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> var_682_cast_fp16 = mul(x = var_681_cast_fp16, y = sin_1_cast_fp16)[name = string("op_682_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> mh_q_9_cast_fp16 = add(x = var_666_cast_fp16, y = var_682_cast_fp16)[name = string("mh_q_9_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> var_684_cast_fp16 = mul(x = mh_k_5_cast_fp16, y = cos_1_cast_fp16)[name = string("op_684_cast_fp16")];
+            tensor<int32, [4]> var_689_begin_0 = const()[name = string("op_689_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_689_end_0 = const()[name = string("op_689_end_0"), val = tensor<int32, [4]>([1, 8, 64, 1])];
+            tensor<bool, [4]> var_689_end_mask_0 = const()[name = string("op_689_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 8, 64, 1]> var_689_cast_fp16 = slice_by_index(begin = var_689_begin_0, end = var_689_end_0, end_mask = var_689_end_mask_0, x = mh_k_5_cast_fp16)[name = string("op_689_cast_fp16")];
+            tensor<int32, [4]> var_695_begin_0 = const()[name = string("op_695_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_695_end_0 = const()[name = string("op_695_end_0"), val = tensor<int32, [4]>([1, 8, 128, 1])];
+            tensor<bool, [4]> var_695_end_mask_0 = const()[name = string("op_695_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 8, 64, 1]> var_695_cast_fp16 = slice_by_index(begin = var_695_begin_0, end = var_695_end_0, end_mask = var_695_end_mask_0, x = mh_k_5_cast_fp16)[name = string("op_695_cast_fp16")];
+            fp16 const_43_promoted_to_fp16 = const()[name = string("const_43_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 64, 1]> var_697_cast_fp16 = mul(x = var_695_cast_fp16, y = const_43_promoted_to_fp16)[name = string("op_697_cast_fp16")];
+            bool var_699_interleave_0 = const()[name = string("op_699_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 128, 1]> var_699_cast_fp16 = concat(axis = var_559, interleave = var_699_interleave_0, values = (var_697_cast_fp16, var_689_cast_fp16))[name = string("op_699_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> var_700_cast_fp16 = mul(x = var_699_cast_fp16, y = sin_1_cast_fp16)[name = string("op_700_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> mh_k_7_cast_fp16 = add(x = var_684_cast_fp16, y = var_700_cast_fp16)[name = string("mh_k_7_cast_fp16")];
+            tensor<int32, [4]> var_704 = const()[name = string("op_704"), val = tensor<int32, [4]>([1, 1024, 1, 1])];
+            tensor<fp16, [1, 1024, 1, 1]> current_key_7_cast_fp16 = reshape(shape = var_704, x = mh_k_7_cast_fp16)[name = string("current_key_7_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_711_cast_fp16 = mul(x = var_101_cast_fp16_1, y = var_323_cast_fp16)[name = string("op_711_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_712_cast_fp16 = mul(x = current_key_7_cast_fp16, y = var_321_cast_fp16)[name = string("op_712_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> key_9_cast_fp16 = add(x = var_711_cast_fp16, y = var_712_cast_fp16)[name = string("key_9_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_715_cast_fp16 = mul(x = var_132_cast_fp16_1, y = var_323_cast_fp16)[name = string("op_715_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_716_cast_fp16 = mul(x = current_value_3_cast_fp16, y = var_321_cast_fp16)[name = string("op_716_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> value_5_cast_fp16 = add(x = var_715_cast_fp16, y = var_716_cast_fp16)[name = string("value_5_cast_fp16")];
+            tensor<int32, [4]> var_720 = const()[name = string("op_720"), val = tensor<int32, [4]>([1, 8, 128, 256])];
+            tensor<fp16, [1, 8, 128, 256]> key_heads_5_cast_fp16 = reshape(shape = var_720, x = key_9_cast_fp16)[name = string("key_heads_5_cast_fp16")];
+            tensor<int32, [4]> var_722 = const()[name = string("op_722"), val = tensor<int32, [4]>([1, 8, 128, 256])];
+            tensor<fp16, [1, 8, 128, 256]> value_heads_5_cast_fp16 = reshape(shape = var_722, x = value_5_cast_fp16)[name = string("value_heads_5_cast_fp16")];
+            tensor<int32, [4]> var_725_begin_0 = const()[name = string("op_725_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_725_end_0 = const()[name = string("op_725_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_725_end_mask_0 = const()[name = string("op_725_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_725_cast_fp16 = slice_by_index(begin = var_725_begin_0, end = var_725_end_0, end_mask = var_725_end_mask_0, x = key_heads_5_cast_fp16)[name = string("op_725_cast_fp16")];
+            tensor<int32, [4]> var_729_begin_0 = const()[name = string("op_729_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_729_end_0 = const()[name = string("op_729_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_729_end_mask_0 = const()[name = string("op_729_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_729_cast_fp16 = slice_by_index(begin = var_729_begin_0, end = var_729_end_0, end_mask = var_729_end_mask_0, x = value_heads_5_cast_fp16)[name = string("op_729_cast_fp16")];
+            tensor<int32, [4]> var_741_begin_0 = const()[name = string("op_741_begin_0"), val = tensor<int32, [4]>([0, 1, 0, 0])];
+            tensor<int32, [4]> var_741_end_0 = const()[name = string("op_741_end_0"), val = tensor<int32, [4]>([1, 2, 128, 256])];
+            tensor<bool, [4]> var_741_end_mask_0 = const()[name = string("op_741_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_741_cast_fp16 = slice_by_index(begin = var_741_begin_0, end = var_741_end_0, end_mask = var_741_end_mask_0, x = key_heads_5_cast_fp16)[name = string("op_741_cast_fp16")];
+            tensor<int32, [4]> var_745_begin_0 = const()[name = string("op_745_begin_0"), val = tensor<int32, [4]>([0, 1, 0, 0])];
+            tensor<int32, [4]> var_745_end_0 = const()[name = string("op_745_end_0"), val = tensor<int32, [4]>([1, 2, 128, 256])];
+            tensor<bool, [4]> var_745_end_mask_0 = const()[name = string("op_745_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_745_cast_fp16 = slice_by_index(begin = var_745_begin_0, end = var_745_end_0, end_mask = var_745_end_mask_0, x = value_heads_5_cast_fp16)[name = string("op_745_cast_fp16")];
+            tensor<int32, [4]> var_757_begin_0 = const()[name = string("op_757_begin_0"), val = tensor<int32, [4]>([0, 2, 0, 0])];
+            tensor<int32, [4]> var_757_end_0 = const()[name = string("op_757_end_0"), val = tensor<int32, [4]>([1, 3, 128, 256])];
+            tensor<bool, [4]> var_757_end_mask_0 = const()[name = string("op_757_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_757_cast_fp16 = slice_by_index(begin = var_757_begin_0, end = var_757_end_0, end_mask = var_757_end_mask_0, x = key_heads_5_cast_fp16)[name = string("op_757_cast_fp16")];
+            tensor<int32, [4]> var_761_begin_0 = const()[name = string("op_761_begin_0"), val = tensor<int32, [4]>([0, 2, 0, 0])];
+            tensor<int32, [4]> var_761_end_0 = const()[name = string("op_761_end_0"), val = tensor<int32, [4]>([1, 3, 128, 256])];
+            tensor<bool, [4]> var_761_end_mask_0 = const()[name = string("op_761_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_761_cast_fp16 = slice_by_index(begin = var_761_begin_0, end = var_761_end_0, end_mask = var_761_end_mask_0, x = value_heads_5_cast_fp16)[name = string("op_761_cast_fp16")];
+            tensor<int32, [4]> var_773_begin_0 = const()[name = string("op_773_begin_0"), val = tensor<int32, [4]>([0, 3, 0, 0])];
+            tensor<int32, [4]> var_773_end_0 = const()[name = string("op_773_end_0"), val = tensor<int32, [4]>([1, 4, 128, 256])];
+            tensor<bool, [4]> var_773_end_mask_0 = const()[name = string("op_773_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_773_cast_fp16 = slice_by_index(begin = var_773_begin_0, end = var_773_end_0, end_mask = var_773_end_mask_0, x = key_heads_5_cast_fp16)[name = string("op_773_cast_fp16")];
+            tensor<int32, [4]> var_777_begin_0 = const()[name = string("op_777_begin_0"), val = tensor<int32, [4]>([0, 3, 0, 0])];
+            tensor<int32, [4]> var_777_end_0 = const()[name = string("op_777_end_0"), val = tensor<int32, [4]>([1, 4, 128, 256])];
+            tensor<bool, [4]> var_777_end_mask_0 = const()[name = string("op_777_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_777_cast_fp16 = slice_by_index(begin = var_777_begin_0, end = var_777_end_0, end_mask = var_777_end_mask_0, x = value_heads_5_cast_fp16)[name = string("op_777_cast_fp16")];
+            tensor<int32, [4]> var_789_begin_0 = const()[name = string("op_789_begin_0"), val = tensor<int32, [4]>([0, 4, 0, 0])];
+            tensor<int32, [4]> var_789_end_0 = const()[name = string("op_789_end_0"), val = tensor<int32, [4]>([1, 5, 128, 256])];
+            tensor<bool, [4]> var_789_end_mask_0 = const()[name = string("op_789_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_789_cast_fp16 = slice_by_index(begin = var_789_begin_0, end = var_789_end_0, end_mask = var_789_end_mask_0, x = key_heads_5_cast_fp16)[name = string("op_789_cast_fp16")];
+            tensor<int32, [4]> var_793_begin_0 = const()[name = string("op_793_begin_0"), val = tensor<int32, [4]>([0, 4, 0, 0])];
+            tensor<int32, [4]> var_793_end_0 = const()[name = string("op_793_end_0"), val = tensor<int32, [4]>([1, 5, 128, 256])];
+            tensor<bool, [4]> var_793_end_mask_0 = const()[name = string("op_793_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_793_cast_fp16 = slice_by_index(begin = var_793_begin_0, end = var_793_end_0, end_mask = var_793_end_mask_0, x = value_heads_5_cast_fp16)[name = string("op_793_cast_fp16")];
+            tensor<int32, [4]> var_805_begin_0 = const()[name = string("op_805_begin_0"), val = tensor<int32, [4]>([0, 5, 0, 0])];
+            tensor<int32, [4]> var_805_end_0 = const()[name = string("op_805_end_0"), val = tensor<int32, [4]>([1, 6, 128, 256])];
+            tensor<bool, [4]> var_805_end_mask_0 = const()[name = string("op_805_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_805_cast_fp16 = slice_by_index(begin = var_805_begin_0, end = var_805_end_0, end_mask = var_805_end_mask_0, x = key_heads_5_cast_fp16)[name = string("op_805_cast_fp16")];
+            tensor<int32, [4]> var_809_begin_0 = const()[name = string("op_809_begin_0"), val = tensor<int32, [4]>([0, 5, 0, 0])];
+            tensor<int32, [4]> var_809_end_0 = const()[name = string("op_809_end_0"), val = tensor<int32, [4]>([1, 6, 128, 256])];
+            tensor<bool, [4]> var_809_end_mask_0 = const()[name = string("op_809_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_809_cast_fp16 = slice_by_index(begin = var_809_begin_0, end = var_809_end_0, end_mask = var_809_end_mask_0, x = value_heads_5_cast_fp16)[name = string("op_809_cast_fp16")];
+            tensor<int32, [4]> var_821_begin_0 = const()[name = string("op_821_begin_0"), val = tensor<int32, [4]>([0, 6, 0, 0])];
+            tensor<int32, [4]> var_821_end_0 = const()[name = string("op_821_end_0"), val = tensor<int32, [4]>([1, 7, 128, 256])];
+            tensor<bool, [4]> var_821_end_mask_0 = const()[name = string("op_821_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_821_cast_fp16 = slice_by_index(begin = var_821_begin_0, end = var_821_end_0, end_mask = var_821_end_mask_0, x = key_heads_5_cast_fp16)[name = string("op_821_cast_fp16")];
+            tensor<int32, [4]> var_825_begin_0 = const()[name = string("op_825_begin_0"), val = tensor<int32, [4]>([0, 6, 0, 0])];
+            tensor<int32, [4]> var_825_end_0 = const()[name = string("op_825_end_0"), val = tensor<int32, [4]>([1, 7, 128, 256])];
+            tensor<bool, [4]> var_825_end_mask_0 = const()[name = string("op_825_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_825_cast_fp16 = slice_by_index(begin = var_825_begin_0, end = var_825_end_0, end_mask = var_825_end_mask_0, x = value_heads_5_cast_fp16)[name = string("op_825_cast_fp16")];
+            tensor<int32, [4]> var_837_begin_0 = const()[name = string("op_837_begin_0"), val = tensor<int32, [4]>([0, 7, 0, 0])];
+            tensor<int32, [4]> var_837_end_0 = const()[name = string("op_837_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_837_end_mask_0 = const()[name = string("op_837_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_837_cast_fp16 = slice_by_index(begin = var_837_begin_0, end = var_837_end_0, end_mask = var_837_end_mask_0, x = key_heads_5_cast_fp16)[name = string("op_837_cast_fp16")];
+            tensor<int32, [4]> var_841_begin_0 = const()[name = string("op_841_begin_0"), val = tensor<int32, [4]>([0, 7, 0, 0])];
+            tensor<int32, [4]> var_841_end_0 = const()[name = string("op_841_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_841_end_mask_0 = const()[name = string("op_841_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_841_cast_fp16 = slice_by_index(begin = var_841_begin_0, end = var_841_end_0, end_mask = var_841_end_mask_0, x = value_heads_5_cast_fp16)[name = string("op_841_cast_fp16")];
+            bool key_heads_7_interleave_0 = const()[name = string("key_heads_7_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 256]> key_heads_7_cast_fp16 = concat(axis = var_567, interleave = key_heads_7_interleave_0, values = (var_725_cast_fp16, var_725_cast_fp16, var_741_cast_fp16, var_741_cast_fp16, var_757_cast_fp16, var_757_cast_fp16, var_773_cast_fp16, var_773_cast_fp16, var_789_cast_fp16, var_789_cast_fp16, var_805_cast_fp16, var_805_cast_fp16, var_821_cast_fp16, var_821_cast_fp16, var_837_cast_fp16, var_837_cast_fp16))[name = string("key_heads_7_cast_fp16")];
+            bool value_heads_7_interleave_0 = const()[name = string("value_heads_7_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 256]> value_heads_7_cast_fp16 = concat(axis = var_567, interleave = value_heads_7_interleave_0, values = (var_729_cast_fp16, var_729_cast_fp16, var_745_cast_fp16, var_745_cast_fp16, var_761_cast_fp16, var_761_cast_fp16, var_777_cast_fp16, var_777_cast_fp16, var_793_cast_fp16, var_793_cast_fp16, var_809_cast_fp16, var_809_cast_fp16, var_825_cast_fp16, var_825_cast_fp16, var_841_cast_fp16, var_841_cast_fp16))[name = string("value_heads_7_cast_fp16")];
+            fp16 var_864_to_fp16 = const()[name = string("op_864_to_fp16"), val = fp16(0x1.6ap-4)];
+            tensor<fp16, [1, 16, 128, 1]> var_865_cast_fp16 = mul(x = mh_q_9_cast_fp16, y = var_864_to_fp16)[name = string("op_865_cast_fp16")];
+            bool mh_w_5_transpose_x_0 = const()[name = string("mh_w_5_transpose_x_0"), val = bool(true)];
+            bool mh_w_5_transpose_y_0 = const()[name = string("mh_w_5_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 1, 256]> mh_w_5_cast_fp16 = matmul(transpose_x = mh_w_5_transpose_x_0, transpose_y = mh_w_5_transpose_y_0, x = var_865_cast_fp16, y = key_heads_7_cast_fp16)[name = string("mh_w_5_cast_fp16")];
+            tensor<fp16, [1, 16, 1, 256]> mh_w_7_cast_fp16 = add(x = mh_w_5_cast_fp16, y = var_487_cast_fp16)[name = string("mh_w_7_cast_fp16")];
+            tensor<fp16, [1, 16, 1, 256]> var_877_cast_fp16 = softmax(axis = var_549, x = mh_w_7_cast_fp16)[name = string("op_877_cast_fp16")];
+            bool attn_3_transpose_x_0 = const()[name = string("attn_3_transpose_x_0"), val = bool(false)];
+            bool attn_3_transpose_y_0 = const()[name = string("attn_3_transpose_y_0"), val = bool(true)];
+            tensor<fp16, [1, 16, 128, 1]> attn_3_cast_fp16 = matmul(transpose_x = attn_3_transpose_x_0, transpose_y = attn_3_transpose_y_0, x = value_heads_7_cast_fp16, y = var_877_cast_fp16)[name = string("attn_3_cast_fp16")];
+            tensor<int32, [4]> var_882 = const()[name = string("op_882"), val = tensor<int32, [4]>([1, -1, 1, 1])];
+            tensor<fp16, [1, 2048, 1, 1]> input_9_cast_fp16 = reshape(shape = var_882, x = attn_3_cast_fp16)[name = string("input_9_cast_fp16")];
+            string obj_19_pad_type_0 = const()[name = string("obj_19_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> obj_19_strides_0 = const()[name = string("obj_19_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> obj_19_pad_0 = const()[name = string("obj_19_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> obj_19_dilations_0 = const()[name = string("obj_19_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 obj_19_groups_0 = const()[name = string("obj_19_groups_0"), val = int32(1)];
+            tensor<fp16, [2048, 2048, 1, 1]> layers_1_self_attn_o_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [2048, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(58813952))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(63008320))))[name = string("layers_1_self_attn_o_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1, 1]> obj_19_cast_fp16 = conv(bias = layers_0_self_attn_q_proj_bias_to_fp16, dilations = obj_19_dilations_0, groups = obj_19_groups_0, pad = obj_19_pad_0, pad_type = obj_19_pad_type_0, strides = obj_19_strides_0, weight = layers_1_self_attn_o_proj_weight_to_fp16_palettized, x = input_9_cast_fp16)[name = string("obj_19_cast_fp16")];
+            tensor<fp16, [1, 2048, 1, 1]> inputs_13_cast_fp16 = add(x = inputs_7_cast_fp16, y = obj_19_cast_fp16)[name = string("inputs_13_cast_fp16")];
+            tensor<fp16, [1, 2048, 1, 1]> inputs_sq_15_cast_fp16 = mul(x = inputs_13_cast_fp16, y = inputs_13_cast_fp16)[name = string("inputs_sq_15_cast_fp16")];
+            tensor<int32, [1]> variance_15_axes_0 = const()[name = string("variance_15_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_15_keep_dims_0 = const()[name = string("variance_15_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 1, 1, 1]> variance_15_cast_fp16 = reduce_mean(axes = variance_15_axes_0, keep_dims = variance_15_keep_dims_0, x = inputs_sq_15_cast_fp16)[name = string("variance_15_cast_fp16")];
+            fp16 var_900_to_fp16 = const()[name = string("op_900_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 1, 1]> var_901_cast_fp16 = add(x = variance_15_cast_fp16, y = var_900_to_fp16)[name = string("op_901_cast_fp16")];
+            fp32 var_902_epsilon_0 = const()[name = string("op_902_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 1, 1, 1]> var_902_cast_fp16 = rsqrt(epsilon = var_902_epsilon_0, x = var_901_cast_fp16)[name = string("op_902_cast_fp16")];
+            tensor<fp16, [1, 2048, 1, 1]> hidden_states_17_cast_fp16 = mul(x = inputs_13_cast_fp16, y = var_902_cast_fp16)[name = string("hidden_states_17_cast_fp16")];
+            tensor<fp16, [1, 2048, 1, 1]> w_15_to_fp16 = const()[name = string("w_15_to_fp16"), val = tensor<fp16, [1, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(63008896)))];
+            tensor<fp16, [1, 2048, 1, 1]> input_11_cast_fp16 = mul(x = w_15_to_fp16, y = hidden_states_17_cast_fp16)[name = string("input_11_cast_fp16")];
+            string input_13_pad_type_0 = const()[name = string("input_13_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> input_13_strides_0 = const()[name = string("input_13_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> input_13_pad_0 = const()[name = string("input_13_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> input_13_dilations_0 = const()[name = string("input_13_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 input_13_groups_0 = const()[name = string("input_13_groups_0"), val = int32(1)];
+            tensor<fp16, [6144, 2048, 1, 1]> layers_1_mlp_gate_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [6144, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(63013056))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(75596032))))[name = string("layers_1_mlp_gate_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 6144, 1, 1]> input_13_cast_fp16 = conv(dilations = input_13_dilations_0, groups = input_13_groups_0, pad = input_13_pad_0, pad_type = input_13_pad_type_0, strides = input_13_strides_0, weight = layers_1_mlp_gate_proj_weight_to_fp16_palettized, x = input_11_cast_fp16)[name = string("input_13_cast_fp16")];
+            tensor<fp16, [1, 6144, 1, 1]> var_916_cast_fp16 = silu(x = input_13_cast_fp16)[name = string("op_916_cast_fp16")];
+            string var_922_pad_type_0 = const()[name = string("op_922_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_922_strides_0 = const()[name = string("op_922_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_922_pad_0 = const()[name = string("op_922_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_922_dilations_0 = const()[name = string("op_922_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_922_groups_0 = const()[name = string("op_922_groups_0"), val = int32(1)];
+            tensor<fp16, [6144, 2048, 1, 1]> layers_1_mlp_up_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [6144, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(75596608))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(88179584))))[name = string("layers_1_mlp_up_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 6144, 1, 1]> var_922_cast_fp16 = conv(dilations = var_922_dilations_0, groups = var_922_groups_0, pad = var_922_pad_0, pad_type = var_922_pad_type_0, strides = var_922_strides_0, weight = layers_1_mlp_up_proj_weight_to_fp16_palettized, x = input_11_cast_fp16)[name = string("op_922_cast_fp16")];
+            tensor<fp16, [1, 6144, 1, 1]> input_15_cast_fp16 = mul(x = var_916_cast_fp16, y = var_922_cast_fp16)[name = string("input_15_cast_fp16")];
+            string hidden_states_19_pad_type_0 = const()[name = string("hidden_states_19_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> hidden_states_19_strides_0 = const()[name = string("hidden_states_19_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> hidden_states_19_pad_0 = const()[name = string("hidden_states_19_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> hidden_states_19_dilations_0 = const()[name = string("hidden_states_19_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 hidden_states_19_groups_0 = const()[name = string("hidden_states_19_groups_0"), val = int32(1)];
+            tensor<fp16, [2048, 6144, 1, 1]> layers_1_mlp_down_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [2048, 6144, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(88180160))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(100763136))))[name = string("layers_1_mlp_down_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1, 1]> hidden_states_19_cast_fp16 = conv(dilations = hidden_states_19_dilations_0, groups = hidden_states_19_groups_0, pad = hidden_states_19_pad_0, pad_type = hidden_states_19_pad_type_0, strides = hidden_states_19_strides_0, weight = layers_1_mlp_down_proj_weight_to_fp16_palettized, x = input_15_cast_fp16)[name = string("hidden_states_19_cast_fp16")];
+            tensor<fp16, [1, 2048, 1, 1]> inputs_15_cast_fp16 = add(x = inputs_13_cast_fp16, y = hidden_states_19_cast_fp16)[name = string("inputs_15_cast_fp16")];
+            int32 var_936 = const()[name = string("op_936"), val = int32(3)];
+            int32 var_946 = const()[name = string("op_946"), val = int32(-2)];
+            int32 var_954 = const()[name = string("op_954"), val = int32(1)];
+            tensor<fp16, [1, 2048, 1, 1]> inputs_sq_17_cast_fp16 = mul(x = inputs_15_cast_fp16, y = inputs_15_cast_fp16)[name = string("inputs_sq_17_cast_fp16")];
+            tensor<int32, [1]> variance_17_axes_0 = const()[name = string("variance_17_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_17_keep_dims_0 = const()[name = string("variance_17_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 1, 1, 1]> variance_17_cast_fp16 = reduce_mean(axes = variance_17_axes_0, keep_dims = variance_17_keep_dims_0, x = inputs_sq_17_cast_fp16)[name = string("variance_17_cast_fp16")];
+            fp16 var_966_to_fp16 = const()[name = string("op_966_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 1, 1]> var_967_cast_fp16 = add(x = variance_17_cast_fp16, y = var_966_to_fp16)[name = string("op_967_cast_fp16")];
+            fp32 var_968_epsilon_0 = const()[name = string("op_968_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 1, 1, 1]> var_968_cast_fp16 = rsqrt(epsilon = var_968_epsilon_0, x = var_967_cast_fp16)[name = string("op_968_cast_fp16")];
+            tensor<fp16, [1, 2048, 1, 1]> hidden_states_21_cast_fp16 = mul(x = inputs_15_cast_fp16, y = var_968_cast_fp16)[name = string("hidden_states_21_cast_fp16")];
+            tensor<fp16, [1, 2048, 1, 1]> w_17_to_fp16 = const()[name = string("w_17_to_fp16"), val = tensor<fp16, [1, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(100763712)))];
+            tensor<fp16, [1, 2048, 1, 1]> obj_21_cast_fp16 = mul(x = w_17_to_fp16, y = hidden_states_21_cast_fp16)[name = string("obj_21_cast_fp16")];
+            string query_13_pad_type_0 = const()[name = string("query_13_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> query_13_strides_0 = const()[name = string("query_13_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> query_13_pad_0 = const()[name = string("query_13_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> query_13_dilations_0 = const()[name = string("query_13_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 query_13_groups_0 = const()[name = string("query_13_groups_0"), val = int32(1)];
+            tensor<fp16, [2048, 2048, 1, 1]> layers_2_self_attn_q_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [2048, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(100767872))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(104962240))))[name = string("layers_2_self_attn_q_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1, 1]> query_13_cast_fp16 = conv(bias = layers_0_self_attn_q_proj_bias_to_fp16, dilations = query_13_dilations_0, groups = query_13_groups_0, pad = query_13_pad_0, pad_type = query_13_pad_type_0, strides = query_13_strides_0, weight = layers_2_self_attn_q_proj_weight_to_fp16_palettized, x = obj_21_cast_fp16)[name = string("query_13_cast_fp16")];
+            string current_key_9_pad_type_0 = const()[name = string("current_key_9_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> current_key_9_strides_0 = const()[name = string("current_key_9_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> current_key_9_pad_0 = const()[name = string("current_key_9_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> current_key_9_dilations_0 = const()[name = string("current_key_9_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 current_key_9_groups_0 = const()[name = string("current_key_9_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 2048, 1, 1]> layers_2_self_attn_k_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(104962816))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(107060032))))[name = string("layers_2_self_attn_k_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> current_key_9_cast_fp16 = conv(dilations = current_key_9_dilations_0, groups = current_key_9_groups_0, pad = current_key_9_pad_0, pad_type = current_key_9_pad_type_0, strides = current_key_9_strides_0, weight = layers_2_self_attn_k_proj_weight_to_fp16_palettized, x = obj_21_cast_fp16)[name = string("current_key_9_cast_fp16")];
+            string current_value_5_pad_type_0 = const()[name = string("current_value_5_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> current_value_5_strides_0 = const()[name = string("current_value_5_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> current_value_5_pad_0 = const()[name = string("current_value_5_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> current_value_5_dilations_0 = const()[name = string("current_value_5_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 current_value_5_groups_0 = const()[name = string("current_value_5_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 2048, 1, 1]> layers_2_self_attn_v_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(107060608))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(109157824))))[name = string("layers_2_self_attn_v_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> current_value_5_cast_fp16 = conv(bias = layers_0_self_attn_v_proj_bias_to_fp16, dilations = current_value_5_dilations_0, groups = current_value_5_groups_0, pad = current_value_5_pad_0, pad_type = current_value_5_pad_type_0, strides = current_value_5_strides_0, weight = layers_2_self_attn_v_proj_weight_to_fp16_palettized, x = obj_21_cast_fp16)[name = string("current_value_5_cast_fp16")];
+            tensor<int32, [4]> var_1005 = const()[name = string("op_1005"), val = tensor<int32, [4]>([16, 128, 1, 1])];
+            tensor<fp16, [16, 128, 1, 1]> inputs_17_cast_fp16 = reshape(shape = var_1005, x = query_13_cast_fp16)[name = string("inputs_17_cast_fp16")];
+            tensor<fp16, [16, 128, 1, 1]> inputs_sq_19_cast_fp16 = mul(x = inputs_17_cast_fp16, y = inputs_17_cast_fp16)[name = string("inputs_sq_19_cast_fp16")];
+            tensor<int32, [1]> variance_19_axes_0 = const()[name = string("variance_19_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_19_keep_dims_0 = const()[name = string("variance_19_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [16, 1, 1, 1]> variance_19_cast_fp16 = reduce_mean(axes = variance_19_axes_0, keep_dims = variance_19_keep_dims_0, x = inputs_sq_19_cast_fp16)[name = string("variance_19_cast_fp16")];
+            fp16 var_1011_to_fp16 = const()[name = string("op_1011_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [16, 1, 1, 1]> var_1012_cast_fp16 = add(x = variance_19_cast_fp16, y = var_1011_to_fp16)[name = string("op_1012_cast_fp16")];
+            fp32 var_1013_epsilon_0 = const()[name = string("op_1013_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [16, 1, 1, 1]> var_1013_cast_fp16 = rsqrt(epsilon = var_1013_epsilon_0, x = var_1012_cast_fp16)[name = string("op_1013_cast_fp16")];
+            tensor<fp16, [16, 128, 1, 1]> hidden_states_23_cast_fp16 = mul(x = inputs_17_cast_fp16, y = var_1013_cast_fp16)[name = string("hidden_states_23_cast_fp16")];
+            tensor<fp16, [1, 128, 1, 1]> w_19_to_fp16 = const()[name = string("w_19_to_fp16"), val = tensor<fp16, [1, 128, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(109158400)))];
+            tensor<fp16, [16, 128, 1, 1]> query_normed_5_cast_fp16 = mul(x = w_19_to_fp16, y = hidden_states_23_cast_fp16)[name = string("query_normed_5_cast_fp16")];
+            tensor<int32, [4]> var_1021 = const()[name = string("op_1021"), val = tensor<int32, [4]>([8, 128, 1, 1])];
+            tensor<fp16, [8, 128, 1, 1]> inputs_19_cast_fp16 = reshape(shape = var_1021, x = current_key_9_cast_fp16)[name = string("inputs_19_cast_fp16")];
+            tensor<fp16, [8, 128, 1, 1]> inputs_sq_21_cast_fp16 = mul(x = inputs_19_cast_fp16, y = inputs_19_cast_fp16)[name = string("inputs_sq_21_cast_fp16")];
+            tensor<int32, [1]> variance_21_axes_0 = const()[name = string("variance_21_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_21_keep_dims_0 = const()[name = string("variance_21_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [8, 1, 1, 1]> variance_21_cast_fp16 = reduce_mean(axes = variance_21_axes_0, keep_dims = variance_21_keep_dims_0, x = inputs_sq_21_cast_fp16)[name = string("variance_21_cast_fp16")];
+            fp16 var_1027_to_fp16 = const()[name = string("op_1027_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [8, 1, 1, 1]> var_1028_cast_fp16 = add(x = variance_21_cast_fp16, y = var_1027_to_fp16)[name = string("op_1028_cast_fp16")];
+            fp32 var_1029_epsilon_0 = const()[name = string("op_1029_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [8, 1, 1, 1]> var_1029_cast_fp16 = rsqrt(epsilon = var_1029_epsilon_0, x = var_1028_cast_fp16)[name = string("op_1029_cast_fp16")];
+            tensor<fp16, [8, 128, 1, 1]> hidden_states_25_cast_fp16 = mul(x = inputs_19_cast_fp16, y = var_1029_cast_fp16)[name = string("hidden_states_25_cast_fp16")];
+            tensor<fp16, [1, 128, 1, 1]> w_21_to_fp16 = const()[name = string("w_21_to_fp16"), val = tensor<fp16, [1, 128, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(109158720)))];
+            tensor<fp16, [8, 128, 1, 1]> current_key_normed_5_cast_fp16 = mul(x = w_21_to_fp16, y = hidden_states_25_cast_fp16)[name = string("current_key_normed_5_cast_fp16")];
+            tensor<int32, [4]> var_1047 = const()[name = string("op_1047"), val = tensor<int32, [4]>([1, 16, 128, -1])];
+            tensor<fp16, [1, 16, 128, 1]> mh_q_13_cast_fp16 = reshape(shape = var_1047, x = query_normed_5_cast_fp16)[name = string("mh_q_13_cast_fp16")];
+            tensor<int32, [4]> var_1049 = const()[name = string("op_1049"), val = tensor<int32, [4]>([1, 8, 128, -1])];
+            tensor<fp16, [1, 8, 128, 1]> mh_k_9_cast_fp16 = reshape(shape = var_1049, x = current_key_normed_5_cast_fp16)[name = string("mh_k_9_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> var_1053_cast_fp16 = mul(x = mh_q_13_cast_fp16, y = cos_1_cast_fp16)[name = string("op_1053_cast_fp16")];
+            tensor<int32, [4]> var_1058_begin_0 = const()[name = string("op_1058_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_1058_end_0 = const()[name = string("op_1058_end_0"), val = tensor<int32, [4]>([1, 16, 64, 1])];
+            tensor<bool, [4]> var_1058_end_mask_0 = const()[name = string("op_1058_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 16, 64, 1]> var_1058_cast_fp16 = slice_by_index(begin = var_1058_begin_0, end = var_1058_end_0, end_mask = var_1058_end_mask_0, x = mh_q_13_cast_fp16)[name = string("op_1058_cast_fp16")];
+            tensor<int32, [4]> var_1064_begin_0 = const()[name = string("op_1064_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_1064_end_0 = const()[name = string("op_1064_end_0"), val = tensor<int32, [4]>([1, 16, 128, 1])];
+            tensor<bool, [4]> var_1064_end_mask_0 = const()[name = string("op_1064_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 16, 64, 1]> var_1064_cast_fp16 = slice_by_index(begin = var_1064_begin_0, end = var_1064_end_0, end_mask = var_1064_end_mask_0, x = mh_q_13_cast_fp16)[name = string("op_1064_cast_fp16")];
+            fp16 const_63_promoted_to_fp16 = const()[name = string("const_63_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 16, 64, 1]> var_1066_cast_fp16 = mul(x = var_1064_cast_fp16, y = const_63_promoted_to_fp16)[name = string("op_1066_cast_fp16")];
+            bool var_1068_interleave_0 = const()[name = string("op_1068_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 1]> var_1068_cast_fp16 = concat(axis = var_946, interleave = var_1068_interleave_0, values = (var_1066_cast_fp16, var_1058_cast_fp16))[name = string("op_1068_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> var_1069_cast_fp16 = mul(x = var_1068_cast_fp16, y = sin_1_cast_fp16)[name = string("op_1069_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> mh_q_15_cast_fp16 = add(x = var_1053_cast_fp16, y = var_1069_cast_fp16)[name = string("mh_q_15_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> var_1071_cast_fp16 = mul(x = mh_k_9_cast_fp16, y = cos_1_cast_fp16)[name = string("op_1071_cast_fp16")];
+            tensor<int32, [4]> var_1076_begin_0 = const()[name = string("op_1076_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_1076_end_0 = const()[name = string("op_1076_end_0"), val = tensor<int32, [4]>([1, 8, 64, 1])];
+            tensor<bool, [4]> var_1076_end_mask_0 = const()[name = string("op_1076_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 8, 64, 1]> var_1076_cast_fp16 = slice_by_index(begin = var_1076_begin_0, end = var_1076_end_0, end_mask = var_1076_end_mask_0, x = mh_k_9_cast_fp16)[name = string("op_1076_cast_fp16")];
+            tensor<int32, [4]> var_1082_begin_0 = const()[name = string("op_1082_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_1082_end_0 = const()[name = string("op_1082_end_0"), val = tensor<int32, [4]>([1, 8, 128, 1])];
+            tensor<bool, [4]> var_1082_end_mask_0 = const()[name = string("op_1082_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 8, 64, 1]> var_1082_cast_fp16 = slice_by_index(begin = var_1082_begin_0, end = var_1082_end_0, end_mask = var_1082_end_mask_0, x = mh_k_9_cast_fp16)[name = string("op_1082_cast_fp16")];
+            fp16 const_66_promoted_to_fp16 = const()[name = string("const_66_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 64, 1]> var_1084_cast_fp16 = mul(x = var_1082_cast_fp16, y = const_66_promoted_to_fp16)[name = string("op_1084_cast_fp16")];
+            bool var_1086_interleave_0 = const()[name = string("op_1086_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 128, 1]> var_1086_cast_fp16 = concat(axis = var_946, interleave = var_1086_interleave_0, values = (var_1084_cast_fp16, var_1076_cast_fp16))[name = string("op_1086_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> var_1087_cast_fp16 = mul(x = var_1086_cast_fp16, y = sin_1_cast_fp16)[name = string("op_1087_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> mh_k_11_cast_fp16 = add(x = var_1071_cast_fp16, y = var_1087_cast_fp16)[name = string("mh_k_11_cast_fp16")];
+            tensor<int32, [4]> var_1091 = const()[name = string("op_1091"), val = tensor<int32, [4]>([1, 1024, 1, 1])];
+            tensor<fp16, [1, 1024, 1, 1]> current_key_11_cast_fp16 = reshape(shape = var_1091, x = mh_k_11_cast_fp16)[name = string("current_key_11_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_1098_cast_fp16 = mul(x = var_101_cast_fp16_2, y = var_323_cast_fp16)[name = string("op_1098_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_1099_cast_fp16 = mul(x = current_key_11_cast_fp16, y = var_321_cast_fp16)[name = string("op_1099_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> key_15_cast_fp16 = add(x = var_1098_cast_fp16, y = var_1099_cast_fp16)[name = string("key_15_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_1102_cast_fp16 = mul(x = var_132_cast_fp16_2, y = var_323_cast_fp16)[name = string("op_1102_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_1103_cast_fp16 = mul(x = current_value_5_cast_fp16, y = var_321_cast_fp16)[name = string("op_1103_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> value_9_cast_fp16 = add(x = var_1102_cast_fp16, y = var_1103_cast_fp16)[name = string("value_9_cast_fp16")];
+            tensor<int32, [4]> var_1107 = const()[name = string("op_1107"), val = tensor<int32, [4]>([1, 8, 128, 256])];
+            tensor<fp16, [1, 8, 128, 256]> key_heads_9_cast_fp16 = reshape(shape = var_1107, x = key_15_cast_fp16)[name = string("key_heads_9_cast_fp16")];
+            tensor<int32, [4]> var_1109 = const()[name = string("op_1109"), val = tensor<int32, [4]>([1, 8, 128, 256])];
+            tensor<fp16, [1, 8, 128, 256]> value_heads_9_cast_fp16 = reshape(shape = var_1109, x = value_9_cast_fp16)[name = string("value_heads_9_cast_fp16")];
+            tensor<int32, [4]> var_1112_begin_0 = const()[name = string("op_1112_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_1112_end_0 = const()[name = string("op_1112_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_1112_end_mask_0 = const()[name = string("op_1112_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_1112_cast_fp16 = slice_by_index(begin = var_1112_begin_0, end = var_1112_end_0, end_mask = var_1112_end_mask_0, x = key_heads_9_cast_fp16)[name = string("op_1112_cast_fp16")];
+            tensor<int32, [4]> var_1116_begin_0 = const()[name = string("op_1116_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_1116_end_0 = const()[name = string("op_1116_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_1116_end_mask_0 = const()[name = string("op_1116_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_1116_cast_fp16 = slice_by_index(begin = var_1116_begin_0, end = var_1116_end_0, end_mask = var_1116_end_mask_0, x = value_heads_9_cast_fp16)[name = string("op_1116_cast_fp16")];
+            tensor<int32, [4]> var_1128_begin_0 = const()[name = string("op_1128_begin_0"), val = tensor<int32, [4]>([0, 1, 0, 0])];
+            tensor<int32, [4]> var_1128_end_0 = const()[name = string("op_1128_end_0"), val = tensor<int32, [4]>([1, 2, 128, 256])];
+            tensor<bool, [4]> var_1128_end_mask_0 = const()[name = string("op_1128_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_1128_cast_fp16 = slice_by_index(begin = var_1128_begin_0, end = var_1128_end_0, end_mask = var_1128_end_mask_0, x = key_heads_9_cast_fp16)[name = string("op_1128_cast_fp16")];
+            tensor<int32, [4]> var_1132_begin_0 = const()[name = string("op_1132_begin_0"), val = tensor<int32, [4]>([0, 1, 0, 0])];
+            tensor<int32, [4]> var_1132_end_0 = const()[name = string("op_1132_end_0"), val = tensor<int32, [4]>([1, 2, 128, 256])];
+            tensor<bool, [4]> var_1132_end_mask_0 = const()[name = string("op_1132_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_1132_cast_fp16 = slice_by_index(begin = var_1132_begin_0, end = var_1132_end_0, end_mask = var_1132_end_mask_0, x = value_heads_9_cast_fp16)[name = string("op_1132_cast_fp16")];
+            tensor<int32, [4]> var_1144_begin_0 = const()[name = string("op_1144_begin_0"), val = tensor<int32, [4]>([0, 2, 0, 0])];
+            tensor<int32, [4]> var_1144_end_0 = const()[name = string("op_1144_end_0"), val = tensor<int32, [4]>([1, 3, 128, 256])];
+            tensor<bool, [4]> var_1144_end_mask_0 = const()[name = string("op_1144_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_1144_cast_fp16 = slice_by_index(begin = var_1144_begin_0, end = var_1144_end_0, end_mask = var_1144_end_mask_0, x = key_heads_9_cast_fp16)[name = string("op_1144_cast_fp16")];
+            tensor<int32, [4]> var_1148_begin_0 = const()[name = string("op_1148_begin_0"), val = tensor<int32, [4]>([0, 2, 0, 0])];
+            tensor<int32, [4]> var_1148_end_0 = const()[name = string("op_1148_end_0"), val = tensor<int32, [4]>([1, 3, 128, 256])];
+            tensor<bool, [4]> var_1148_end_mask_0 = const()[name = string("op_1148_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_1148_cast_fp16 = slice_by_index(begin = var_1148_begin_0, end = var_1148_end_0, end_mask = var_1148_end_mask_0, x = value_heads_9_cast_fp16)[name = string("op_1148_cast_fp16")];
+            tensor<int32, [4]> var_1160_begin_0 = const()[name = string("op_1160_begin_0"), val = tensor<int32, [4]>([0, 3, 0, 0])];
+            tensor<int32, [4]> var_1160_end_0 = const()[name = string("op_1160_end_0"), val = tensor<int32, [4]>([1, 4, 128, 256])];
+            tensor<bool, [4]> var_1160_end_mask_0 = const()[name = string("op_1160_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_1160_cast_fp16 = slice_by_index(begin = var_1160_begin_0, end = var_1160_end_0, end_mask = var_1160_end_mask_0, x = key_heads_9_cast_fp16)[name = string("op_1160_cast_fp16")];
+            tensor<int32, [4]> var_1164_begin_0 = const()[name = string("op_1164_begin_0"), val = tensor<int32, [4]>([0, 3, 0, 0])];
+            tensor<int32, [4]> var_1164_end_0 = const()[name = string("op_1164_end_0"), val = tensor<int32, [4]>([1, 4, 128, 256])];
+            tensor<bool, [4]> var_1164_end_mask_0 = const()[name = string("op_1164_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_1164_cast_fp16 = slice_by_index(begin = var_1164_begin_0, end = var_1164_end_0, end_mask = var_1164_end_mask_0, x = value_heads_9_cast_fp16)[name = string("op_1164_cast_fp16")];
+            tensor<int32, [4]> var_1176_begin_0 = const()[name = string("op_1176_begin_0"), val = tensor<int32, [4]>([0, 4, 0, 0])];
+            tensor<int32, [4]> var_1176_end_0 = const()[name = string("op_1176_end_0"), val = tensor<int32, [4]>([1, 5, 128, 256])];
+            tensor<bool, [4]> var_1176_end_mask_0 = const()[name = string("op_1176_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_1176_cast_fp16 = slice_by_index(begin = var_1176_begin_0, end = var_1176_end_0, end_mask = var_1176_end_mask_0, x = key_heads_9_cast_fp16)[name = string("op_1176_cast_fp16")];
+            tensor<int32, [4]> var_1180_begin_0 = const()[name = string("op_1180_begin_0"), val = tensor<int32, [4]>([0, 4, 0, 0])];
+            tensor<int32, [4]> var_1180_end_0 = const()[name = string("op_1180_end_0"), val = tensor<int32, [4]>([1, 5, 128, 256])];
+            tensor<bool, [4]> var_1180_end_mask_0 = const()[name = string("op_1180_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_1180_cast_fp16 = slice_by_index(begin = var_1180_begin_0, end = var_1180_end_0, end_mask = var_1180_end_mask_0, x = value_heads_9_cast_fp16)[name = string("op_1180_cast_fp16")];
+            tensor<int32, [4]> var_1192_begin_0 = const()[name = string("op_1192_begin_0"), val = tensor<int32, [4]>([0, 5, 0, 0])];
+            tensor<int32, [4]> var_1192_end_0 = const()[name = string("op_1192_end_0"), val = tensor<int32, [4]>([1, 6, 128, 256])];
+            tensor<bool, [4]> var_1192_end_mask_0 = const()[name = string("op_1192_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_1192_cast_fp16 = slice_by_index(begin = var_1192_begin_0, end = var_1192_end_0, end_mask = var_1192_end_mask_0, x = key_heads_9_cast_fp16)[name = string("op_1192_cast_fp16")];
+            tensor<int32, [4]> var_1196_begin_0 = const()[name = string("op_1196_begin_0"), val = tensor<int32, [4]>([0, 5, 0, 0])];
+            tensor<int32, [4]> var_1196_end_0 = const()[name = string("op_1196_end_0"), val = tensor<int32, [4]>([1, 6, 128, 256])];
+            tensor<bool, [4]> var_1196_end_mask_0 = const()[name = string("op_1196_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_1196_cast_fp16 = slice_by_index(begin = var_1196_begin_0, end = var_1196_end_0, end_mask = var_1196_end_mask_0, x = value_heads_9_cast_fp16)[name = string("op_1196_cast_fp16")];
+            tensor<int32, [4]> var_1208_begin_0 = const()[name = string("op_1208_begin_0"), val = tensor<int32, [4]>([0, 6, 0, 0])];
+            tensor<int32, [4]> var_1208_end_0 = const()[name = string("op_1208_end_0"), val = tensor<int32, [4]>([1, 7, 128, 256])];
+            tensor<bool, [4]> var_1208_end_mask_0 = const()[name = string("op_1208_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_1208_cast_fp16 = slice_by_index(begin = var_1208_begin_0, end = var_1208_end_0, end_mask = var_1208_end_mask_0, x = key_heads_9_cast_fp16)[name = string("op_1208_cast_fp16")];
+            tensor<int32, [4]> var_1212_begin_0 = const()[name = string("op_1212_begin_0"), val = tensor<int32, [4]>([0, 6, 0, 0])];
+            tensor<int32, [4]> var_1212_end_0 = const()[name = string("op_1212_end_0"), val = tensor<int32, [4]>([1, 7, 128, 256])];
+            tensor<bool, [4]> var_1212_end_mask_0 = const()[name = string("op_1212_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_1212_cast_fp16 = slice_by_index(begin = var_1212_begin_0, end = var_1212_end_0, end_mask = var_1212_end_mask_0, x = value_heads_9_cast_fp16)[name = string("op_1212_cast_fp16")];
+            tensor<int32, [4]> var_1224_begin_0 = const()[name = string("op_1224_begin_0"), val = tensor<int32, [4]>([0, 7, 0, 0])];
+            tensor<int32, [4]> var_1224_end_0 = const()[name = string("op_1224_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_1224_end_mask_0 = const()[name = string("op_1224_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_1224_cast_fp16 = slice_by_index(begin = var_1224_begin_0, end = var_1224_end_0, end_mask = var_1224_end_mask_0, x = key_heads_9_cast_fp16)[name = string("op_1224_cast_fp16")];
+            tensor<int32, [4]> var_1228_begin_0 = const()[name = string("op_1228_begin_0"), val = tensor<int32, [4]>([0, 7, 0, 0])];
+            tensor<int32, [4]> var_1228_end_0 = const()[name = string("op_1228_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_1228_end_mask_0 = const()[name = string("op_1228_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_1228_cast_fp16 = slice_by_index(begin = var_1228_begin_0, end = var_1228_end_0, end_mask = var_1228_end_mask_0, x = value_heads_9_cast_fp16)[name = string("op_1228_cast_fp16")];
+            bool key_heads_11_interleave_0 = const()[name = string("key_heads_11_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 256]> key_heads_11_cast_fp16 = concat(axis = var_954, interleave = key_heads_11_interleave_0, values = (var_1112_cast_fp16, var_1112_cast_fp16, var_1128_cast_fp16, var_1128_cast_fp16, var_1144_cast_fp16, var_1144_cast_fp16, var_1160_cast_fp16, var_1160_cast_fp16, var_1176_cast_fp16, var_1176_cast_fp16, var_1192_cast_fp16, var_1192_cast_fp16, var_1208_cast_fp16, var_1208_cast_fp16, var_1224_cast_fp16, var_1224_cast_fp16))[name = string("key_heads_11_cast_fp16")];
+            bool value_heads_11_interleave_0 = const()[name = string("value_heads_11_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 256]> value_heads_11_cast_fp16 = concat(axis = var_954, interleave = value_heads_11_interleave_0, values = (var_1116_cast_fp16, var_1116_cast_fp16, var_1132_cast_fp16, var_1132_cast_fp16, var_1148_cast_fp16, var_1148_cast_fp16, var_1164_cast_fp16, var_1164_cast_fp16, var_1180_cast_fp16, var_1180_cast_fp16, var_1196_cast_fp16, var_1196_cast_fp16, var_1212_cast_fp16, var_1212_cast_fp16, var_1228_cast_fp16, var_1228_cast_fp16))[name = string("value_heads_11_cast_fp16")];
+            fp16 var_1251_to_fp16 = const()[name = string("op_1251_to_fp16"), val = fp16(0x1.6ap-4)];
+            tensor<fp16, [1, 16, 128, 1]> var_1252_cast_fp16 = mul(x = mh_q_15_cast_fp16, y = var_1251_to_fp16)[name = string("op_1252_cast_fp16")];
+            bool mh_w_9_transpose_x_0 = const()[name = string("mh_w_9_transpose_x_0"), val = bool(true)];
+            bool mh_w_9_transpose_y_0 = const()[name = string("mh_w_9_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 1, 256]> mh_w_9_cast_fp16 = matmul(transpose_x = mh_w_9_transpose_x_0, transpose_y = mh_w_9_transpose_y_0, x = var_1252_cast_fp16, y = key_heads_11_cast_fp16)[name = string("mh_w_9_cast_fp16")];
+            tensor<fp16, [1, 16, 1, 256]> mh_w_11_cast_fp16 = add(x = mh_w_9_cast_fp16, y = var_487_cast_fp16)[name = string("mh_w_11_cast_fp16")];
+            tensor<fp16, [1, 16, 1, 256]> var_1264_cast_fp16 = softmax(axis = var_936, x = mh_w_11_cast_fp16)[name = string("op_1264_cast_fp16")];
+            bool attn_5_transpose_x_0 = const()[name = string("attn_5_transpose_x_0"), val = bool(false)];
+            bool attn_5_transpose_y_0 = const()[name = string("attn_5_transpose_y_0"), val = bool(true)];
+            tensor<fp16, [1, 16, 128, 1]> attn_5_cast_fp16 = matmul(transpose_x = attn_5_transpose_x_0, transpose_y = attn_5_transpose_y_0, x = value_heads_11_cast_fp16, y = var_1264_cast_fp16)[name = string("attn_5_cast_fp16")];
+            tensor<int32, [4]> var_1269 = const()[name = string("op_1269"), val = tensor<int32, [4]>([1, -1, 1, 1])];
+            tensor<fp16, [1, 2048, 1, 1]> input_17_cast_fp16 = reshape(shape = var_1269, x = attn_5_cast_fp16)[name = string("input_17_cast_fp16")];
+            string obj_27_pad_type_0 = const()[name = string("obj_27_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> obj_27_strides_0 = const()[name = string("obj_27_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> obj_27_pad_0 = const()[name = string("obj_27_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> obj_27_dilations_0 = const()[name = string("obj_27_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 obj_27_groups_0 = const()[name = string("obj_27_groups_0"), val = int32(1)];
+            tensor<fp16, [2048, 2048, 1, 1]> layers_2_self_attn_o_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [2048, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(109159040))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(113353408))))[name = string("layers_2_self_attn_o_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1, 1]> obj_27_cast_fp16 = conv(bias = layers_0_self_attn_q_proj_bias_to_fp16, dilations = obj_27_dilations_0, groups = obj_27_groups_0, pad = obj_27_pad_0, pad_type = obj_27_pad_type_0, strides = obj_27_strides_0, weight = layers_2_self_attn_o_proj_weight_to_fp16_palettized, x = input_17_cast_fp16)[name = string("obj_27_cast_fp16")];
+            tensor<fp16, [1, 2048, 1, 1]> inputs_21_cast_fp16 = add(x = inputs_15_cast_fp16, y = obj_27_cast_fp16)[name = string("inputs_21_cast_fp16")];
+            tensor<fp16, [1, 2048, 1, 1]> inputs_sq_23_cast_fp16 = mul(x = inputs_21_cast_fp16, y = inputs_21_cast_fp16)[name = string("inputs_sq_23_cast_fp16")];
+            tensor<int32, [1]> variance_23_axes_0 = const()[name = string("variance_23_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_23_keep_dims_0 = const()[name = string("variance_23_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 1, 1, 1]> variance_23_cast_fp16 = reduce_mean(axes = variance_23_axes_0, keep_dims = variance_23_keep_dims_0, x = inputs_sq_23_cast_fp16)[name = string("variance_23_cast_fp16")];
+            fp16 var_1287_to_fp16 = const()[name = string("op_1287_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 1, 1]> var_1288_cast_fp16 = add(x = variance_23_cast_fp16, y = var_1287_to_fp16)[name = string("op_1288_cast_fp16")];
+            fp32 var_1289_epsilon_0 = const()[name = string("op_1289_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 1, 1, 1]> var_1289_cast_fp16 = rsqrt(epsilon = var_1289_epsilon_0, x = var_1288_cast_fp16)[name = string("op_1289_cast_fp16")];
+            tensor<fp16, [1, 2048, 1, 1]> hidden_states_27_cast_fp16 = mul(x = inputs_21_cast_fp16, y = var_1289_cast_fp16)[name = string("hidden_states_27_cast_fp16")];
+            tensor<fp16, [1, 2048, 1, 1]> w_23_to_fp16 = const()[name = string("w_23_to_fp16"), val = tensor<fp16, [1, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(113353984)))];
+            tensor<fp16, [1, 2048, 1, 1]> input_19_cast_fp16 = mul(x = w_23_to_fp16, y = hidden_states_27_cast_fp16)[name = string("input_19_cast_fp16")];
+            string input_21_pad_type_0 = const()[name = string("input_21_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> input_21_strides_0 = const()[name = string("input_21_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> input_21_pad_0 = const()[name = string("input_21_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> input_21_dilations_0 = const()[name = string("input_21_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 input_21_groups_0 = const()[name = string("input_21_groups_0"), val = int32(1)];
+            tensor<fp16, [6144, 2048, 1, 1]> layers_2_mlp_gate_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [6144, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(113358144))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(125941120))))[name = string("layers_2_mlp_gate_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 6144, 1, 1]> input_21_cast_fp16 = conv(dilations = input_21_dilations_0, groups = input_21_groups_0, pad = input_21_pad_0, pad_type = input_21_pad_type_0, strides = input_21_strides_0, weight = layers_2_mlp_gate_proj_weight_to_fp16_palettized, x = input_19_cast_fp16)[name = string("input_21_cast_fp16")];
+            tensor<fp16, [1, 6144, 1, 1]> var_1303_cast_fp16 = silu(x = input_21_cast_fp16)[name = string("op_1303_cast_fp16")];
+            string var_1309_pad_type_0 = const()[name = string("op_1309_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_1309_strides_0 = const()[name = string("op_1309_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_1309_pad_0 = const()[name = string("op_1309_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_1309_dilations_0 = const()[name = string("op_1309_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_1309_groups_0 = const()[name = string("op_1309_groups_0"), val = int32(1)];
+            tensor<fp16, [6144, 2048, 1, 1]> layers_2_mlp_up_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [6144, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(125941696))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(138524672))))[name = string("layers_2_mlp_up_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 6144, 1, 1]> var_1309_cast_fp16 = conv(dilations = var_1309_dilations_0, groups = var_1309_groups_0, pad = var_1309_pad_0, pad_type = var_1309_pad_type_0, strides = var_1309_strides_0, weight = layers_2_mlp_up_proj_weight_to_fp16_palettized, x = input_19_cast_fp16)[name = string("op_1309_cast_fp16")];
+            tensor<fp16, [1, 6144, 1, 1]> input_23_cast_fp16 = mul(x = var_1303_cast_fp16, y = var_1309_cast_fp16)[name = string("input_23_cast_fp16")];
+            string hidden_states_29_pad_type_0 = const()[name = string("hidden_states_29_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> hidden_states_29_strides_0 = const()[name = string("hidden_states_29_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> hidden_states_29_pad_0 = const()[name = string("hidden_states_29_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> hidden_states_29_dilations_0 = const()[name = string("hidden_states_29_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 hidden_states_29_groups_0 = const()[name = string("hidden_states_29_groups_0"), val = int32(1)];
+            tensor<fp16, [2048, 6144, 1, 1]> layers_2_mlp_down_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [2048, 6144, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(138525248))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(151108224))))[name = string("layers_2_mlp_down_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1, 1]> hidden_states_29_cast_fp16 = conv(dilations = hidden_states_29_dilations_0, groups = hidden_states_29_groups_0, pad = hidden_states_29_pad_0, pad_type = hidden_states_29_pad_type_0, strides = hidden_states_29_strides_0, weight = layers_2_mlp_down_proj_weight_to_fp16_palettized, x = input_23_cast_fp16)[name = string("hidden_states_29_cast_fp16")];
+            tensor<fp16, [1, 2048, 1, 1]> inputs_23_cast_fp16 = add(x = inputs_21_cast_fp16, y = hidden_states_29_cast_fp16)[name = string("inputs_23_cast_fp16")];
+            int32 var_1323 = const()[name = string("op_1323"), val = int32(3)];
+            int32 var_1333 = const()[name = string("op_1333"), val = int32(-2)];
+            int32 var_1341 = const()[name = string("op_1341"), val = int32(1)];
+            tensor<fp16, [1, 2048, 1, 1]> inputs_sq_25_cast_fp16 = mul(x = inputs_23_cast_fp16, y = inputs_23_cast_fp16)[name = string("inputs_sq_25_cast_fp16")];
+            tensor<int32, [1]> variance_25_axes_0 = const()[name = string("variance_25_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_25_keep_dims_0 = const()[name = string("variance_25_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 1, 1, 1]> variance_25_cast_fp16 = reduce_mean(axes = variance_25_axes_0, keep_dims = variance_25_keep_dims_0, x = inputs_sq_25_cast_fp16)[name = string("variance_25_cast_fp16")];
+            fp16 var_1353_to_fp16 = const()[name = string("op_1353_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 1, 1]> var_1354_cast_fp16 = add(x = variance_25_cast_fp16, y = var_1353_to_fp16)[name = string("op_1354_cast_fp16")];
+            fp32 var_1355_epsilon_0 = const()[name = string("op_1355_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 1, 1, 1]> var_1355_cast_fp16 = rsqrt(epsilon = var_1355_epsilon_0, x = var_1354_cast_fp16)[name = string("op_1355_cast_fp16")];
+            tensor<fp16, [1, 2048, 1, 1]> hidden_states_31_cast_fp16 = mul(x = inputs_23_cast_fp16, y = var_1355_cast_fp16)[name = string("hidden_states_31_cast_fp16")];
+            tensor<fp16, [1, 2048, 1, 1]> w_25_to_fp16 = const()[name = string("w_25_to_fp16"), val = tensor<fp16, [1, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(151108800)))];
+            tensor<fp16, [1, 2048, 1, 1]> obj_29_cast_fp16 = mul(x = w_25_to_fp16, y = hidden_states_31_cast_fp16)[name = string("obj_29_cast_fp16")];
+            string query_19_pad_type_0 = const()[name = string("query_19_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> query_19_strides_0 = const()[name = string("query_19_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> query_19_pad_0 = const()[name = string("query_19_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> query_19_dilations_0 = const()[name = string("query_19_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 query_19_groups_0 = const()[name = string("query_19_groups_0"), val = int32(1)];
+            tensor<fp16, [2048, 2048, 1, 1]> layers_3_self_attn_q_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [2048, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(151112960))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(155307328))))[name = string("layers_3_self_attn_q_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1, 1]> query_19_cast_fp16 = conv(bias = layers_0_self_attn_q_proj_bias_to_fp16, dilations = query_19_dilations_0, groups = query_19_groups_0, pad = query_19_pad_0, pad_type = query_19_pad_type_0, strides = query_19_strides_0, weight = layers_3_self_attn_q_proj_weight_to_fp16_palettized, x = obj_29_cast_fp16)[name = string("query_19_cast_fp16")];
+            string current_key_13_pad_type_0 = const()[name = string("current_key_13_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> current_key_13_strides_0 = const()[name = string("current_key_13_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> current_key_13_pad_0 = const()[name = string("current_key_13_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> current_key_13_dilations_0 = const()[name = string("current_key_13_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 current_key_13_groups_0 = const()[name = string("current_key_13_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 2048, 1, 1]> layers_3_self_attn_k_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(155307904))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(157405120))))[name = string("layers_3_self_attn_k_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> current_key_13_cast_fp16 = conv(dilations = current_key_13_dilations_0, groups = current_key_13_groups_0, pad = current_key_13_pad_0, pad_type = current_key_13_pad_type_0, strides = current_key_13_strides_0, weight = layers_3_self_attn_k_proj_weight_to_fp16_palettized, x = obj_29_cast_fp16)[name = string("current_key_13_cast_fp16")];
+            string current_value_7_pad_type_0 = const()[name = string("current_value_7_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> current_value_7_strides_0 = const()[name = string("current_value_7_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> current_value_7_pad_0 = const()[name = string("current_value_7_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> current_value_7_dilations_0 = const()[name = string("current_value_7_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 current_value_7_groups_0 = const()[name = string("current_value_7_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 2048, 1, 1]> layers_3_self_attn_v_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(157405696))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(159502912))))[name = string("layers_3_self_attn_v_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> current_value_7_cast_fp16 = conv(bias = layers_0_self_attn_v_proj_bias_to_fp16, dilations = current_value_7_dilations_0, groups = current_value_7_groups_0, pad = current_value_7_pad_0, pad_type = current_value_7_pad_type_0, strides = current_value_7_strides_0, weight = layers_3_self_attn_v_proj_weight_to_fp16_palettized, x = obj_29_cast_fp16)[name = string("current_value_7_cast_fp16")];
+            tensor<int32, [4]> var_1392 = const()[name = string("op_1392"), val = tensor<int32, [4]>([16, 128, 1, 1])];
+            tensor<fp16, [16, 128, 1, 1]> inputs_25_cast_fp16 = reshape(shape = var_1392, x = query_19_cast_fp16)[name = string("inputs_25_cast_fp16")];
+            tensor<fp16, [16, 128, 1, 1]> inputs_sq_27_cast_fp16 = mul(x = inputs_25_cast_fp16, y = inputs_25_cast_fp16)[name = string("inputs_sq_27_cast_fp16")];
+            tensor<int32, [1]> variance_27_axes_0 = const()[name = string("variance_27_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_27_keep_dims_0 = const()[name = string("variance_27_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [16, 1, 1, 1]> variance_27_cast_fp16 = reduce_mean(axes = variance_27_axes_0, keep_dims = variance_27_keep_dims_0, x = inputs_sq_27_cast_fp16)[name = string("variance_27_cast_fp16")];
+            fp16 var_1398_to_fp16 = const()[name = string("op_1398_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [16, 1, 1, 1]> var_1399_cast_fp16 = add(x = variance_27_cast_fp16, y = var_1398_to_fp16)[name = string("op_1399_cast_fp16")];
+            fp32 var_1400_epsilon_0 = const()[name = string("op_1400_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [16, 1, 1, 1]> var_1400_cast_fp16 = rsqrt(epsilon = var_1400_epsilon_0, x = var_1399_cast_fp16)[name = string("op_1400_cast_fp16")];
+            tensor<fp16, [16, 128, 1, 1]> hidden_states_33_cast_fp16 = mul(x = inputs_25_cast_fp16, y = var_1400_cast_fp16)[name = string("hidden_states_33_cast_fp16")];
+            tensor<fp16, [1, 128, 1, 1]> w_27_to_fp16 = const()[name = string("w_27_to_fp16"), val = tensor<fp16, [1, 128, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(159503488)))];
+            tensor<fp16, [16, 128, 1, 1]> query_normed_7_cast_fp16 = mul(x = w_27_to_fp16, y = hidden_states_33_cast_fp16)[name = string("query_normed_7_cast_fp16")];
+            tensor<int32, [4]> var_1408 = const()[name = string("op_1408"), val = tensor<int32, [4]>([8, 128, 1, 1])];
+            tensor<fp16, [8, 128, 1, 1]> inputs_27_cast_fp16 = reshape(shape = var_1408, x = current_key_13_cast_fp16)[name = string("inputs_27_cast_fp16")];
+            tensor<fp16, [8, 128, 1, 1]> inputs_sq_29_cast_fp16 = mul(x = inputs_27_cast_fp16, y = inputs_27_cast_fp16)[name = string("inputs_sq_29_cast_fp16")];
+            tensor<int32, [1]> variance_29_axes_0 = const()[name = string("variance_29_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_29_keep_dims_0 = const()[name = string("variance_29_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [8, 1, 1, 1]> variance_29_cast_fp16 = reduce_mean(axes = variance_29_axes_0, keep_dims = variance_29_keep_dims_0, x = inputs_sq_29_cast_fp16)[name = string("variance_29_cast_fp16")];
+            fp16 var_1414_to_fp16 = const()[name = string("op_1414_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [8, 1, 1, 1]> var_1415_cast_fp16 = add(x = variance_29_cast_fp16, y = var_1414_to_fp16)[name = string("op_1415_cast_fp16")];
+            fp32 var_1416_epsilon_0 = const()[name = string("op_1416_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [8, 1, 1, 1]> var_1416_cast_fp16 = rsqrt(epsilon = var_1416_epsilon_0, x = var_1415_cast_fp16)[name = string("op_1416_cast_fp16")];
+            tensor<fp16, [8, 128, 1, 1]> hidden_states_35_cast_fp16 = mul(x = inputs_27_cast_fp16, y = var_1416_cast_fp16)[name = string("hidden_states_35_cast_fp16")];
+            tensor<fp16, [1, 128, 1, 1]> w_29_to_fp16 = const()[name = string("w_29_to_fp16"), val = tensor<fp16, [1, 128, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(159503808)))];
+            tensor<fp16, [8, 128, 1, 1]> current_key_normed_7_cast_fp16 = mul(x = w_29_to_fp16, y = hidden_states_35_cast_fp16)[name = string("current_key_normed_7_cast_fp16")];
+            tensor<int32, [4]> var_1434 = const()[name = string("op_1434"), val = tensor<int32, [4]>([1, 16, 128, -1])];
+            tensor<fp16, [1, 16, 128, 1]> mh_q_19_cast_fp16 = reshape(shape = var_1434, x = query_normed_7_cast_fp16)[name = string("mh_q_19_cast_fp16")];
+            tensor<int32, [4]> var_1436 = const()[name = string("op_1436"), val = tensor<int32, [4]>([1, 8, 128, -1])];
+            tensor<fp16, [1, 8, 128, 1]> mh_k_13_cast_fp16 = reshape(shape = var_1436, x = current_key_normed_7_cast_fp16)[name = string("mh_k_13_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> var_1440_cast_fp16 = mul(x = mh_q_19_cast_fp16, y = cos_1_cast_fp16)[name = string("op_1440_cast_fp16")];
+            tensor<int32, [4]> var_1445_begin_0 = const()[name = string("op_1445_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_1445_end_0 = const()[name = string("op_1445_end_0"), val = tensor<int32, [4]>([1, 16, 64, 1])];
+            tensor<bool, [4]> var_1445_end_mask_0 = const()[name = string("op_1445_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 16, 64, 1]> var_1445_cast_fp16 = slice_by_index(begin = var_1445_begin_0, end = var_1445_end_0, end_mask = var_1445_end_mask_0, x = mh_q_19_cast_fp16)[name = string("op_1445_cast_fp16")];
+            tensor<int32, [4]> var_1451_begin_0 = const()[name = string("op_1451_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_1451_end_0 = const()[name = string("op_1451_end_0"), val = tensor<int32, [4]>([1, 16, 128, 1])];
+            tensor<bool, [4]> var_1451_end_mask_0 = const()[name = string("op_1451_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 16, 64, 1]> var_1451_cast_fp16 = slice_by_index(begin = var_1451_begin_0, end = var_1451_end_0, end_mask = var_1451_end_mask_0, x = mh_q_19_cast_fp16)[name = string("op_1451_cast_fp16")];
+            fp16 const_86_promoted_to_fp16 = const()[name = string("const_86_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 16, 64, 1]> var_1453_cast_fp16 = mul(x = var_1451_cast_fp16, y = const_86_promoted_to_fp16)[name = string("op_1453_cast_fp16")];
+            bool var_1455_interleave_0 = const()[name = string("op_1455_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 1]> var_1455_cast_fp16 = concat(axis = var_1333, interleave = var_1455_interleave_0, values = (var_1453_cast_fp16, var_1445_cast_fp16))[name = string("op_1455_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> var_1456_cast_fp16 = mul(x = var_1455_cast_fp16, y = sin_1_cast_fp16)[name = string("op_1456_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> mh_q_21_cast_fp16 = add(x = var_1440_cast_fp16, y = var_1456_cast_fp16)[name = string("mh_q_21_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> var_1458_cast_fp16 = mul(x = mh_k_13_cast_fp16, y = cos_1_cast_fp16)[name = string("op_1458_cast_fp16")];
+            tensor<int32, [4]> var_1463_begin_0 = const()[name = string("op_1463_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_1463_end_0 = const()[name = string("op_1463_end_0"), val = tensor<int32, [4]>([1, 8, 64, 1])];
+            tensor<bool, [4]> var_1463_end_mask_0 = const()[name = string("op_1463_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 8, 64, 1]> var_1463_cast_fp16 = slice_by_index(begin = var_1463_begin_0, end = var_1463_end_0, end_mask = var_1463_end_mask_0, x = mh_k_13_cast_fp16)[name = string("op_1463_cast_fp16")];
+            tensor<int32, [4]> var_1469_begin_0 = const()[name = string("op_1469_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_1469_end_0 = const()[name = string("op_1469_end_0"), val = tensor<int32, [4]>([1, 8, 128, 1])];
+            tensor<bool, [4]> var_1469_end_mask_0 = const()[name = string("op_1469_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 8, 64, 1]> var_1469_cast_fp16 = slice_by_index(begin = var_1469_begin_0, end = var_1469_end_0, end_mask = var_1469_end_mask_0, x = mh_k_13_cast_fp16)[name = string("op_1469_cast_fp16")];
+            fp16 const_89_promoted_to_fp16 = const()[name = string("const_89_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 64, 1]> var_1471_cast_fp16 = mul(x = var_1469_cast_fp16, y = const_89_promoted_to_fp16)[name = string("op_1471_cast_fp16")];
+            bool var_1473_interleave_0 = const()[name = string("op_1473_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 128, 1]> var_1473_cast_fp16 = concat(axis = var_1333, interleave = var_1473_interleave_0, values = (var_1471_cast_fp16, var_1463_cast_fp16))[name = string("op_1473_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> var_1474_cast_fp16 = mul(x = var_1473_cast_fp16, y = sin_1_cast_fp16)[name = string("op_1474_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> mh_k_15_cast_fp16 = add(x = var_1458_cast_fp16, y = var_1474_cast_fp16)[name = string("mh_k_15_cast_fp16")];
+            tensor<int32, [4]> var_1478 = const()[name = string("op_1478"), val = tensor<int32, [4]>([1, 1024, 1, 1])];
+            tensor<fp16, [1, 1024, 1, 1]> current_key_15_cast_fp16 = reshape(shape = var_1478, x = mh_k_15_cast_fp16)[name = string("current_key_15_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_1485_cast_fp16 = mul(x = var_101_cast_fp16_3, y = var_323_cast_fp16)[name = string("op_1485_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_1486_cast_fp16 = mul(x = current_key_15_cast_fp16, y = var_321_cast_fp16)[name = string("op_1486_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> key_21_cast_fp16 = add(x = var_1485_cast_fp16, y = var_1486_cast_fp16)[name = string("key_21_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_1489_cast_fp16 = mul(x = var_132_cast_fp16_3, y = var_323_cast_fp16)[name = string("op_1489_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_1490_cast_fp16 = mul(x = current_value_7_cast_fp16, y = var_321_cast_fp16)[name = string("op_1490_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> value_13_cast_fp16 = add(x = var_1489_cast_fp16, y = var_1490_cast_fp16)[name = string("value_13_cast_fp16")];
+            tensor<int32, [4]> var_1494 = const()[name = string("op_1494"), val = tensor<int32, [4]>([1, 8, 128, 256])];
+            tensor<fp16, [1, 8, 128, 256]> key_heads_13_cast_fp16 = reshape(shape = var_1494, x = key_21_cast_fp16)[name = string("key_heads_13_cast_fp16")];
+            tensor<int32, [4]> var_1496 = const()[name = string("op_1496"), val = tensor<int32, [4]>([1, 8, 128, 256])];
+            tensor<fp16, [1, 8, 128, 256]> value_heads_13_cast_fp16 = reshape(shape = var_1496, x = value_13_cast_fp16)[name = string("value_heads_13_cast_fp16")];
+            tensor<int32, [4]> var_1499_begin_0 = const()[name = string("op_1499_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_1499_end_0 = const()[name = string("op_1499_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_1499_end_mask_0 = const()[name = string("op_1499_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_1499_cast_fp16 = slice_by_index(begin = var_1499_begin_0, end = var_1499_end_0, end_mask = var_1499_end_mask_0, x = key_heads_13_cast_fp16)[name = string("op_1499_cast_fp16")];
+            tensor<int32, [4]> var_1503_begin_0 = const()[name = string("op_1503_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_1503_end_0 = const()[name = string("op_1503_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_1503_end_mask_0 = const()[name = string("op_1503_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_1503_cast_fp16 = slice_by_index(begin = var_1503_begin_0, end = var_1503_end_0, end_mask = var_1503_end_mask_0, x = value_heads_13_cast_fp16)[name = string("op_1503_cast_fp16")];
+            tensor<int32, [4]> var_1515_begin_0 = const()[name = string("op_1515_begin_0"), val = tensor<int32, [4]>([0, 1, 0, 0])];
+            tensor<int32, [4]> var_1515_end_0 = const()[name = string("op_1515_end_0"), val = tensor<int32, [4]>([1, 2, 128, 256])];
+            tensor<bool, [4]> var_1515_end_mask_0 = const()[name = string("op_1515_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_1515_cast_fp16 = slice_by_index(begin = var_1515_begin_0, end = var_1515_end_0, end_mask = var_1515_end_mask_0, x = key_heads_13_cast_fp16)[name = string("op_1515_cast_fp16")];
+            tensor<int32, [4]> var_1519_begin_0 = const()[name = string("op_1519_begin_0"), val = tensor<int32, [4]>([0, 1, 0, 0])];
+            tensor<int32, [4]> var_1519_end_0 = const()[name = string("op_1519_end_0"), val = tensor<int32, [4]>([1, 2, 128, 256])];
+            tensor<bool, [4]> var_1519_end_mask_0 = const()[name = string("op_1519_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_1519_cast_fp16 = slice_by_index(begin = var_1519_begin_0, end = var_1519_end_0, end_mask = var_1519_end_mask_0, x = value_heads_13_cast_fp16)[name = string("op_1519_cast_fp16")];
+            tensor<int32, [4]> var_1531_begin_0 = const()[name = string("op_1531_begin_0"), val = tensor<int32, [4]>([0, 2, 0, 0])];
+            tensor<int32, [4]> var_1531_end_0 = const()[name = string("op_1531_end_0"), val = tensor<int32, [4]>([1, 3, 128, 256])];
+            tensor<bool, [4]> var_1531_end_mask_0 = const()[name = string("op_1531_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_1531_cast_fp16 = slice_by_index(begin = var_1531_begin_0, end = var_1531_end_0, end_mask = var_1531_end_mask_0, x = key_heads_13_cast_fp16)[name = string("op_1531_cast_fp16")];
+            tensor<int32, [4]> var_1535_begin_0 = const()[name = string("op_1535_begin_0"), val = tensor<int32, [4]>([0, 2, 0, 0])];
+            tensor<int32, [4]> var_1535_end_0 = const()[name = string("op_1535_end_0"), val = tensor<int32, [4]>([1, 3, 128, 256])];
+            tensor<bool, [4]> var_1535_end_mask_0 = const()[name = string("op_1535_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_1535_cast_fp16 = slice_by_index(begin = var_1535_begin_0, end = var_1535_end_0, end_mask = var_1535_end_mask_0, x = value_heads_13_cast_fp16)[name = string("op_1535_cast_fp16")];
+            tensor<int32, [4]> var_1547_begin_0 = const()[name = string("op_1547_begin_0"), val = tensor<int32, [4]>([0, 3, 0, 0])];
+            tensor<int32, [4]> var_1547_end_0 = const()[name = string("op_1547_end_0"), val = tensor<int32, [4]>([1, 4, 128, 256])];
+            tensor<bool, [4]> var_1547_end_mask_0 = const()[name = string("op_1547_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_1547_cast_fp16 = slice_by_index(begin = var_1547_begin_0, end = var_1547_end_0, end_mask = var_1547_end_mask_0, x = key_heads_13_cast_fp16)[name = string("op_1547_cast_fp16")];
+            tensor<int32, [4]> var_1551_begin_0 = const()[name = string("op_1551_begin_0"), val = tensor<int32, [4]>([0, 3, 0, 0])];
+            tensor<int32, [4]> var_1551_end_0 = const()[name = string("op_1551_end_0"), val = tensor<int32, [4]>([1, 4, 128, 256])];
+            tensor<bool, [4]> var_1551_end_mask_0 = const()[name = string("op_1551_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_1551_cast_fp16 = slice_by_index(begin = var_1551_begin_0, end = var_1551_end_0, end_mask = var_1551_end_mask_0, x = value_heads_13_cast_fp16)[name = string("op_1551_cast_fp16")];
+            tensor<int32, [4]> var_1563_begin_0 = const()[name = string("op_1563_begin_0"), val = tensor<int32, [4]>([0, 4, 0, 0])];
+            tensor<int32, [4]> var_1563_end_0 = const()[name = string("op_1563_end_0"), val = tensor<int32, [4]>([1, 5, 128, 256])];
+            tensor<bool, [4]> var_1563_end_mask_0 = const()[name = string("op_1563_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_1563_cast_fp16 = slice_by_index(begin = var_1563_begin_0, end = var_1563_end_0, end_mask = var_1563_end_mask_0, x = key_heads_13_cast_fp16)[name = string("op_1563_cast_fp16")];
+            tensor<int32, [4]> var_1567_begin_0 = const()[name = string("op_1567_begin_0"), val = tensor<int32, [4]>([0, 4, 0, 0])];
+            tensor<int32, [4]> var_1567_end_0 = const()[name = string("op_1567_end_0"), val = tensor<int32, [4]>([1, 5, 128, 256])];
+            tensor<bool, [4]> var_1567_end_mask_0 = const()[name = string("op_1567_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_1567_cast_fp16 = slice_by_index(begin = var_1567_begin_0, end = var_1567_end_0, end_mask = var_1567_end_mask_0, x = value_heads_13_cast_fp16)[name = string("op_1567_cast_fp16")];
+            tensor<int32, [4]> var_1579_begin_0 = const()[name = string("op_1579_begin_0"), val = tensor<int32, [4]>([0, 5, 0, 0])];
+            tensor<int32, [4]> var_1579_end_0 = const()[name = string("op_1579_end_0"), val = tensor<int32, [4]>([1, 6, 128, 256])];
+            tensor<bool, [4]> var_1579_end_mask_0 = const()[name = string("op_1579_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_1579_cast_fp16 = slice_by_index(begin = var_1579_begin_0, end = var_1579_end_0, end_mask = var_1579_end_mask_0, x = key_heads_13_cast_fp16)[name = string("op_1579_cast_fp16")];
+            tensor<int32, [4]> var_1583_begin_0 = const()[name = string("op_1583_begin_0"), val = tensor<int32, [4]>([0, 5, 0, 0])];
+            tensor<int32, [4]> var_1583_end_0 = const()[name = string("op_1583_end_0"), val = tensor<int32, [4]>([1, 6, 128, 256])];
+            tensor<bool, [4]> var_1583_end_mask_0 = const()[name = string("op_1583_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_1583_cast_fp16 = slice_by_index(begin = var_1583_begin_0, end = var_1583_end_0, end_mask = var_1583_end_mask_0, x = value_heads_13_cast_fp16)[name = string("op_1583_cast_fp16")];
+            tensor<int32, [4]> var_1595_begin_0 = const()[name = string("op_1595_begin_0"), val = tensor<int32, [4]>([0, 6, 0, 0])];
+            tensor<int32, [4]> var_1595_end_0 = const()[name = string("op_1595_end_0"), val = tensor<int32, [4]>([1, 7, 128, 256])];
+            tensor<bool, [4]> var_1595_end_mask_0 = const()[name = string("op_1595_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_1595_cast_fp16 = slice_by_index(begin = var_1595_begin_0, end = var_1595_end_0, end_mask = var_1595_end_mask_0, x = key_heads_13_cast_fp16)[name = string("op_1595_cast_fp16")];
+            tensor<int32, [4]> var_1599_begin_0 = const()[name = string("op_1599_begin_0"), val = tensor<int32, [4]>([0, 6, 0, 0])];
+            tensor<int32, [4]> var_1599_end_0 = const()[name = string("op_1599_end_0"), val = tensor<int32, [4]>([1, 7, 128, 256])];
+            tensor<bool, [4]> var_1599_end_mask_0 = const()[name = string("op_1599_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_1599_cast_fp16 = slice_by_index(begin = var_1599_begin_0, end = var_1599_end_0, end_mask = var_1599_end_mask_0, x = value_heads_13_cast_fp16)[name = string("op_1599_cast_fp16")];
+            tensor<int32, [4]> var_1611_begin_0 = const()[name = string("op_1611_begin_0"), val = tensor<int32, [4]>([0, 7, 0, 0])];
+            tensor<int32, [4]> var_1611_end_0 = const()[name = string("op_1611_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_1611_end_mask_0 = const()[name = string("op_1611_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_1611_cast_fp16 = slice_by_index(begin = var_1611_begin_0, end = var_1611_end_0, end_mask = var_1611_end_mask_0, x = key_heads_13_cast_fp16)[name = string("op_1611_cast_fp16")];
+            tensor<int32, [4]> var_1615_begin_0 = const()[name = string("op_1615_begin_0"), val = tensor<int32, [4]>([0, 7, 0, 0])];
+            tensor<int32, [4]> var_1615_end_0 = const()[name = string("op_1615_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_1615_end_mask_0 = const()[name = string("op_1615_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_1615_cast_fp16 = slice_by_index(begin = var_1615_begin_0, end = var_1615_end_0, end_mask = var_1615_end_mask_0, x = value_heads_13_cast_fp16)[name = string("op_1615_cast_fp16")];
+            bool key_heads_15_interleave_0 = const()[name = string("key_heads_15_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 256]> key_heads_15_cast_fp16 = concat(axis = var_1341, interleave = key_heads_15_interleave_0, values = (var_1499_cast_fp16, var_1499_cast_fp16, var_1515_cast_fp16, var_1515_cast_fp16, var_1531_cast_fp16, var_1531_cast_fp16, var_1547_cast_fp16, var_1547_cast_fp16, var_1563_cast_fp16, var_1563_cast_fp16, var_1579_cast_fp16, var_1579_cast_fp16, var_1595_cast_fp16, var_1595_cast_fp16, var_1611_cast_fp16, var_1611_cast_fp16))[name = string("key_heads_15_cast_fp16")];
+            bool value_heads_15_interleave_0 = const()[name = string("value_heads_15_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 256]> value_heads_15_cast_fp16 = concat(axis = var_1341, interleave = value_heads_15_interleave_0, values = (var_1503_cast_fp16, var_1503_cast_fp16, var_1519_cast_fp16, var_1519_cast_fp16, var_1535_cast_fp16, var_1535_cast_fp16, var_1551_cast_fp16, var_1551_cast_fp16, var_1567_cast_fp16, var_1567_cast_fp16, var_1583_cast_fp16, var_1583_cast_fp16, var_1599_cast_fp16, var_1599_cast_fp16, var_1615_cast_fp16, var_1615_cast_fp16))[name = string("value_heads_15_cast_fp16")];
+            fp16 var_1638_to_fp16 = const()[name = string("op_1638_to_fp16"), val = fp16(0x1.6ap-4)];
+            tensor<fp16, [1, 16, 128, 1]> var_1639_cast_fp16 = mul(x = mh_q_21_cast_fp16, y = var_1638_to_fp16)[name = string("op_1639_cast_fp16")];
+            bool mh_w_13_transpose_x_0 = const()[name = string("mh_w_13_transpose_x_0"), val = bool(true)];
+            bool mh_w_13_transpose_y_0 = const()[name = string("mh_w_13_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 1, 256]> mh_w_13_cast_fp16 = matmul(transpose_x = mh_w_13_transpose_x_0, transpose_y = mh_w_13_transpose_y_0, x = var_1639_cast_fp16, y = key_heads_15_cast_fp16)[name = string("mh_w_13_cast_fp16")];
+            tensor<fp16, [1, 16, 1, 256]> mh_w_15_cast_fp16 = add(x = mh_w_13_cast_fp16, y = var_487_cast_fp16)[name = string("mh_w_15_cast_fp16")];
+            tensor<fp16, [1, 16, 1, 256]> var_1651_cast_fp16 = softmax(axis = var_1323, x = mh_w_15_cast_fp16)[name = string("op_1651_cast_fp16")];
+            bool attn_7_transpose_x_0 = const()[name = string("attn_7_transpose_x_0"), val = bool(false)];
+            bool attn_7_transpose_y_0 = const()[name = string("attn_7_transpose_y_0"), val = bool(true)];
+            tensor<fp16, [1, 16, 128, 1]> attn_7_cast_fp16 = matmul(transpose_x = attn_7_transpose_x_0, transpose_y = attn_7_transpose_y_0, x = value_heads_15_cast_fp16, y = var_1651_cast_fp16)[name = string("attn_7_cast_fp16")];
+            tensor<int32, [4]> var_1656 = const()[name = string("op_1656"), val = tensor<int32, [4]>([1, -1, 1, 1])];
+            tensor<fp16, [1, 2048, 1, 1]> input_25_cast_fp16 = reshape(shape = var_1656, x = attn_7_cast_fp16)[name = string("input_25_cast_fp16")];
+            string obj_35_pad_type_0 = const()[name = string("obj_35_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> obj_35_strides_0 = const()[name = string("obj_35_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> obj_35_pad_0 = const()[name = string("obj_35_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> obj_35_dilations_0 = const()[name = string("obj_35_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 obj_35_groups_0 = const()[name = string("obj_35_groups_0"), val = int32(1)];
+            tensor<fp16, [2048, 2048, 1, 1]> layers_3_self_attn_o_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [2048, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(159504128))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(163698496))))[name = string("layers_3_self_attn_o_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1, 1]> obj_35_cast_fp16 = conv(bias = layers_0_self_attn_q_proj_bias_to_fp16, dilations = obj_35_dilations_0, groups = obj_35_groups_0, pad = obj_35_pad_0, pad_type = obj_35_pad_type_0, strides = obj_35_strides_0, weight = layers_3_self_attn_o_proj_weight_to_fp16_palettized, x = input_25_cast_fp16)[name = string("obj_35_cast_fp16")];
+            tensor<fp16, [1, 2048, 1, 1]> inputs_29_cast_fp16 = add(x = inputs_23_cast_fp16, y = obj_35_cast_fp16)[name = string("inputs_29_cast_fp16")];
+            tensor<fp16, [1, 2048, 1, 1]> inputs_sq_31_cast_fp16 = mul(x = inputs_29_cast_fp16, y = inputs_29_cast_fp16)[name = string("inputs_sq_31_cast_fp16")];
+            tensor<int32, [1]> variance_31_axes_0 = const()[name = string("variance_31_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_31_keep_dims_0 = const()[name = string("variance_31_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 1, 1, 1]> variance_31_cast_fp16 = reduce_mean(axes = variance_31_axes_0, keep_dims = variance_31_keep_dims_0, x = inputs_sq_31_cast_fp16)[name = string("variance_31_cast_fp16")];
+            fp16 var_1674_to_fp16 = const()[name = string("op_1674_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 1, 1]> var_1675_cast_fp16 = add(x = variance_31_cast_fp16, y = var_1674_to_fp16)[name = string("op_1675_cast_fp16")];
+            fp32 var_1676_epsilon_0 = const()[name = string("op_1676_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 1, 1, 1]> var_1676_cast_fp16 = rsqrt(epsilon = var_1676_epsilon_0, x = var_1675_cast_fp16)[name = string("op_1676_cast_fp16")];
+            tensor<fp16, [1, 2048, 1, 1]> hidden_states_37_cast_fp16 = mul(x = inputs_29_cast_fp16, y = var_1676_cast_fp16)[name = string("hidden_states_37_cast_fp16")];
+            tensor<fp16, [1, 2048, 1, 1]> w_31_to_fp16 = const()[name = string("w_31_to_fp16"), val = tensor<fp16, [1, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(163699072)))];
+            tensor<fp16, [1, 2048, 1, 1]> input_27_cast_fp16 = mul(x = w_31_to_fp16, y = hidden_states_37_cast_fp16)[name = string("input_27_cast_fp16")];
+            string input_29_pad_type_0 = const()[name = string("input_29_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> input_29_strides_0 = const()[name = string("input_29_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> input_29_pad_0 = const()[name = string("input_29_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> input_29_dilations_0 = const()[name = string("input_29_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 input_29_groups_0 = const()[name = string("input_29_groups_0"), val = int32(1)];
+            tensor<fp16, [6144, 2048, 1, 1]> layers_3_mlp_gate_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [6144, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(163703232))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(176286208))))[name = string("layers_3_mlp_gate_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 6144, 1, 1]> input_29_cast_fp16 = conv(dilations = input_29_dilations_0, groups = input_29_groups_0, pad = input_29_pad_0, pad_type = input_29_pad_type_0, strides = input_29_strides_0, weight = layers_3_mlp_gate_proj_weight_to_fp16_palettized, x = input_27_cast_fp16)[name = string("input_29_cast_fp16")];
+            tensor<fp16, [1, 6144, 1, 1]> var_1690_cast_fp16 = silu(x = input_29_cast_fp16)[name = string("op_1690_cast_fp16")];
+            string var_1696_pad_type_0 = const()[name = string("op_1696_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_1696_strides_0 = const()[name = string("op_1696_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_1696_pad_0 = const()[name = string("op_1696_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_1696_dilations_0 = const()[name = string("op_1696_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_1696_groups_0 = const()[name = string("op_1696_groups_0"), val = int32(1)];
+            tensor<fp16, [6144, 2048, 1, 1]> layers_3_mlp_up_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [6144, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(176286784))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(188869760))))[name = string("layers_3_mlp_up_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 6144, 1, 1]> var_1696_cast_fp16 = conv(dilations = var_1696_dilations_0, groups = var_1696_groups_0, pad = var_1696_pad_0, pad_type = var_1696_pad_type_0, strides = var_1696_strides_0, weight = layers_3_mlp_up_proj_weight_to_fp16_palettized, x = input_27_cast_fp16)[name = string("op_1696_cast_fp16")];
+            tensor<fp16, [1, 6144, 1, 1]> input_31_cast_fp16 = mul(x = var_1690_cast_fp16, y = var_1696_cast_fp16)[name = string("input_31_cast_fp16")];
+            string hidden_states_39_pad_type_0 = const()[name = string("hidden_states_39_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> hidden_states_39_strides_0 = const()[name = string("hidden_states_39_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> hidden_states_39_pad_0 = const()[name = string("hidden_states_39_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> hidden_states_39_dilations_0 = const()[name = string("hidden_states_39_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 hidden_states_39_groups_0 = const()[name = string("hidden_states_39_groups_0"), val = int32(1)];
+            tensor<fp16, [2048, 6144, 1, 1]> layers_3_mlp_down_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [2048, 6144, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(188870336))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(201453312))))[name = string("layers_3_mlp_down_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1, 1]> hidden_states_39_cast_fp16 = conv(dilations = hidden_states_39_dilations_0, groups = hidden_states_39_groups_0, pad = hidden_states_39_pad_0, pad_type = hidden_states_39_pad_type_0, strides = hidden_states_39_strides_0, weight = layers_3_mlp_down_proj_weight_to_fp16_palettized, x = input_31_cast_fp16)[name = string("hidden_states_39_cast_fp16")];
+            tensor<fp16, [1, 2048, 1, 1]> inputs_31_cast_fp16 = add(x = inputs_29_cast_fp16, y = hidden_states_39_cast_fp16)[name = string("inputs_31_cast_fp16")];
+            int32 var_1710 = const()[name = string("op_1710"), val = int32(3)];
+            int32 var_1720 = const()[name = string("op_1720"), val = int32(-2)];
+            int32 var_1728 = const()[name = string("op_1728"), val = int32(1)];
+            tensor<fp16, [1, 2048, 1, 1]> inputs_sq_33_cast_fp16 = mul(x = inputs_31_cast_fp16, y = inputs_31_cast_fp16)[name = string("inputs_sq_33_cast_fp16")];
+            tensor<int32, [1]> variance_33_axes_0 = const()[name = string("variance_33_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_33_keep_dims_0 = const()[name = string("variance_33_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 1, 1, 1]> variance_33_cast_fp16 = reduce_mean(axes = variance_33_axes_0, keep_dims = variance_33_keep_dims_0, x = inputs_sq_33_cast_fp16)[name = string("variance_33_cast_fp16")];
+            fp16 var_1740_to_fp16 = const()[name = string("op_1740_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 1, 1]> var_1741_cast_fp16 = add(x = variance_33_cast_fp16, y = var_1740_to_fp16)[name = string("op_1741_cast_fp16")];
+            fp32 var_1742_epsilon_0 = const()[name = string("op_1742_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 1, 1, 1]> var_1742_cast_fp16 = rsqrt(epsilon = var_1742_epsilon_0, x = var_1741_cast_fp16)[name = string("op_1742_cast_fp16")];
+            tensor<fp16, [1, 2048, 1, 1]> hidden_states_41_cast_fp16 = mul(x = inputs_31_cast_fp16, y = var_1742_cast_fp16)[name = string("hidden_states_41_cast_fp16")];
+            tensor<fp16, [1, 2048, 1, 1]> w_33_to_fp16 = const()[name = string("w_33_to_fp16"), val = tensor<fp16, [1, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(201453888)))];
+            tensor<fp16, [1, 2048, 1, 1]> obj_37_cast_fp16 = mul(x = w_33_to_fp16, y = hidden_states_41_cast_fp16)[name = string("obj_37_cast_fp16")];
+            string query_25_pad_type_0 = const()[name = string("query_25_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> query_25_strides_0 = const()[name = string("query_25_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> query_25_pad_0 = const()[name = string("query_25_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> query_25_dilations_0 = const()[name = string("query_25_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 query_25_groups_0 = const()[name = string("query_25_groups_0"), val = int32(1)];
+            tensor<fp16, [2048, 2048, 1, 1]> layers_4_self_attn_q_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [2048, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(201458048))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(205652416))))[name = string("layers_4_self_attn_q_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1, 1]> query_25_cast_fp16 = conv(bias = layers_0_self_attn_q_proj_bias_to_fp16, dilations = query_25_dilations_0, groups = query_25_groups_0, pad = query_25_pad_0, pad_type = query_25_pad_type_0, strides = query_25_strides_0, weight = layers_4_self_attn_q_proj_weight_to_fp16_palettized, x = obj_37_cast_fp16)[name = string("query_25_cast_fp16")];
+            string current_key_17_pad_type_0 = const()[name = string("current_key_17_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> current_key_17_strides_0 = const()[name = string("current_key_17_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> current_key_17_pad_0 = const()[name = string("current_key_17_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> current_key_17_dilations_0 = const()[name = string("current_key_17_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 current_key_17_groups_0 = const()[name = string("current_key_17_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 2048, 1, 1]> layers_4_self_attn_k_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(205652992))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(207750208))))[name = string("layers_4_self_attn_k_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> current_key_17_cast_fp16 = conv(dilations = current_key_17_dilations_0, groups = current_key_17_groups_0, pad = current_key_17_pad_0, pad_type = current_key_17_pad_type_0, strides = current_key_17_strides_0, weight = layers_4_self_attn_k_proj_weight_to_fp16_palettized, x = obj_37_cast_fp16)[name = string("current_key_17_cast_fp16")];
+            string current_value_9_pad_type_0 = const()[name = string("current_value_9_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> current_value_9_strides_0 = const()[name = string("current_value_9_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> current_value_9_pad_0 = const()[name = string("current_value_9_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> current_value_9_dilations_0 = const()[name = string("current_value_9_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 current_value_9_groups_0 = const()[name = string("current_value_9_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 2048, 1, 1]> layers_4_self_attn_v_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(207750784))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(209848000))))[name = string("layers_4_self_attn_v_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> current_value_9_cast_fp16 = conv(bias = layers_0_self_attn_v_proj_bias_to_fp16, dilations = current_value_9_dilations_0, groups = current_value_9_groups_0, pad = current_value_9_pad_0, pad_type = current_value_9_pad_type_0, strides = current_value_9_strides_0, weight = layers_4_self_attn_v_proj_weight_to_fp16_palettized, x = obj_37_cast_fp16)[name = string("current_value_9_cast_fp16")];
+            tensor<int32, [4]> var_1779 = const()[name = string("op_1779"), val = tensor<int32, [4]>([16, 128, 1, 1])];
+            tensor<fp16, [16, 128, 1, 1]> inputs_33_cast_fp16 = reshape(shape = var_1779, x = query_25_cast_fp16)[name = string("inputs_33_cast_fp16")];
+            tensor<fp16, [16, 128, 1, 1]> inputs_sq_35_cast_fp16 = mul(x = inputs_33_cast_fp16, y = inputs_33_cast_fp16)[name = string("inputs_sq_35_cast_fp16")];
+            tensor<int32, [1]> variance_35_axes_0 = const()[name = string("variance_35_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_35_keep_dims_0 = const()[name = string("variance_35_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [16, 1, 1, 1]> variance_35_cast_fp16 = reduce_mean(axes = variance_35_axes_0, keep_dims = variance_35_keep_dims_0, x = inputs_sq_35_cast_fp16)[name = string("variance_35_cast_fp16")];
+            fp16 var_1785_to_fp16 = const()[name = string("op_1785_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [16, 1, 1, 1]> var_1786_cast_fp16 = add(x = variance_35_cast_fp16, y = var_1785_to_fp16)[name = string("op_1786_cast_fp16")];
+            fp32 var_1787_epsilon_0 = const()[name = string("op_1787_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [16, 1, 1, 1]> var_1787_cast_fp16 = rsqrt(epsilon = var_1787_epsilon_0, x = var_1786_cast_fp16)[name = string("op_1787_cast_fp16")];
+            tensor<fp16, [16, 128, 1, 1]> hidden_states_43_cast_fp16 = mul(x = inputs_33_cast_fp16, y = var_1787_cast_fp16)[name = string("hidden_states_43_cast_fp16")];
+            tensor<fp16, [1, 128, 1, 1]> w_35_to_fp16 = const()[name = string("w_35_to_fp16"), val = tensor<fp16, [1, 128, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(209848576)))];
+            tensor<fp16, [16, 128, 1, 1]> query_normed_9_cast_fp16 = mul(x = w_35_to_fp16, y = hidden_states_43_cast_fp16)[name = string("query_normed_9_cast_fp16")];
+            tensor<int32, [4]> var_1795 = const()[name = string("op_1795"), val = tensor<int32, [4]>([8, 128, 1, 1])];
+            tensor<fp16, [8, 128, 1, 1]> inputs_35_cast_fp16 = reshape(shape = var_1795, x = current_key_17_cast_fp16)[name = string("inputs_35_cast_fp16")];
+            tensor<fp16, [8, 128, 1, 1]> inputs_sq_37_cast_fp16 = mul(x = inputs_35_cast_fp16, y = inputs_35_cast_fp16)[name = string("inputs_sq_37_cast_fp16")];
+            tensor<int32, [1]> variance_37_axes_0 = const()[name = string("variance_37_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_37_keep_dims_0 = const()[name = string("variance_37_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [8, 1, 1, 1]> variance_37_cast_fp16 = reduce_mean(axes = variance_37_axes_0, keep_dims = variance_37_keep_dims_0, x = inputs_sq_37_cast_fp16)[name = string("variance_37_cast_fp16")];
+            fp16 var_1801_to_fp16 = const()[name = string("op_1801_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [8, 1, 1, 1]> var_1802_cast_fp16 = add(x = variance_37_cast_fp16, y = var_1801_to_fp16)[name = string("op_1802_cast_fp16")];
+            fp32 var_1803_epsilon_0 = const()[name = string("op_1803_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [8, 1, 1, 1]> var_1803_cast_fp16 = rsqrt(epsilon = var_1803_epsilon_0, x = var_1802_cast_fp16)[name = string("op_1803_cast_fp16")];
+            tensor<fp16, [8, 128, 1, 1]> hidden_states_45_cast_fp16 = mul(x = inputs_35_cast_fp16, y = var_1803_cast_fp16)[name = string("hidden_states_45_cast_fp16")];
+            tensor<fp16, [1, 128, 1, 1]> w_37_to_fp16 = const()[name = string("w_37_to_fp16"), val = tensor<fp16, [1, 128, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(209848896)))];
+            tensor<fp16, [8, 128, 1, 1]> current_key_normed_9_cast_fp16 = mul(x = w_37_to_fp16, y = hidden_states_45_cast_fp16)[name = string("current_key_normed_9_cast_fp16")];
+            tensor<int32, [4]> var_1821 = const()[name = string("op_1821"), val = tensor<int32, [4]>([1, 16, 128, -1])];
+            tensor<fp16, [1, 16, 128, 1]> mh_q_25_cast_fp16 = reshape(shape = var_1821, x = query_normed_9_cast_fp16)[name = string("mh_q_25_cast_fp16")];
+            tensor<int32, [4]> var_1823 = const()[name = string("op_1823"), val = tensor<int32, [4]>([1, 8, 128, -1])];
+            tensor<fp16, [1, 8, 128, 1]> mh_k_17_cast_fp16 = reshape(shape = var_1823, x = current_key_normed_9_cast_fp16)[name = string("mh_k_17_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> var_1827_cast_fp16 = mul(x = mh_q_25_cast_fp16, y = cos_1_cast_fp16)[name = string("op_1827_cast_fp16")];
+            tensor<int32, [4]> var_1832_begin_0 = const()[name = string("op_1832_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_1832_end_0 = const()[name = string("op_1832_end_0"), val = tensor<int32, [4]>([1, 16, 64, 1])];
+            tensor<bool, [4]> var_1832_end_mask_0 = const()[name = string("op_1832_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 16, 64, 1]> var_1832_cast_fp16 = slice_by_index(begin = var_1832_begin_0, end = var_1832_end_0, end_mask = var_1832_end_mask_0, x = mh_q_25_cast_fp16)[name = string("op_1832_cast_fp16")];
+            tensor<int32, [4]> var_1838_begin_0 = const()[name = string("op_1838_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_1838_end_0 = const()[name = string("op_1838_end_0"), val = tensor<int32, [4]>([1, 16, 128, 1])];
+            tensor<bool, [4]> var_1838_end_mask_0 = const()[name = string("op_1838_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 16, 64, 1]> var_1838_cast_fp16 = slice_by_index(begin = var_1838_begin_0, end = var_1838_end_0, end_mask = var_1838_end_mask_0, x = mh_q_25_cast_fp16)[name = string("op_1838_cast_fp16")];
+            fp16 const_109_promoted_to_fp16 = const()[name = string("const_109_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 16, 64, 1]> var_1840_cast_fp16 = mul(x = var_1838_cast_fp16, y = const_109_promoted_to_fp16)[name = string("op_1840_cast_fp16")];
+            bool var_1842_interleave_0 = const()[name = string("op_1842_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 1]> var_1842_cast_fp16 = concat(axis = var_1720, interleave = var_1842_interleave_0, values = (var_1840_cast_fp16, var_1832_cast_fp16))[name = string("op_1842_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> var_1843_cast_fp16 = mul(x = var_1842_cast_fp16, y = sin_1_cast_fp16)[name = string("op_1843_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> mh_q_27_cast_fp16 = add(x = var_1827_cast_fp16, y = var_1843_cast_fp16)[name = string("mh_q_27_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> var_1845_cast_fp16 = mul(x = mh_k_17_cast_fp16, y = cos_1_cast_fp16)[name = string("op_1845_cast_fp16")];
+            tensor<int32, [4]> var_1850_begin_0 = const()[name = string("op_1850_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_1850_end_0 = const()[name = string("op_1850_end_0"), val = tensor<int32, [4]>([1, 8, 64, 1])];
+            tensor<bool, [4]> var_1850_end_mask_0 = const()[name = string("op_1850_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 8, 64, 1]> var_1850_cast_fp16 = slice_by_index(begin = var_1850_begin_0, end = var_1850_end_0, end_mask = var_1850_end_mask_0, x = mh_k_17_cast_fp16)[name = string("op_1850_cast_fp16")];
+            tensor<int32, [4]> var_1856_begin_0 = const()[name = string("op_1856_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_1856_end_0 = const()[name = string("op_1856_end_0"), val = tensor<int32, [4]>([1, 8, 128, 1])];
+            tensor<bool, [4]> var_1856_end_mask_0 = const()[name = string("op_1856_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 8, 64, 1]> var_1856_cast_fp16 = slice_by_index(begin = var_1856_begin_0, end = var_1856_end_0, end_mask = var_1856_end_mask_0, x = mh_k_17_cast_fp16)[name = string("op_1856_cast_fp16")];
+            fp16 const_112_promoted_to_fp16 = const()[name = string("const_112_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 64, 1]> var_1858_cast_fp16 = mul(x = var_1856_cast_fp16, y = const_112_promoted_to_fp16)[name = string("op_1858_cast_fp16")];
+            bool var_1860_interleave_0 = const()[name = string("op_1860_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 128, 1]> var_1860_cast_fp16 = concat(axis = var_1720, interleave = var_1860_interleave_0, values = (var_1858_cast_fp16, var_1850_cast_fp16))[name = string("op_1860_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> var_1861_cast_fp16 = mul(x = var_1860_cast_fp16, y = sin_1_cast_fp16)[name = string("op_1861_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> mh_k_19_cast_fp16 = add(x = var_1845_cast_fp16, y = var_1861_cast_fp16)[name = string("mh_k_19_cast_fp16")];
+            tensor<int32, [4]> var_1865 = const()[name = string("op_1865"), val = tensor<int32, [4]>([1, 1024, 1, 1])];
+            tensor<fp16, [1, 1024, 1, 1]> current_key_19_cast_fp16 = reshape(shape = var_1865, x = mh_k_19_cast_fp16)[name = string("current_key_19_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_1872_cast_fp16 = mul(x = var_101_cast_fp16_4, y = var_323_cast_fp16)[name = string("op_1872_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_1873_cast_fp16 = mul(x = current_key_19_cast_fp16, y = var_321_cast_fp16)[name = string("op_1873_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> key_27_cast_fp16 = add(x = var_1872_cast_fp16, y = var_1873_cast_fp16)[name = string("key_27_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_1876_cast_fp16 = mul(x = var_132_cast_fp16_4, y = var_323_cast_fp16)[name = string("op_1876_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_1877_cast_fp16 = mul(x = current_value_9_cast_fp16, y = var_321_cast_fp16)[name = string("op_1877_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> value_17_cast_fp16 = add(x = var_1876_cast_fp16, y = var_1877_cast_fp16)[name = string("value_17_cast_fp16")];
+            tensor<int32, [4]> var_1881 = const()[name = string("op_1881"), val = tensor<int32, [4]>([1, 8, 128, 256])];
+            tensor<fp16, [1, 8, 128, 256]> key_heads_17_cast_fp16 = reshape(shape = var_1881, x = key_27_cast_fp16)[name = string("key_heads_17_cast_fp16")];
+            tensor<int32, [4]> var_1883 = const()[name = string("op_1883"), val = tensor<int32, [4]>([1, 8, 128, 256])];
+            tensor<fp16, [1, 8, 128, 256]> value_heads_17_cast_fp16 = reshape(shape = var_1883, x = value_17_cast_fp16)[name = string("value_heads_17_cast_fp16")];
+            tensor<int32, [4]> var_1886_begin_0 = const()[name = string("op_1886_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_1886_end_0 = const()[name = string("op_1886_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_1886_end_mask_0 = const()[name = string("op_1886_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_1886_cast_fp16 = slice_by_index(begin = var_1886_begin_0, end = var_1886_end_0, end_mask = var_1886_end_mask_0, x = key_heads_17_cast_fp16)[name = string("op_1886_cast_fp16")];
+            tensor<int32, [4]> var_1890_begin_0 = const()[name = string("op_1890_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_1890_end_0 = const()[name = string("op_1890_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_1890_end_mask_0 = const()[name = string("op_1890_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_1890_cast_fp16 = slice_by_index(begin = var_1890_begin_0, end = var_1890_end_0, end_mask = var_1890_end_mask_0, x = value_heads_17_cast_fp16)[name = string("op_1890_cast_fp16")];
+            tensor<int32, [4]> var_1902_begin_0 = const()[name = string("op_1902_begin_0"), val = tensor<int32, [4]>([0, 1, 0, 0])];
+            tensor<int32, [4]> var_1902_end_0 = const()[name = string("op_1902_end_0"), val = tensor<int32, [4]>([1, 2, 128, 256])];
+            tensor<bool, [4]> var_1902_end_mask_0 = const()[name = string("op_1902_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_1902_cast_fp16 = slice_by_index(begin = var_1902_begin_0, end = var_1902_end_0, end_mask = var_1902_end_mask_0, x = key_heads_17_cast_fp16)[name = string("op_1902_cast_fp16")];
+            tensor<int32, [4]> var_1906_begin_0 = const()[name = string("op_1906_begin_0"), val = tensor<int32, [4]>([0, 1, 0, 0])];
+            tensor<int32, [4]> var_1906_end_0 = const()[name = string("op_1906_end_0"), val = tensor<int32, [4]>([1, 2, 128, 256])];
+            tensor<bool, [4]> var_1906_end_mask_0 = const()[name = string("op_1906_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_1906_cast_fp16 = slice_by_index(begin = var_1906_begin_0, end = var_1906_end_0, end_mask = var_1906_end_mask_0, x = value_heads_17_cast_fp16)[name = string("op_1906_cast_fp16")];
+            tensor<int32, [4]> var_1918_begin_0 = const()[name = string("op_1918_begin_0"), val = tensor<int32, [4]>([0, 2, 0, 0])];
+            tensor<int32, [4]> var_1918_end_0 = const()[name = string("op_1918_end_0"), val = tensor<int32, [4]>([1, 3, 128, 256])];
+            tensor<bool, [4]> var_1918_end_mask_0 = const()[name = string("op_1918_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_1918_cast_fp16 = slice_by_index(begin = var_1918_begin_0, end = var_1918_end_0, end_mask = var_1918_end_mask_0, x = key_heads_17_cast_fp16)[name = string("op_1918_cast_fp16")];
+            tensor<int32, [4]> var_1922_begin_0 = const()[name = string("op_1922_begin_0"), val = tensor<int32, [4]>([0, 2, 0, 0])];
+            tensor<int32, [4]> var_1922_end_0 = const()[name = string("op_1922_end_0"), val = tensor<int32, [4]>([1, 3, 128, 256])];
+            tensor<bool, [4]> var_1922_end_mask_0 = const()[name = string("op_1922_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_1922_cast_fp16 = slice_by_index(begin = var_1922_begin_0, end = var_1922_end_0, end_mask = var_1922_end_mask_0, x = value_heads_17_cast_fp16)[name = string("op_1922_cast_fp16")];
+            tensor<int32, [4]> var_1934_begin_0 = const()[name = string("op_1934_begin_0"), val = tensor<int32, [4]>([0, 3, 0, 0])];
+            tensor<int32, [4]> var_1934_end_0 = const()[name = string("op_1934_end_0"), val = tensor<int32, [4]>([1, 4, 128, 256])];
+            tensor<bool, [4]> var_1934_end_mask_0 = const()[name = string("op_1934_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_1934_cast_fp16 = slice_by_index(begin = var_1934_begin_0, end = var_1934_end_0, end_mask = var_1934_end_mask_0, x = key_heads_17_cast_fp16)[name = string("op_1934_cast_fp16")];
+            tensor<int32, [4]> var_1938_begin_0 = const()[name = string("op_1938_begin_0"), val = tensor<int32, [4]>([0, 3, 0, 0])];
+            tensor<int32, [4]> var_1938_end_0 = const()[name = string("op_1938_end_0"), val = tensor<int32, [4]>([1, 4, 128, 256])];
+            tensor<bool, [4]> var_1938_end_mask_0 = const()[name = string("op_1938_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_1938_cast_fp16 = slice_by_index(begin = var_1938_begin_0, end = var_1938_end_0, end_mask = var_1938_end_mask_0, x = value_heads_17_cast_fp16)[name = string("op_1938_cast_fp16")];
+            tensor<int32, [4]> var_1950_begin_0 = const()[name = string("op_1950_begin_0"), val = tensor<int32, [4]>([0, 4, 0, 0])];
+            tensor<int32, [4]> var_1950_end_0 = const()[name = string("op_1950_end_0"), val = tensor<int32, [4]>([1, 5, 128, 256])];
+            tensor<bool, [4]> var_1950_end_mask_0 = const()[name = string("op_1950_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_1950_cast_fp16 = slice_by_index(begin = var_1950_begin_0, end = var_1950_end_0, end_mask = var_1950_end_mask_0, x = key_heads_17_cast_fp16)[name = string("op_1950_cast_fp16")];
+            tensor<int32, [4]> var_1954_begin_0 = const()[name = string("op_1954_begin_0"), val = tensor<int32, [4]>([0, 4, 0, 0])];
+            tensor<int32, [4]> var_1954_end_0 = const()[name = string("op_1954_end_0"), val = tensor<int32, [4]>([1, 5, 128, 256])];
+            tensor<bool, [4]> var_1954_end_mask_0 = const()[name = string("op_1954_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_1954_cast_fp16 = slice_by_index(begin = var_1954_begin_0, end = var_1954_end_0, end_mask = var_1954_end_mask_0, x = value_heads_17_cast_fp16)[name = string("op_1954_cast_fp16")];
+            tensor<int32, [4]> var_1966_begin_0 = const()[name = string("op_1966_begin_0"), val = tensor<int32, [4]>([0, 5, 0, 0])];
+            tensor<int32, [4]> var_1966_end_0 = const()[name = string("op_1966_end_0"), val = tensor<int32, [4]>([1, 6, 128, 256])];
+            tensor<bool, [4]> var_1966_end_mask_0 = const()[name = string("op_1966_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_1966_cast_fp16 = slice_by_index(begin = var_1966_begin_0, end = var_1966_end_0, end_mask = var_1966_end_mask_0, x = key_heads_17_cast_fp16)[name = string("op_1966_cast_fp16")];
+            tensor<int32, [4]> var_1970_begin_0 = const()[name = string("op_1970_begin_0"), val = tensor<int32, [4]>([0, 5, 0, 0])];
+            tensor<int32, [4]> var_1970_end_0 = const()[name = string("op_1970_end_0"), val = tensor<int32, [4]>([1, 6, 128, 256])];
+            tensor<bool, [4]> var_1970_end_mask_0 = const()[name = string("op_1970_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_1970_cast_fp16 = slice_by_index(begin = var_1970_begin_0, end = var_1970_end_0, end_mask = var_1970_end_mask_0, x = value_heads_17_cast_fp16)[name = string("op_1970_cast_fp16")];
+            tensor<int32, [4]> var_1982_begin_0 = const()[name = string("op_1982_begin_0"), val = tensor<int32, [4]>([0, 6, 0, 0])];
+            tensor<int32, [4]> var_1982_end_0 = const()[name = string("op_1982_end_0"), val = tensor<int32, [4]>([1, 7, 128, 256])];
+            tensor<bool, [4]> var_1982_end_mask_0 = const()[name = string("op_1982_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_1982_cast_fp16 = slice_by_index(begin = var_1982_begin_0, end = var_1982_end_0, end_mask = var_1982_end_mask_0, x = key_heads_17_cast_fp16)[name = string("op_1982_cast_fp16")];
+            tensor<int32, [4]> var_1986_begin_0 = const()[name = string("op_1986_begin_0"), val = tensor<int32, [4]>([0, 6, 0, 0])];
+            tensor<int32, [4]> var_1986_end_0 = const()[name = string("op_1986_end_0"), val = tensor<int32, [4]>([1, 7, 128, 256])];
+            tensor<bool, [4]> var_1986_end_mask_0 = const()[name = string("op_1986_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_1986_cast_fp16 = slice_by_index(begin = var_1986_begin_0, end = var_1986_end_0, end_mask = var_1986_end_mask_0, x = value_heads_17_cast_fp16)[name = string("op_1986_cast_fp16")];
+            tensor<int32, [4]> var_1998_begin_0 = const()[name = string("op_1998_begin_0"), val = tensor<int32, [4]>([0, 7, 0, 0])];
+            tensor<int32, [4]> var_1998_end_0 = const()[name = string("op_1998_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_1998_end_mask_0 = const()[name = string("op_1998_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_1998_cast_fp16 = slice_by_index(begin = var_1998_begin_0, end = var_1998_end_0, end_mask = var_1998_end_mask_0, x = key_heads_17_cast_fp16)[name = string("op_1998_cast_fp16")];
+            tensor<int32, [4]> var_2002_begin_0 = const()[name = string("op_2002_begin_0"), val = tensor<int32, [4]>([0, 7, 0, 0])];
+            tensor<int32, [4]> var_2002_end_0 = const()[name = string("op_2002_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_2002_end_mask_0 = const()[name = string("op_2002_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_2002_cast_fp16 = slice_by_index(begin = var_2002_begin_0, end = var_2002_end_0, end_mask = var_2002_end_mask_0, x = value_heads_17_cast_fp16)[name = string("op_2002_cast_fp16")];
+            bool key_heads_19_interleave_0 = const()[name = string("key_heads_19_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 256]> key_heads_19_cast_fp16 = concat(axis = var_1728, interleave = key_heads_19_interleave_0, values = (var_1886_cast_fp16, var_1886_cast_fp16, var_1902_cast_fp16, var_1902_cast_fp16, var_1918_cast_fp16, var_1918_cast_fp16, var_1934_cast_fp16, var_1934_cast_fp16, var_1950_cast_fp16, var_1950_cast_fp16, var_1966_cast_fp16, var_1966_cast_fp16, var_1982_cast_fp16, var_1982_cast_fp16, var_1998_cast_fp16, var_1998_cast_fp16))[name = string("key_heads_19_cast_fp16")];
+            bool value_heads_19_interleave_0 = const()[name = string("value_heads_19_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 256]> value_heads_19_cast_fp16 = concat(axis = var_1728, interleave = value_heads_19_interleave_0, values = (var_1890_cast_fp16, var_1890_cast_fp16, var_1906_cast_fp16, var_1906_cast_fp16, var_1922_cast_fp16, var_1922_cast_fp16, var_1938_cast_fp16, var_1938_cast_fp16, var_1954_cast_fp16, var_1954_cast_fp16, var_1970_cast_fp16, var_1970_cast_fp16, var_1986_cast_fp16, var_1986_cast_fp16, var_2002_cast_fp16, var_2002_cast_fp16))[name = string("value_heads_19_cast_fp16")];
+            fp16 var_2025_to_fp16 = const()[name = string("op_2025_to_fp16"), val = fp16(0x1.6ap-4)];
+            tensor<fp16, [1, 16, 128, 1]> var_2026_cast_fp16 = mul(x = mh_q_27_cast_fp16, y = var_2025_to_fp16)[name = string("op_2026_cast_fp16")];
+            bool mh_w_17_transpose_x_0 = const()[name = string("mh_w_17_transpose_x_0"), val = bool(true)];
+            bool mh_w_17_transpose_y_0 = const()[name = string("mh_w_17_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 1, 256]> mh_w_17_cast_fp16 = matmul(transpose_x = mh_w_17_transpose_x_0, transpose_y = mh_w_17_transpose_y_0, x = var_2026_cast_fp16, y = key_heads_19_cast_fp16)[name = string("mh_w_17_cast_fp16")];
+            tensor<fp16, [1, 16, 1, 256]> mh_w_19_cast_fp16 = add(x = mh_w_17_cast_fp16, y = var_487_cast_fp16)[name = string("mh_w_19_cast_fp16")];
+            tensor<fp16, [1, 16, 1, 256]> var_2038_cast_fp16 = softmax(axis = var_1710, x = mh_w_19_cast_fp16)[name = string("op_2038_cast_fp16")];
+            bool attn_9_transpose_x_0 = const()[name = string("attn_9_transpose_x_0"), val = bool(false)];
+            bool attn_9_transpose_y_0 = const()[name = string("attn_9_transpose_y_0"), val = bool(true)];
+            tensor<fp16, [1, 16, 128, 1]> attn_9_cast_fp16 = matmul(transpose_x = attn_9_transpose_x_0, transpose_y = attn_9_transpose_y_0, x = value_heads_19_cast_fp16, y = var_2038_cast_fp16)[name = string("attn_9_cast_fp16")];
+            tensor<int32, [4]> var_2043 = const()[name = string("op_2043"), val = tensor<int32, [4]>([1, -1, 1, 1])];
+            tensor<fp16, [1, 2048, 1, 1]> input_33_cast_fp16 = reshape(shape = var_2043, x = attn_9_cast_fp16)[name = string("input_33_cast_fp16")];
+            string obj_43_pad_type_0 = const()[name = string("obj_43_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> obj_43_strides_0 = const()[name = string("obj_43_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> obj_43_pad_0 = const()[name = string("obj_43_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> obj_43_dilations_0 = const()[name = string("obj_43_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 obj_43_groups_0 = const()[name = string("obj_43_groups_0"), val = int32(1)];
+            tensor<fp16, [2048, 2048, 1, 1]> layers_4_self_attn_o_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [2048, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(209849216))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(214043584))))[name = string("layers_4_self_attn_o_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1, 1]> obj_43_cast_fp16 = conv(bias = layers_0_self_attn_q_proj_bias_to_fp16, dilations = obj_43_dilations_0, groups = obj_43_groups_0, pad = obj_43_pad_0, pad_type = obj_43_pad_type_0, strides = obj_43_strides_0, weight = layers_4_self_attn_o_proj_weight_to_fp16_palettized, x = input_33_cast_fp16)[name = string("obj_43_cast_fp16")];
+            tensor<fp16, [1, 2048, 1, 1]> inputs_37_cast_fp16 = add(x = inputs_31_cast_fp16, y = obj_43_cast_fp16)[name = string("inputs_37_cast_fp16")];
+            tensor<fp16, [1, 2048, 1, 1]> inputs_sq_39_cast_fp16 = mul(x = inputs_37_cast_fp16, y = inputs_37_cast_fp16)[name = string("inputs_sq_39_cast_fp16")];
+            tensor<int32, [1]> variance_39_axes_0 = const()[name = string("variance_39_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_39_keep_dims_0 = const()[name = string("variance_39_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 1, 1, 1]> variance_39_cast_fp16 = reduce_mean(axes = variance_39_axes_0, keep_dims = variance_39_keep_dims_0, x = inputs_sq_39_cast_fp16)[name = string("variance_39_cast_fp16")];
+            fp16 var_2061_to_fp16 = const()[name = string("op_2061_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 1, 1]> var_2062_cast_fp16 = add(x = variance_39_cast_fp16, y = var_2061_to_fp16)[name = string("op_2062_cast_fp16")];
+            fp32 var_2063_epsilon_0 = const()[name = string("op_2063_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 1, 1, 1]> var_2063_cast_fp16 = rsqrt(epsilon = var_2063_epsilon_0, x = var_2062_cast_fp16)[name = string("op_2063_cast_fp16")];
+            tensor<fp16, [1, 2048, 1, 1]> hidden_states_47_cast_fp16 = mul(x = inputs_37_cast_fp16, y = var_2063_cast_fp16)[name = string("hidden_states_47_cast_fp16")];
+            tensor<fp16, [1, 2048, 1, 1]> w_39_to_fp16 = const()[name = string("w_39_to_fp16"), val = tensor<fp16, [1, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(214044160)))];
+            tensor<fp16, [1, 2048, 1, 1]> input_35_cast_fp16 = mul(x = w_39_to_fp16, y = hidden_states_47_cast_fp16)[name = string("input_35_cast_fp16")];
+            string input_37_pad_type_0 = const()[name = string("input_37_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> input_37_strides_0 = const()[name = string("input_37_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> input_37_pad_0 = const()[name = string("input_37_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> input_37_dilations_0 = const()[name = string("input_37_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 input_37_groups_0 = const()[name = string("input_37_groups_0"), val = int32(1)];
+            tensor<fp16, [6144, 2048, 1, 1]> layers_4_mlp_gate_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [6144, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(214048320))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(226631296))))[name = string("layers_4_mlp_gate_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 6144, 1, 1]> input_37_cast_fp16 = conv(dilations = input_37_dilations_0, groups = input_37_groups_0, pad = input_37_pad_0, pad_type = input_37_pad_type_0, strides = input_37_strides_0, weight = layers_4_mlp_gate_proj_weight_to_fp16_palettized, x = input_35_cast_fp16)[name = string("input_37_cast_fp16")];
+            tensor<fp16, [1, 6144, 1, 1]> var_2077_cast_fp16 = silu(x = input_37_cast_fp16)[name = string("op_2077_cast_fp16")];
+            string var_2083_pad_type_0 = const()[name = string("op_2083_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_2083_strides_0 = const()[name = string("op_2083_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_2083_pad_0 = const()[name = string("op_2083_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_2083_dilations_0 = const()[name = string("op_2083_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_2083_groups_0 = const()[name = string("op_2083_groups_0"), val = int32(1)];
+            tensor<fp16, [6144, 2048, 1, 1]> layers_4_mlp_up_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [6144, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(226631872))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(239214848))))[name = string("layers_4_mlp_up_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 6144, 1, 1]> var_2083_cast_fp16 = conv(dilations = var_2083_dilations_0, groups = var_2083_groups_0, pad = var_2083_pad_0, pad_type = var_2083_pad_type_0, strides = var_2083_strides_0, weight = layers_4_mlp_up_proj_weight_to_fp16_palettized, x = input_35_cast_fp16)[name = string("op_2083_cast_fp16")];
+            tensor<fp16, [1, 6144, 1, 1]> input_39_cast_fp16 = mul(x = var_2077_cast_fp16, y = var_2083_cast_fp16)[name = string("input_39_cast_fp16")];
+            string hidden_states_49_pad_type_0 = const()[name = string("hidden_states_49_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> hidden_states_49_strides_0 = const()[name = string("hidden_states_49_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> hidden_states_49_pad_0 = const()[name = string("hidden_states_49_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> hidden_states_49_dilations_0 = const()[name = string("hidden_states_49_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 hidden_states_49_groups_0 = const()[name = string("hidden_states_49_groups_0"), val = int32(1)];
+            tensor<fp16, [2048, 6144, 1, 1]> layers_4_mlp_down_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [2048, 6144, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(239215424))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(251798400))))[name = string("layers_4_mlp_down_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1, 1]> hidden_states_49_cast_fp16 = conv(dilations = hidden_states_49_dilations_0, groups = hidden_states_49_groups_0, pad = hidden_states_49_pad_0, pad_type = hidden_states_49_pad_type_0, strides = hidden_states_49_strides_0, weight = layers_4_mlp_down_proj_weight_to_fp16_palettized, x = input_39_cast_fp16)[name = string("hidden_states_49_cast_fp16")];
+            tensor<fp16, [1, 2048, 1, 1]> inputs_39_cast_fp16 = add(x = inputs_37_cast_fp16, y = hidden_states_49_cast_fp16)[name = string("inputs_39_cast_fp16")];
+            int32 var_2097 = const()[name = string("op_2097"), val = int32(3)];
+            int32 var_2107 = const()[name = string("op_2107"), val = int32(-2)];
+            int32 var_2115 = const()[name = string("op_2115"), val = int32(1)];
+            tensor<fp16, [1, 2048, 1, 1]> inputs_sq_41_cast_fp16 = mul(x = inputs_39_cast_fp16, y = inputs_39_cast_fp16)[name = string("inputs_sq_41_cast_fp16")];
+            tensor<int32, [1]> variance_41_axes_0 = const()[name = string("variance_41_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_41_keep_dims_0 = const()[name = string("variance_41_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 1, 1, 1]> variance_41_cast_fp16 = reduce_mean(axes = variance_41_axes_0, keep_dims = variance_41_keep_dims_0, x = inputs_sq_41_cast_fp16)[name = string("variance_41_cast_fp16")];
+            fp16 var_2127_to_fp16 = const()[name = string("op_2127_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 1, 1]> var_2128_cast_fp16 = add(x = variance_41_cast_fp16, y = var_2127_to_fp16)[name = string("op_2128_cast_fp16")];
+            fp32 var_2129_epsilon_0 = const()[name = string("op_2129_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 1, 1, 1]> var_2129_cast_fp16 = rsqrt(epsilon = var_2129_epsilon_0, x = var_2128_cast_fp16)[name = string("op_2129_cast_fp16")];
+            tensor<fp16, [1, 2048, 1, 1]> hidden_states_51_cast_fp16 = mul(x = inputs_39_cast_fp16, y = var_2129_cast_fp16)[name = string("hidden_states_51_cast_fp16")];
+            tensor<fp16, [1, 2048, 1, 1]> w_41_to_fp16 = const()[name = string("w_41_to_fp16"), val = tensor<fp16, [1, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(251798976)))];
+            tensor<fp16, [1, 2048, 1, 1]> obj_45_cast_fp16 = mul(x = w_41_to_fp16, y = hidden_states_51_cast_fp16)[name = string("obj_45_cast_fp16")];
+            string query_31_pad_type_0 = const()[name = string("query_31_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> query_31_strides_0 = const()[name = string("query_31_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> query_31_pad_0 = const()[name = string("query_31_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> query_31_dilations_0 = const()[name = string("query_31_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 query_31_groups_0 = const()[name = string("query_31_groups_0"), val = int32(1)];
+            tensor<fp16, [2048, 2048, 1, 1]> layers_5_self_attn_q_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [2048, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(251803136))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(255997504))))[name = string("layers_5_self_attn_q_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1, 1]> query_31_cast_fp16 = conv(bias = layers_0_self_attn_q_proj_bias_to_fp16, dilations = query_31_dilations_0, groups = query_31_groups_0, pad = query_31_pad_0, pad_type = query_31_pad_type_0, strides = query_31_strides_0, weight = layers_5_self_attn_q_proj_weight_to_fp16_palettized, x = obj_45_cast_fp16)[name = string("query_31_cast_fp16")];
+            string current_key_21_pad_type_0 = const()[name = string("current_key_21_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> current_key_21_strides_0 = const()[name = string("current_key_21_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> current_key_21_pad_0 = const()[name = string("current_key_21_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> current_key_21_dilations_0 = const()[name = string("current_key_21_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 current_key_21_groups_0 = const()[name = string("current_key_21_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 2048, 1, 1]> layers_5_self_attn_k_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(255998080))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(258095296))))[name = string("layers_5_self_attn_k_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> current_key_21_cast_fp16 = conv(dilations = current_key_21_dilations_0, groups = current_key_21_groups_0, pad = current_key_21_pad_0, pad_type = current_key_21_pad_type_0, strides = current_key_21_strides_0, weight = layers_5_self_attn_k_proj_weight_to_fp16_palettized, x = obj_45_cast_fp16)[name = string("current_key_21_cast_fp16")];
+            string current_value_11_pad_type_0 = const()[name = string("current_value_11_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> current_value_11_strides_0 = const()[name = string("current_value_11_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> current_value_11_pad_0 = const()[name = string("current_value_11_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> current_value_11_dilations_0 = const()[name = string("current_value_11_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 current_value_11_groups_0 = const()[name = string("current_value_11_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 2048, 1, 1]> layers_5_self_attn_v_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(258095872))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(260193088))))[name = string("layers_5_self_attn_v_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> current_value_11_cast_fp16 = conv(bias = layers_0_self_attn_v_proj_bias_to_fp16, dilations = current_value_11_dilations_0, groups = current_value_11_groups_0, pad = current_value_11_pad_0, pad_type = current_value_11_pad_type_0, strides = current_value_11_strides_0, weight = layers_5_self_attn_v_proj_weight_to_fp16_palettized, x = obj_45_cast_fp16)[name = string("current_value_11_cast_fp16")];
+            tensor<int32, [4]> var_2166 = const()[name = string("op_2166"), val = tensor<int32, [4]>([16, 128, 1, 1])];
+            tensor<fp16, [16, 128, 1, 1]> inputs_41_cast_fp16 = reshape(shape = var_2166, x = query_31_cast_fp16)[name = string("inputs_41_cast_fp16")];
+            tensor<fp16, [16, 128, 1, 1]> inputs_sq_43_cast_fp16 = mul(x = inputs_41_cast_fp16, y = inputs_41_cast_fp16)[name = string("inputs_sq_43_cast_fp16")];
+            tensor<int32, [1]> variance_43_axes_0 = const()[name = string("variance_43_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_43_keep_dims_0 = const()[name = string("variance_43_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [16, 1, 1, 1]> variance_43_cast_fp16 = reduce_mean(axes = variance_43_axes_0, keep_dims = variance_43_keep_dims_0, x = inputs_sq_43_cast_fp16)[name = string("variance_43_cast_fp16")];
+            fp16 var_2172_to_fp16 = const()[name = string("op_2172_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [16, 1, 1, 1]> var_2173_cast_fp16 = add(x = variance_43_cast_fp16, y = var_2172_to_fp16)[name = string("op_2173_cast_fp16")];
+            fp32 var_2174_epsilon_0 = const()[name = string("op_2174_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [16, 1, 1, 1]> var_2174_cast_fp16 = rsqrt(epsilon = var_2174_epsilon_0, x = var_2173_cast_fp16)[name = string("op_2174_cast_fp16")];
+            tensor<fp16, [16, 128, 1, 1]> hidden_states_53_cast_fp16 = mul(x = inputs_41_cast_fp16, y = var_2174_cast_fp16)[name = string("hidden_states_53_cast_fp16")];
+            tensor<fp16, [1, 128, 1, 1]> w_43_to_fp16 = const()[name = string("w_43_to_fp16"), val = tensor<fp16, [1, 128, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(260193664)))];
+            tensor<fp16, [16, 128, 1, 1]> query_normed_11_cast_fp16 = mul(x = w_43_to_fp16, y = hidden_states_53_cast_fp16)[name = string("query_normed_11_cast_fp16")];
+            tensor<int32, [4]> var_2182 = const()[name = string("op_2182"), val = tensor<int32, [4]>([8, 128, 1, 1])];
+            tensor<fp16, [8, 128, 1, 1]> inputs_43_cast_fp16 = reshape(shape = var_2182, x = current_key_21_cast_fp16)[name = string("inputs_43_cast_fp16")];
+            tensor<fp16, [8, 128, 1, 1]> inputs_sq_45_cast_fp16 = mul(x = inputs_43_cast_fp16, y = inputs_43_cast_fp16)[name = string("inputs_sq_45_cast_fp16")];
+            tensor<int32, [1]> variance_45_axes_0 = const()[name = string("variance_45_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_45_keep_dims_0 = const()[name = string("variance_45_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [8, 1, 1, 1]> variance_45_cast_fp16 = reduce_mean(axes = variance_45_axes_0, keep_dims = variance_45_keep_dims_0, x = inputs_sq_45_cast_fp16)[name = string("variance_45_cast_fp16")];
+            fp16 var_2188_to_fp16 = const()[name = string("op_2188_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [8, 1, 1, 1]> var_2189_cast_fp16 = add(x = variance_45_cast_fp16, y = var_2188_to_fp16)[name = string("op_2189_cast_fp16")];
+            fp32 var_2190_epsilon_0 = const()[name = string("op_2190_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [8, 1, 1, 1]> var_2190_cast_fp16 = rsqrt(epsilon = var_2190_epsilon_0, x = var_2189_cast_fp16)[name = string("op_2190_cast_fp16")];
+            tensor<fp16, [8, 128, 1, 1]> hidden_states_55_cast_fp16 = mul(x = inputs_43_cast_fp16, y = var_2190_cast_fp16)[name = string("hidden_states_55_cast_fp16")];
+            tensor<fp16, [1, 128, 1, 1]> w_45_to_fp16 = const()[name = string("w_45_to_fp16"), val = tensor<fp16, [1, 128, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(260193984)))];
+            tensor<fp16, [8, 128, 1, 1]> current_key_normed_11_cast_fp16 = mul(x = w_45_to_fp16, y = hidden_states_55_cast_fp16)[name = string("current_key_normed_11_cast_fp16")];
+            tensor<int32, [4]> var_2208 = const()[name = string("op_2208"), val = tensor<int32, [4]>([1, 16, 128, -1])];
+            tensor<fp16, [1, 16, 128, 1]> mh_q_31_cast_fp16 = reshape(shape = var_2208, x = query_normed_11_cast_fp16)[name = string("mh_q_31_cast_fp16")];
+            tensor<int32, [4]> var_2210 = const()[name = string("op_2210"), val = tensor<int32, [4]>([1, 8, 128, -1])];
+            tensor<fp16, [1, 8, 128, 1]> mh_k_21_cast_fp16 = reshape(shape = var_2210, x = current_key_normed_11_cast_fp16)[name = string("mh_k_21_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> var_2214_cast_fp16 = mul(x = mh_q_31_cast_fp16, y = cos_1_cast_fp16)[name = string("op_2214_cast_fp16")];
+            tensor<int32, [4]> var_2219_begin_0 = const()[name = string("op_2219_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_2219_end_0 = const()[name = string("op_2219_end_0"), val = tensor<int32, [4]>([1, 16, 64, 1])];
+            tensor<bool, [4]> var_2219_end_mask_0 = const()[name = string("op_2219_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 16, 64, 1]> var_2219_cast_fp16 = slice_by_index(begin = var_2219_begin_0, end = var_2219_end_0, end_mask = var_2219_end_mask_0, x = mh_q_31_cast_fp16)[name = string("op_2219_cast_fp16")];
+            tensor<int32, [4]> var_2225_begin_0 = const()[name = string("op_2225_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_2225_end_0 = const()[name = string("op_2225_end_0"), val = tensor<int32, [4]>([1, 16, 128, 1])];
+            tensor<bool, [4]> var_2225_end_mask_0 = const()[name = string("op_2225_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 16, 64, 1]> var_2225_cast_fp16 = slice_by_index(begin = var_2225_begin_0, end = var_2225_end_0, end_mask = var_2225_end_mask_0, x = mh_q_31_cast_fp16)[name = string("op_2225_cast_fp16")];
+            fp16 const_132_promoted_to_fp16 = const()[name = string("const_132_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 16, 64, 1]> var_2227_cast_fp16 = mul(x = var_2225_cast_fp16, y = const_132_promoted_to_fp16)[name = string("op_2227_cast_fp16")];
+            bool var_2229_interleave_0 = const()[name = string("op_2229_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 1]> var_2229_cast_fp16 = concat(axis = var_2107, interleave = var_2229_interleave_0, values = (var_2227_cast_fp16, var_2219_cast_fp16))[name = string("op_2229_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> var_2230_cast_fp16 = mul(x = var_2229_cast_fp16, y = sin_1_cast_fp16)[name = string("op_2230_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> mh_q_33_cast_fp16 = add(x = var_2214_cast_fp16, y = var_2230_cast_fp16)[name = string("mh_q_33_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> var_2232_cast_fp16 = mul(x = mh_k_21_cast_fp16, y = cos_1_cast_fp16)[name = string("op_2232_cast_fp16")];
+            tensor<int32, [4]> var_2237_begin_0 = const()[name = string("op_2237_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_2237_end_0 = const()[name = string("op_2237_end_0"), val = tensor<int32, [4]>([1, 8, 64, 1])];
+            tensor<bool, [4]> var_2237_end_mask_0 = const()[name = string("op_2237_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 8, 64, 1]> var_2237_cast_fp16 = slice_by_index(begin = var_2237_begin_0, end = var_2237_end_0, end_mask = var_2237_end_mask_0, x = mh_k_21_cast_fp16)[name = string("op_2237_cast_fp16")];
+            tensor<int32, [4]> var_2243_begin_0 = const()[name = string("op_2243_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_2243_end_0 = const()[name = string("op_2243_end_0"), val = tensor<int32, [4]>([1, 8, 128, 1])];
+            tensor<bool, [4]> var_2243_end_mask_0 = const()[name = string("op_2243_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 8, 64, 1]> var_2243_cast_fp16 = slice_by_index(begin = var_2243_begin_0, end = var_2243_end_0, end_mask = var_2243_end_mask_0, x = mh_k_21_cast_fp16)[name = string("op_2243_cast_fp16")];
+            fp16 const_135_promoted_to_fp16 = const()[name = string("const_135_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 64, 1]> var_2245_cast_fp16 = mul(x = var_2243_cast_fp16, y = const_135_promoted_to_fp16)[name = string("op_2245_cast_fp16")];
+            bool var_2247_interleave_0 = const()[name = string("op_2247_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 128, 1]> var_2247_cast_fp16 = concat(axis = var_2107, interleave = var_2247_interleave_0, values = (var_2245_cast_fp16, var_2237_cast_fp16))[name = string("op_2247_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> var_2248_cast_fp16 = mul(x = var_2247_cast_fp16, y = sin_1_cast_fp16)[name = string("op_2248_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> mh_k_23_cast_fp16 = add(x = var_2232_cast_fp16, y = var_2248_cast_fp16)[name = string("mh_k_23_cast_fp16")];
+            tensor<int32, [4]> var_2252 = const()[name = string("op_2252"), val = tensor<int32, [4]>([1, 1024, 1, 1])];
+            tensor<fp16, [1, 1024, 1, 1]> current_key_23_cast_fp16 = reshape(shape = var_2252, x = mh_k_23_cast_fp16)[name = string("current_key_23_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_2259_cast_fp16 = mul(x = var_101_cast_fp16_5, y = var_323_cast_fp16)[name = string("op_2259_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_2260_cast_fp16 = mul(x = current_key_23_cast_fp16, y = var_321_cast_fp16)[name = string("op_2260_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> key_33_cast_fp16 = add(x = var_2259_cast_fp16, y = var_2260_cast_fp16)[name = string("key_33_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_2263_cast_fp16 = mul(x = var_132_cast_fp16_5, y = var_323_cast_fp16)[name = string("op_2263_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_2264_cast_fp16 = mul(x = current_value_11_cast_fp16, y = var_321_cast_fp16)[name = string("op_2264_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> value_21_cast_fp16 = add(x = var_2263_cast_fp16, y = var_2264_cast_fp16)[name = string("value_21_cast_fp16")];
+            tensor<int32, [4]> var_2268 = const()[name = string("op_2268"), val = tensor<int32, [4]>([1, 8, 128, 256])];
+            tensor<fp16, [1, 8, 128, 256]> key_heads_21_cast_fp16 = reshape(shape = var_2268, x = key_33_cast_fp16)[name = string("key_heads_21_cast_fp16")];
+            tensor<int32, [4]> var_2270 = const()[name = string("op_2270"), val = tensor<int32, [4]>([1, 8, 128, 256])];
+            tensor<fp16, [1, 8, 128, 256]> value_heads_21_cast_fp16 = reshape(shape = var_2270, x = value_21_cast_fp16)[name = string("value_heads_21_cast_fp16")];
+            tensor<int32, [4]> var_2273_begin_0 = const()[name = string("op_2273_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_2273_end_0 = const()[name = string("op_2273_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_2273_end_mask_0 = const()[name = string("op_2273_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_2273_cast_fp16 = slice_by_index(begin = var_2273_begin_0, end = var_2273_end_0, end_mask = var_2273_end_mask_0, x = key_heads_21_cast_fp16)[name = string("op_2273_cast_fp16")];
+            tensor<int32, [4]> var_2277_begin_0 = const()[name = string("op_2277_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_2277_end_0 = const()[name = string("op_2277_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_2277_end_mask_0 = const()[name = string("op_2277_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_2277_cast_fp16 = slice_by_index(begin = var_2277_begin_0, end = var_2277_end_0, end_mask = var_2277_end_mask_0, x = value_heads_21_cast_fp16)[name = string("op_2277_cast_fp16")];
+            tensor<int32, [4]> var_2289_begin_0 = const()[name = string("op_2289_begin_0"), val = tensor<int32, [4]>([0, 1, 0, 0])];
+            tensor<int32, [4]> var_2289_end_0 = const()[name = string("op_2289_end_0"), val = tensor<int32, [4]>([1, 2, 128, 256])];
+            tensor<bool, [4]> var_2289_end_mask_0 = const()[name = string("op_2289_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_2289_cast_fp16 = slice_by_index(begin = var_2289_begin_0, end = var_2289_end_0, end_mask = var_2289_end_mask_0, x = key_heads_21_cast_fp16)[name = string("op_2289_cast_fp16")];
+            tensor<int32, [4]> var_2293_begin_0 = const()[name = string("op_2293_begin_0"), val = tensor<int32, [4]>([0, 1, 0, 0])];
+            tensor<int32, [4]> var_2293_end_0 = const()[name = string("op_2293_end_0"), val = tensor<int32, [4]>([1, 2, 128, 256])];
+            tensor<bool, [4]> var_2293_end_mask_0 = const()[name = string("op_2293_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_2293_cast_fp16 = slice_by_index(begin = var_2293_begin_0, end = var_2293_end_0, end_mask = var_2293_end_mask_0, x = value_heads_21_cast_fp16)[name = string("op_2293_cast_fp16")];
+            tensor<int32, [4]> var_2305_begin_0 = const()[name = string("op_2305_begin_0"), val = tensor<int32, [4]>([0, 2, 0, 0])];
+            tensor<int32, [4]> var_2305_end_0 = const()[name = string("op_2305_end_0"), val = tensor<int32, [4]>([1, 3, 128, 256])];
+            tensor<bool, [4]> var_2305_end_mask_0 = const()[name = string("op_2305_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_2305_cast_fp16 = slice_by_index(begin = var_2305_begin_0, end = var_2305_end_0, end_mask = var_2305_end_mask_0, x = key_heads_21_cast_fp16)[name = string("op_2305_cast_fp16")];
+            tensor<int32, [4]> var_2309_begin_0 = const()[name = string("op_2309_begin_0"), val = tensor<int32, [4]>([0, 2, 0, 0])];
+            tensor<int32, [4]> var_2309_end_0 = const()[name = string("op_2309_end_0"), val = tensor<int32, [4]>([1, 3, 128, 256])];
+            tensor<bool, [4]> var_2309_end_mask_0 = const()[name = string("op_2309_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_2309_cast_fp16 = slice_by_index(begin = var_2309_begin_0, end = var_2309_end_0, end_mask = var_2309_end_mask_0, x = value_heads_21_cast_fp16)[name = string("op_2309_cast_fp16")];
+            tensor<int32, [4]> var_2321_begin_0 = const()[name = string("op_2321_begin_0"), val = tensor<int32, [4]>([0, 3, 0, 0])];
+            tensor<int32, [4]> var_2321_end_0 = const()[name = string("op_2321_end_0"), val = tensor<int32, [4]>([1, 4, 128, 256])];
+            tensor<bool, [4]> var_2321_end_mask_0 = const()[name = string("op_2321_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_2321_cast_fp16 = slice_by_index(begin = var_2321_begin_0, end = var_2321_end_0, end_mask = var_2321_end_mask_0, x = key_heads_21_cast_fp16)[name = string("op_2321_cast_fp16")];
+            tensor<int32, [4]> var_2325_begin_0 = const()[name = string("op_2325_begin_0"), val = tensor<int32, [4]>([0, 3, 0, 0])];
+            tensor<int32, [4]> var_2325_end_0 = const()[name = string("op_2325_end_0"), val = tensor<int32, [4]>([1, 4, 128, 256])];
+            tensor<bool, [4]> var_2325_end_mask_0 = const()[name = string("op_2325_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_2325_cast_fp16 = slice_by_index(begin = var_2325_begin_0, end = var_2325_end_0, end_mask = var_2325_end_mask_0, x = value_heads_21_cast_fp16)[name = string("op_2325_cast_fp16")];
+            tensor<int32, [4]> var_2337_begin_0 = const()[name = string("op_2337_begin_0"), val = tensor<int32, [4]>([0, 4, 0, 0])];
+            tensor<int32, [4]> var_2337_end_0 = const()[name = string("op_2337_end_0"), val = tensor<int32, [4]>([1, 5, 128, 256])];
+            tensor<bool, [4]> var_2337_end_mask_0 = const()[name = string("op_2337_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_2337_cast_fp16 = slice_by_index(begin = var_2337_begin_0, end = var_2337_end_0, end_mask = var_2337_end_mask_0, x = key_heads_21_cast_fp16)[name = string("op_2337_cast_fp16")];
+            tensor<int32, [4]> var_2341_begin_0 = const()[name = string("op_2341_begin_0"), val = tensor<int32, [4]>([0, 4, 0, 0])];
+            tensor<int32, [4]> var_2341_end_0 = const()[name = string("op_2341_end_0"), val = tensor<int32, [4]>([1, 5, 128, 256])];
+            tensor<bool, [4]> var_2341_end_mask_0 = const()[name = string("op_2341_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_2341_cast_fp16 = slice_by_index(begin = var_2341_begin_0, end = var_2341_end_0, end_mask = var_2341_end_mask_0, x = value_heads_21_cast_fp16)[name = string("op_2341_cast_fp16")];
+            tensor<int32, [4]> var_2353_begin_0 = const()[name = string("op_2353_begin_0"), val = tensor<int32, [4]>([0, 5, 0, 0])];
+            tensor<int32, [4]> var_2353_end_0 = const()[name = string("op_2353_end_0"), val = tensor<int32, [4]>([1, 6, 128, 256])];
+            tensor<bool, [4]> var_2353_end_mask_0 = const()[name = string("op_2353_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_2353_cast_fp16 = slice_by_index(begin = var_2353_begin_0, end = var_2353_end_0, end_mask = var_2353_end_mask_0, x = key_heads_21_cast_fp16)[name = string("op_2353_cast_fp16")];
+            tensor<int32, [4]> var_2357_begin_0 = const()[name = string("op_2357_begin_0"), val = tensor<int32, [4]>([0, 5, 0, 0])];
+            tensor<int32, [4]> var_2357_end_0 = const()[name = string("op_2357_end_0"), val = tensor<int32, [4]>([1, 6, 128, 256])];
+            tensor<bool, [4]> var_2357_end_mask_0 = const()[name = string("op_2357_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_2357_cast_fp16 = slice_by_index(begin = var_2357_begin_0, end = var_2357_end_0, end_mask = var_2357_end_mask_0, x = value_heads_21_cast_fp16)[name = string("op_2357_cast_fp16")];
+            tensor<int32, [4]> var_2369_begin_0 = const()[name = string("op_2369_begin_0"), val = tensor<int32, [4]>([0, 6, 0, 0])];
+            tensor<int32, [4]> var_2369_end_0 = const()[name = string("op_2369_end_0"), val = tensor<int32, [4]>([1, 7, 128, 256])];
+            tensor<bool, [4]> var_2369_end_mask_0 = const()[name = string("op_2369_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_2369_cast_fp16 = slice_by_index(begin = var_2369_begin_0, end = var_2369_end_0, end_mask = var_2369_end_mask_0, x = key_heads_21_cast_fp16)[name = string("op_2369_cast_fp16")];
+            tensor<int32, [4]> var_2373_begin_0 = const()[name = string("op_2373_begin_0"), val = tensor<int32, [4]>([0, 6, 0, 0])];
+            tensor<int32, [4]> var_2373_end_0 = const()[name = string("op_2373_end_0"), val = tensor<int32, [4]>([1, 7, 128, 256])];
+            tensor<bool, [4]> var_2373_end_mask_0 = const()[name = string("op_2373_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_2373_cast_fp16 = slice_by_index(begin = var_2373_begin_0, end = var_2373_end_0, end_mask = var_2373_end_mask_0, x = value_heads_21_cast_fp16)[name = string("op_2373_cast_fp16")];
+            tensor<int32, [4]> var_2385_begin_0 = const()[name = string("op_2385_begin_0"), val = tensor<int32, [4]>([0, 7, 0, 0])];
+            tensor<int32, [4]> var_2385_end_0 = const()[name = string("op_2385_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_2385_end_mask_0 = const()[name = string("op_2385_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_2385_cast_fp16 = slice_by_index(begin = var_2385_begin_0, end = var_2385_end_0, end_mask = var_2385_end_mask_0, x = key_heads_21_cast_fp16)[name = string("op_2385_cast_fp16")];
+            tensor<int32, [4]> var_2389_begin_0 = const()[name = string("op_2389_begin_0"), val = tensor<int32, [4]>([0, 7, 0, 0])];
+            tensor<int32, [4]> var_2389_end_0 = const()[name = string("op_2389_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_2389_end_mask_0 = const()[name = string("op_2389_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_2389_cast_fp16 = slice_by_index(begin = var_2389_begin_0, end = var_2389_end_0, end_mask = var_2389_end_mask_0, x = value_heads_21_cast_fp16)[name = string("op_2389_cast_fp16")];
+            bool key_heads_23_interleave_0 = const()[name = string("key_heads_23_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 256]> key_heads_23_cast_fp16 = concat(axis = var_2115, interleave = key_heads_23_interleave_0, values = (var_2273_cast_fp16, var_2273_cast_fp16, var_2289_cast_fp16, var_2289_cast_fp16, var_2305_cast_fp16, var_2305_cast_fp16, var_2321_cast_fp16, var_2321_cast_fp16, var_2337_cast_fp16, var_2337_cast_fp16, var_2353_cast_fp16, var_2353_cast_fp16, var_2369_cast_fp16, var_2369_cast_fp16, var_2385_cast_fp16, var_2385_cast_fp16))[name = string("key_heads_23_cast_fp16")];
+            bool value_heads_23_interleave_0 = const()[name = string("value_heads_23_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 256]> value_heads_23_cast_fp16 = concat(axis = var_2115, interleave = value_heads_23_interleave_0, values = (var_2277_cast_fp16, var_2277_cast_fp16, var_2293_cast_fp16, var_2293_cast_fp16, var_2309_cast_fp16, var_2309_cast_fp16, var_2325_cast_fp16, var_2325_cast_fp16, var_2341_cast_fp16, var_2341_cast_fp16, var_2357_cast_fp16, var_2357_cast_fp16, var_2373_cast_fp16, var_2373_cast_fp16, var_2389_cast_fp16, var_2389_cast_fp16))[name = string("value_heads_23_cast_fp16")];
+            fp16 var_2412_to_fp16 = const()[name = string("op_2412_to_fp16"), val = fp16(0x1.6ap-4)];
+            tensor<fp16, [1, 16, 128, 1]> var_2413_cast_fp16 = mul(x = mh_q_33_cast_fp16, y = var_2412_to_fp16)[name = string("op_2413_cast_fp16")];
+            bool mh_w_21_transpose_x_0 = const()[name = string("mh_w_21_transpose_x_0"), val = bool(true)];
+            bool mh_w_21_transpose_y_0 = const()[name = string("mh_w_21_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 1, 256]> mh_w_21_cast_fp16 = matmul(transpose_x = mh_w_21_transpose_x_0, transpose_y = mh_w_21_transpose_y_0, x = var_2413_cast_fp16, y = key_heads_23_cast_fp16)[name = string("mh_w_21_cast_fp16")];
+            tensor<fp16, [1, 16, 1, 256]> mh_w_23_cast_fp16 = add(x = mh_w_21_cast_fp16, y = var_487_cast_fp16)[name = string("mh_w_23_cast_fp16")];
+            tensor<fp16, [1, 16, 1, 256]> var_2425_cast_fp16 = softmax(axis = var_2097, x = mh_w_23_cast_fp16)[name = string("op_2425_cast_fp16")];
+            bool attn_11_transpose_x_0 = const()[name = string("attn_11_transpose_x_0"), val = bool(false)];
+            bool attn_11_transpose_y_0 = const()[name = string("attn_11_transpose_y_0"), val = bool(true)];
+            tensor<fp16, [1, 16, 128, 1]> attn_11_cast_fp16 = matmul(transpose_x = attn_11_transpose_x_0, transpose_y = attn_11_transpose_y_0, x = value_heads_23_cast_fp16, y = var_2425_cast_fp16)[name = string("attn_11_cast_fp16")];
+            tensor<int32, [4]> var_2430 = const()[name = string("op_2430"), val = tensor<int32, [4]>([1, -1, 1, 1])];
+            tensor<fp16, [1, 2048, 1, 1]> input_41_cast_fp16 = reshape(shape = var_2430, x = attn_11_cast_fp16)[name = string("input_41_cast_fp16")];
+            string obj_51_pad_type_0 = const()[name = string("obj_51_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> obj_51_strides_0 = const()[name = string("obj_51_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> obj_51_pad_0 = const()[name = string("obj_51_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> obj_51_dilations_0 = const()[name = string("obj_51_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 obj_51_groups_0 = const()[name = string("obj_51_groups_0"), val = int32(1)];
+            tensor<fp16, [2048, 2048, 1, 1]> layers_5_self_attn_o_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [2048, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(260194304))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(264388672))))[name = string("layers_5_self_attn_o_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1, 1]> obj_51_cast_fp16 = conv(bias = layers_0_self_attn_q_proj_bias_to_fp16, dilations = obj_51_dilations_0, groups = obj_51_groups_0, pad = obj_51_pad_0, pad_type = obj_51_pad_type_0, strides = obj_51_strides_0, weight = layers_5_self_attn_o_proj_weight_to_fp16_palettized, x = input_41_cast_fp16)[name = string("obj_51_cast_fp16")];
+            tensor<fp16, [1, 2048, 1, 1]> inputs_45_cast_fp16 = add(x = inputs_39_cast_fp16, y = obj_51_cast_fp16)[name = string("inputs_45_cast_fp16")];
+            tensor<fp16, [1, 2048, 1, 1]> inputs_sq_47_cast_fp16 = mul(x = inputs_45_cast_fp16, y = inputs_45_cast_fp16)[name = string("inputs_sq_47_cast_fp16")];
+            tensor<int32, [1]> variance_47_axes_0 = const()[name = string("variance_47_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_47_keep_dims_0 = const()[name = string("variance_47_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 1, 1, 1]> variance_47_cast_fp16 = reduce_mean(axes = variance_47_axes_0, keep_dims = variance_47_keep_dims_0, x = inputs_sq_47_cast_fp16)[name = string("variance_47_cast_fp16")];
+            fp16 var_2448_to_fp16 = const()[name = string("op_2448_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 1, 1]> var_2449_cast_fp16 = add(x = variance_47_cast_fp16, y = var_2448_to_fp16)[name = string("op_2449_cast_fp16")];
+            fp32 var_2450_epsilon_0 = const()[name = string("op_2450_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 1, 1, 1]> var_2450_cast_fp16 = rsqrt(epsilon = var_2450_epsilon_0, x = var_2449_cast_fp16)[name = string("op_2450_cast_fp16")];
+            tensor<fp16, [1, 2048, 1, 1]> hidden_states_57_cast_fp16 = mul(x = inputs_45_cast_fp16, y = var_2450_cast_fp16)[name = string("hidden_states_57_cast_fp16")];
+            tensor<fp16, [1, 2048, 1, 1]> w_47_to_fp16 = const()[name = string("w_47_to_fp16"), val = tensor<fp16, [1, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(264389248)))];
+            tensor<fp16, [1, 2048, 1, 1]> input_43_cast_fp16 = mul(x = w_47_to_fp16, y = hidden_states_57_cast_fp16)[name = string("input_43_cast_fp16")];
+            string input_45_pad_type_0 = const()[name = string("input_45_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> input_45_strides_0 = const()[name = string("input_45_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> input_45_pad_0 = const()[name = string("input_45_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> input_45_dilations_0 = const()[name = string("input_45_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 input_45_groups_0 = const()[name = string("input_45_groups_0"), val = int32(1)];
+            tensor<fp16, [6144, 2048, 1, 1]> layers_5_mlp_gate_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [6144, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(264393408))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(276976384))))[name = string("layers_5_mlp_gate_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 6144, 1, 1]> input_45_cast_fp16 = conv(dilations = input_45_dilations_0, groups = input_45_groups_0, pad = input_45_pad_0, pad_type = input_45_pad_type_0, strides = input_45_strides_0, weight = layers_5_mlp_gate_proj_weight_to_fp16_palettized, x = input_43_cast_fp16)[name = string("input_45_cast_fp16")];
+            tensor<fp16, [1, 6144, 1, 1]> var_2464_cast_fp16 = silu(x = input_45_cast_fp16)[name = string("op_2464_cast_fp16")];
+            string var_2470_pad_type_0 = const()[name = string("op_2470_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_2470_strides_0 = const()[name = string("op_2470_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_2470_pad_0 = const()[name = string("op_2470_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_2470_dilations_0 = const()[name = string("op_2470_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_2470_groups_0 = const()[name = string("op_2470_groups_0"), val = int32(1)];
+            tensor<fp16, [6144, 2048, 1, 1]> layers_5_mlp_up_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [6144, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(276976960))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(289559936))))[name = string("layers_5_mlp_up_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 6144, 1, 1]> var_2470_cast_fp16 = conv(dilations = var_2470_dilations_0, groups = var_2470_groups_0, pad = var_2470_pad_0, pad_type = var_2470_pad_type_0, strides = var_2470_strides_0, weight = layers_5_mlp_up_proj_weight_to_fp16_palettized, x = input_43_cast_fp16)[name = string("op_2470_cast_fp16")];
+            tensor<fp16, [1, 6144, 1, 1]> input_47_cast_fp16 = mul(x = var_2464_cast_fp16, y = var_2470_cast_fp16)[name = string("input_47_cast_fp16")];
+            string hidden_states_59_pad_type_0 = const()[name = string("hidden_states_59_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> hidden_states_59_strides_0 = const()[name = string("hidden_states_59_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> hidden_states_59_pad_0 = const()[name = string("hidden_states_59_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> hidden_states_59_dilations_0 = const()[name = string("hidden_states_59_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 hidden_states_59_groups_0 = const()[name = string("hidden_states_59_groups_0"), val = int32(1)];
+            tensor<fp16, [2048, 6144, 1, 1]> layers_5_mlp_down_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [2048, 6144, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(289560512))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(302143488))))[name = string("layers_5_mlp_down_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1, 1]> hidden_states_59_cast_fp16 = conv(dilations = hidden_states_59_dilations_0, groups = hidden_states_59_groups_0, pad = hidden_states_59_pad_0, pad_type = hidden_states_59_pad_type_0, strides = hidden_states_59_strides_0, weight = layers_5_mlp_down_proj_weight_to_fp16_palettized, x = input_47_cast_fp16)[name = string("hidden_states_59_cast_fp16")];
+            tensor<fp16, [1, 2048, 1, 1]> inputs_47_cast_fp16 = add(x = inputs_45_cast_fp16, y = hidden_states_59_cast_fp16)[name = string("inputs_47_cast_fp16")];
+            int32 var_2484 = const()[name = string("op_2484"), val = int32(3)];
+            int32 var_2494 = const()[name = string("op_2494"), val = int32(-2)];
+            int32 var_2502 = const()[name = string("op_2502"), val = int32(1)];
+            tensor<fp16, [1, 2048, 1, 1]> inputs_sq_49_cast_fp16 = mul(x = inputs_47_cast_fp16, y = inputs_47_cast_fp16)[name = string("inputs_sq_49_cast_fp16")];
+            tensor<int32, [1]> variance_49_axes_0 = const()[name = string("variance_49_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_49_keep_dims_0 = const()[name = string("variance_49_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 1, 1, 1]> variance_49_cast_fp16 = reduce_mean(axes = variance_49_axes_0, keep_dims = variance_49_keep_dims_0, x = inputs_sq_49_cast_fp16)[name = string("variance_49_cast_fp16")];
+            fp16 var_2514_to_fp16 = const()[name = string("op_2514_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 1, 1]> var_2515_cast_fp16 = add(x = variance_49_cast_fp16, y = var_2514_to_fp16)[name = string("op_2515_cast_fp16")];
+            fp32 var_2516_epsilon_0 = const()[name = string("op_2516_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 1, 1, 1]> var_2516_cast_fp16 = rsqrt(epsilon = var_2516_epsilon_0, x = var_2515_cast_fp16)[name = string("op_2516_cast_fp16")];
+            tensor<fp16, [1, 2048, 1, 1]> hidden_states_61_cast_fp16 = mul(x = inputs_47_cast_fp16, y = var_2516_cast_fp16)[name = string("hidden_states_61_cast_fp16")];
+            tensor<fp16, [1, 2048, 1, 1]> w_49_to_fp16 = const()[name = string("w_49_to_fp16"), val = tensor<fp16, [1, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(302144064)))];
+            tensor<fp16, [1, 2048, 1, 1]> obj_53_cast_fp16 = mul(x = w_49_to_fp16, y = hidden_states_61_cast_fp16)[name = string("obj_53_cast_fp16")];
+            string query_37_pad_type_0 = const()[name = string("query_37_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> query_37_strides_0 = const()[name = string("query_37_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> query_37_pad_0 = const()[name = string("query_37_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> query_37_dilations_0 = const()[name = string("query_37_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 query_37_groups_0 = const()[name = string("query_37_groups_0"), val = int32(1)];
+            tensor<fp16, [2048, 2048, 1, 1]> layers_6_self_attn_q_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [2048, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(302148224))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(306342592))))[name = string("layers_6_self_attn_q_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1, 1]> query_37_cast_fp16 = conv(bias = layers_0_self_attn_q_proj_bias_to_fp16, dilations = query_37_dilations_0, groups = query_37_groups_0, pad = query_37_pad_0, pad_type = query_37_pad_type_0, strides = query_37_strides_0, weight = layers_6_self_attn_q_proj_weight_to_fp16_palettized, x = obj_53_cast_fp16)[name = string("query_37_cast_fp16")];
+            string current_key_25_pad_type_0 = const()[name = string("current_key_25_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> current_key_25_strides_0 = const()[name = string("current_key_25_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> current_key_25_pad_0 = const()[name = string("current_key_25_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> current_key_25_dilations_0 = const()[name = string("current_key_25_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 current_key_25_groups_0 = const()[name = string("current_key_25_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 2048, 1, 1]> layers_6_self_attn_k_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(306343168))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(308440384))))[name = string("layers_6_self_attn_k_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> current_key_25_cast_fp16 = conv(dilations = current_key_25_dilations_0, groups = current_key_25_groups_0, pad = current_key_25_pad_0, pad_type = current_key_25_pad_type_0, strides = current_key_25_strides_0, weight = layers_6_self_attn_k_proj_weight_to_fp16_palettized, x = obj_53_cast_fp16)[name = string("current_key_25_cast_fp16")];
+            string current_value_13_pad_type_0 = const()[name = string("current_value_13_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> current_value_13_strides_0 = const()[name = string("current_value_13_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> current_value_13_pad_0 = const()[name = string("current_value_13_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> current_value_13_dilations_0 = const()[name = string("current_value_13_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 current_value_13_groups_0 = const()[name = string("current_value_13_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 2048, 1, 1]> layers_6_self_attn_v_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(308440960))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(310538176))))[name = string("layers_6_self_attn_v_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> current_value_13_cast_fp16 = conv(bias = layers_0_self_attn_v_proj_bias_to_fp16, dilations = current_value_13_dilations_0, groups = current_value_13_groups_0, pad = current_value_13_pad_0, pad_type = current_value_13_pad_type_0, strides = current_value_13_strides_0, weight = layers_6_self_attn_v_proj_weight_to_fp16_palettized, x = obj_53_cast_fp16)[name = string("current_value_13_cast_fp16")];
+            tensor<int32, [4]> var_2553 = const()[name = string("op_2553"), val = tensor<int32, [4]>([16, 128, 1, 1])];
+            tensor<fp16, [16, 128, 1, 1]> inputs_49_cast_fp16 = reshape(shape = var_2553, x = query_37_cast_fp16)[name = string("inputs_49_cast_fp16")];
+            tensor<fp16, [16, 128, 1, 1]> inputs_sq_51_cast_fp16 = mul(x = inputs_49_cast_fp16, y = inputs_49_cast_fp16)[name = string("inputs_sq_51_cast_fp16")];
+            tensor<int32, [1]> variance_51_axes_0 = const()[name = string("variance_51_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_51_keep_dims_0 = const()[name = string("variance_51_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [16, 1, 1, 1]> variance_51_cast_fp16 = reduce_mean(axes = variance_51_axes_0, keep_dims = variance_51_keep_dims_0, x = inputs_sq_51_cast_fp16)[name = string("variance_51_cast_fp16")];
+            fp16 var_2559_to_fp16 = const()[name = string("op_2559_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [16, 1, 1, 1]> var_2560_cast_fp16 = add(x = variance_51_cast_fp16, y = var_2559_to_fp16)[name = string("op_2560_cast_fp16")];
+            fp32 var_2561_epsilon_0 = const()[name = string("op_2561_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [16, 1, 1, 1]> var_2561_cast_fp16 = rsqrt(epsilon = var_2561_epsilon_0, x = var_2560_cast_fp16)[name = string("op_2561_cast_fp16")];
+            tensor<fp16, [16, 128, 1, 1]> hidden_states_63_cast_fp16 = mul(x = inputs_49_cast_fp16, y = var_2561_cast_fp16)[name = string("hidden_states_63_cast_fp16")];
+            tensor<fp16, [1, 128, 1, 1]> w_51_to_fp16 = const()[name = string("w_51_to_fp16"), val = tensor<fp16, [1, 128, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(310538752)))];
+            tensor<fp16, [16, 128, 1, 1]> query_normed_13_cast_fp16 = mul(x = w_51_to_fp16, y = hidden_states_63_cast_fp16)[name = string("query_normed_13_cast_fp16")];
+            tensor<int32, [4]> var_2569 = const()[name = string("op_2569"), val = tensor<int32, [4]>([8, 128, 1, 1])];
+            tensor<fp16, [8, 128, 1, 1]> inputs_51_cast_fp16 = reshape(shape = var_2569, x = current_key_25_cast_fp16)[name = string("inputs_51_cast_fp16")];
+            tensor<fp16, [8, 128, 1, 1]> inputs_sq_53_cast_fp16 = mul(x = inputs_51_cast_fp16, y = inputs_51_cast_fp16)[name = string("inputs_sq_53_cast_fp16")];
+            tensor<int32, [1]> variance_53_axes_0 = const()[name = string("variance_53_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_53_keep_dims_0 = const()[name = string("variance_53_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [8, 1, 1, 1]> variance_53_cast_fp16 = reduce_mean(axes = variance_53_axes_0, keep_dims = variance_53_keep_dims_0, x = inputs_sq_53_cast_fp16)[name = string("variance_53_cast_fp16")];
+            fp16 var_2575_to_fp16 = const()[name = string("op_2575_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [8, 1, 1, 1]> var_2576_cast_fp16 = add(x = variance_53_cast_fp16, y = var_2575_to_fp16)[name = string("op_2576_cast_fp16")];
+            fp32 var_2577_epsilon_0 = const()[name = string("op_2577_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [8, 1, 1, 1]> var_2577_cast_fp16 = rsqrt(epsilon = var_2577_epsilon_0, x = var_2576_cast_fp16)[name = string("op_2577_cast_fp16")];
+            tensor<fp16, [8, 128, 1, 1]> hidden_states_65_cast_fp16 = mul(x = inputs_51_cast_fp16, y = var_2577_cast_fp16)[name = string("hidden_states_65_cast_fp16")];
+            tensor<fp16, [1, 128, 1, 1]> w_53_to_fp16 = const()[name = string("w_53_to_fp16"), val = tensor<fp16, [1, 128, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(310539072)))];
+            tensor<fp16, [8, 128, 1, 1]> current_key_normed_13_cast_fp16 = mul(x = w_53_to_fp16, y = hidden_states_65_cast_fp16)[name = string("current_key_normed_13_cast_fp16")];
+            tensor<int32, [4]> var_2595 = const()[name = string("op_2595"), val = tensor<int32, [4]>([1, 16, 128, -1])];
+            tensor<fp16, [1, 16, 128, 1]> mh_q_37_cast_fp16 = reshape(shape = var_2595, x = query_normed_13_cast_fp16)[name = string("mh_q_37_cast_fp16")];
+            tensor<int32, [4]> var_2597 = const()[name = string("op_2597"), val = tensor<int32, [4]>([1, 8, 128, -1])];
+            tensor<fp16, [1, 8, 128, 1]> mh_k_25_cast_fp16 = reshape(shape = var_2597, x = current_key_normed_13_cast_fp16)[name = string("mh_k_25_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> var_2601_cast_fp16 = mul(x = mh_q_37_cast_fp16, y = cos_1_cast_fp16)[name = string("op_2601_cast_fp16")];
+            tensor<int32, [4]> var_2606_begin_0 = const()[name = string("op_2606_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_2606_end_0 = const()[name = string("op_2606_end_0"), val = tensor<int32, [4]>([1, 16, 64, 1])];
+            tensor<bool, [4]> var_2606_end_mask_0 = const()[name = string("op_2606_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 16, 64, 1]> var_2606_cast_fp16 = slice_by_index(begin = var_2606_begin_0, end = var_2606_end_0, end_mask = var_2606_end_mask_0, x = mh_q_37_cast_fp16)[name = string("op_2606_cast_fp16")];
+            tensor<int32, [4]> var_2612_begin_0 = const()[name = string("op_2612_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_2612_end_0 = const()[name = string("op_2612_end_0"), val = tensor<int32, [4]>([1, 16, 128, 1])];
+            tensor<bool, [4]> var_2612_end_mask_0 = const()[name = string("op_2612_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 16, 64, 1]> var_2612_cast_fp16 = slice_by_index(begin = var_2612_begin_0, end = var_2612_end_0, end_mask = var_2612_end_mask_0, x = mh_q_37_cast_fp16)[name = string("op_2612_cast_fp16")];
+            fp16 const_155_promoted_to_fp16 = const()[name = string("const_155_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 16, 64, 1]> var_2614_cast_fp16 = mul(x = var_2612_cast_fp16, y = const_155_promoted_to_fp16)[name = string("op_2614_cast_fp16")];
+            bool var_2616_interleave_0 = const()[name = string("op_2616_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 1]> var_2616_cast_fp16 = concat(axis = var_2494, interleave = var_2616_interleave_0, values = (var_2614_cast_fp16, var_2606_cast_fp16))[name = string("op_2616_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> var_2617_cast_fp16 = mul(x = var_2616_cast_fp16, y = sin_1_cast_fp16)[name = string("op_2617_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> mh_q_39_cast_fp16 = add(x = var_2601_cast_fp16, y = var_2617_cast_fp16)[name = string("mh_q_39_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> var_2619_cast_fp16 = mul(x = mh_k_25_cast_fp16, y = cos_1_cast_fp16)[name = string("op_2619_cast_fp16")];
+            tensor<int32, [4]> var_2624_begin_0 = const()[name = string("op_2624_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_2624_end_0 = const()[name = string("op_2624_end_0"), val = tensor<int32, [4]>([1, 8, 64, 1])];
+            tensor<bool, [4]> var_2624_end_mask_0 = const()[name = string("op_2624_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 8, 64, 1]> var_2624_cast_fp16 = slice_by_index(begin = var_2624_begin_0, end = var_2624_end_0, end_mask = var_2624_end_mask_0, x = mh_k_25_cast_fp16)[name = string("op_2624_cast_fp16")];
+            tensor<int32, [4]> var_2630_begin_0 = const()[name = string("op_2630_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_2630_end_0 = const()[name = string("op_2630_end_0"), val = tensor<int32, [4]>([1, 8, 128, 1])];
+            tensor<bool, [4]> var_2630_end_mask_0 = const()[name = string("op_2630_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 8, 64, 1]> var_2630_cast_fp16 = slice_by_index(begin = var_2630_begin_0, end = var_2630_end_0, end_mask = var_2630_end_mask_0, x = mh_k_25_cast_fp16)[name = string("op_2630_cast_fp16")];
+            fp16 const_158_promoted_to_fp16 = const()[name = string("const_158_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 64, 1]> var_2632_cast_fp16 = mul(x = var_2630_cast_fp16, y = const_158_promoted_to_fp16)[name = string("op_2632_cast_fp16")];
+            bool var_2634_interleave_0 = const()[name = string("op_2634_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 128, 1]> var_2634_cast_fp16 = concat(axis = var_2494, interleave = var_2634_interleave_0, values = (var_2632_cast_fp16, var_2624_cast_fp16))[name = string("op_2634_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> var_2635_cast_fp16 = mul(x = var_2634_cast_fp16, y = sin_1_cast_fp16)[name = string("op_2635_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> mh_k_27_cast_fp16 = add(x = var_2619_cast_fp16, y = var_2635_cast_fp16)[name = string("mh_k_27_cast_fp16")];
+            tensor<int32, [4]> var_2639 = const()[name = string("op_2639"), val = tensor<int32, [4]>([1, 1024, 1, 1])];
+            tensor<fp16, [1, 1024, 1, 1]> current_key_27_cast_fp16 = reshape(shape = var_2639, x = mh_k_27_cast_fp16)[name = string("current_key_27_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_2646_cast_fp16 = mul(x = var_101_cast_fp16_6, y = var_323_cast_fp16)[name = string("op_2646_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_2647_cast_fp16 = mul(x = current_key_27_cast_fp16, y = var_321_cast_fp16)[name = string("op_2647_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> key_39_cast_fp16 = add(x = var_2646_cast_fp16, y = var_2647_cast_fp16)[name = string("key_39_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_2650_cast_fp16 = mul(x = var_132_cast_fp16_6, y = var_323_cast_fp16)[name = string("op_2650_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_2651_cast_fp16 = mul(x = current_value_13_cast_fp16, y = var_321_cast_fp16)[name = string("op_2651_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> value_25_cast_fp16 = add(x = var_2650_cast_fp16, y = var_2651_cast_fp16)[name = string("value_25_cast_fp16")];
+            tensor<int32, [4]> var_2655 = const()[name = string("op_2655"), val = tensor<int32, [4]>([1, 8, 128, 256])];
+            tensor<fp16, [1, 8, 128, 256]> key_heads_25_cast_fp16 = reshape(shape = var_2655, x = key_39_cast_fp16)[name = string("key_heads_25_cast_fp16")];
+            tensor<int32, [4]> var_2657 = const()[name = string("op_2657"), val = tensor<int32, [4]>([1, 8, 128, 256])];
+            tensor<fp16, [1, 8, 128, 256]> value_heads_25_cast_fp16 = reshape(shape = var_2657, x = value_25_cast_fp16)[name = string("value_heads_25_cast_fp16")];
+            tensor<int32, [4]> var_2660_begin_0 = const()[name = string("op_2660_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_2660_end_0 = const()[name = string("op_2660_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_2660_end_mask_0 = const()[name = string("op_2660_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_2660_cast_fp16 = slice_by_index(begin = var_2660_begin_0, end = var_2660_end_0, end_mask = var_2660_end_mask_0, x = key_heads_25_cast_fp16)[name = string("op_2660_cast_fp16")];
+            tensor<int32, [4]> var_2664_begin_0 = const()[name = string("op_2664_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_2664_end_0 = const()[name = string("op_2664_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_2664_end_mask_0 = const()[name = string("op_2664_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_2664_cast_fp16 = slice_by_index(begin = var_2664_begin_0, end = var_2664_end_0, end_mask = var_2664_end_mask_0, x = value_heads_25_cast_fp16)[name = string("op_2664_cast_fp16")];
+            tensor<int32, [4]> var_2676_begin_0 = const()[name = string("op_2676_begin_0"), val = tensor<int32, [4]>([0, 1, 0, 0])];
+            tensor<int32, [4]> var_2676_end_0 = const()[name = string("op_2676_end_0"), val = tensor<int32, [4]>([1, 2, 128, 256])];
+            tensor<bool, [4]> var_2676_end_mask_0 = const()[name = string("op_2676_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_2676_cast_fp16 = slice_by_index(begin = var_2676_begin_0, end = var_2676_end_0, end_mask = var_2676_end_mask_0, x = key_heads_25_cast_fp16)[name = string("op_2676_cast_fp16")];
+            tensor<int32, [4]> var_2680_begin_0 = const()[name = string("op_2680_begin_0"), val = tensor<int32, [4]>([0, 1, 0, 0])];
+            tensor<int32, [4]> var_2680_end_0 = const()[name = string("op_2680_end_0"), val = tensor<int32, [4]>([1, 2, 128, 256])];
+            tensor<bool, [4]> var_2680_end_mask_0 = const()[name = string("op_2680_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_2680_cast_fp16 = slice_by_index(begin = var_2680_begin_0, end = var_2680_end_0, end_mask = var_2680_end_mask_0, x = value_heads_25_cast_fp16)[name = string("op_2680_cast_fp16")];
+            tensor<int32, [4]> var_2692_begin_0 = const()[name = string("op_2692_begin_0"), val = tensor<int32, [4]>([0, 2, 0, 0])];
+            tensor<int32, [4]> var_2692_end_0 = const()[name = string("op_2692_end_0"), val = tensor<int32, [4]>([1, 3, 128, 256])];
+            tensor<bool, [4]> var_2692_end_mask_0 = const()[name = string("op_2692_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_2692_cast_fp16 = slice_by_index(begin = var_2692_begin_0, end = var_2692_end_0, end_mask = var_2692_end_mask_0, x = key_heads_25_cast_fp16)[name = string("op_2692_cast_fp16")];
+            tensor<int32, [4]> var_2696_begin_0 = const()[name = string("op_2696_begin_0"), val = tensor<int32, [4]>([0, 2, 0, 0])];
+            tensor<int32, [4]> var_2696_end_0 = const()[name = string("op_2696_end_0"), val = tensor<int32, [4]>([1, 3, 128, 256])];
+            tensor<bool, [4]> var_2696_end_mask_0 = const()[name = string("op_2696_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_2696_cast_fp16 = slice_by_index(begin = var_2696_begin_0, end = var_2696_end_0, end_mask = var_2696_end_mask_0, x = value_heads_25_cast_fp16)[name = string("op_2696_cast_fp16")];
+            tensor<int32, [4]> var_2708_begin_0 = const()[name = string("op_2708_begin_0"), val = tensor<int32, [4]>([0, 3, 0, 0])];
+            tensor<int32, [4]> var_2708_end_0 = const()[name = string("op_2708_end_0"), val = tensor<int32, [4]>([1, 4, 128, 256])];
+            tensor<bool, [4]> var_2708_end_mask_0 = const()[name = string("op_2708_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_2708_cast_fp16 = slice_by_index(begin = var_2708_begin_0, end = var_2708_end_0, end_mask = var_2708_end_mask_0, x = key_heads_25_cast_fp16)[name = string("op_2708_cast_fp16")];
+            tensor<int32, [4]> var_2712_begin_0 = const()[name = string("op_2712_begin_0"), val = tensor<int32, [4]>([0, 3, 0, 0])];
+            tensor<int32, [4]> var_2712_end_0 = const()[name = string("op_2712_end_0"), val = tensor<int32, [4]>([1, 4, 128, 256])];
+            tensor<bool, [4]> var_2712_end_mask_0 = const()[name = string("op_2712_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_2712_cast_fp16 = slice_by_index(begin = var_2712_begin_0, end = var_2712_end_0, end_mask = var_2712_end_mask_0, x = value_heads_25_cast_fp16)[name = string("op_2712_cast_fp16")];
+            tensor<int32, [4]> var_2724_begin_0 = const()[name = string("op_2724_begin_0"), val = tensor<int32, [4]>([0, 4, 0, 0])];
+            tensor<int32, [4]> var_2724_end_0 = const()[name = string("op_2724_end_0"), val = tensor<int32, [4]>([1, 5, 128, 256])];
+            tensor<bool, [4]> var_2724_end_mask_0 = const()[name = string("op_2724_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_2724_cast_fp16 = slice_by_index(begin = var_2724_begin_0, end = var_2724_end_0, end_mask = var_2724_end_mask_0, x = key_heads_25_cast_fp16)[name = string("op_2724_cast_fp16")];
+            tensor<int32, [4]> var_2728_begin_0 = const()[name = string("op_2728_begin_0"), val = tensor<int32, [4]>([0, 4, 0, 0])];
+            tensor<int32, [4]> var_2728_end_0 = const()[name = string("op_2728_end_0"), val = tensor<int32, [4]>([1, 5, 128, 256])];
+            tensor<bool, [4]> var_2728_end_mask_0 = const()[name = string("op_2728_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_2728_cast_fp16 = slice_by_index(begin = var_2728_begin_0, end = var_2728_end_0, end_mask = var_2728_end_mask_0, x = value_heads_25_cast_fp16)[name = string("op_2728_cast_fp16")];
+            tensor<int32, [4]> var_2740_begin_0 = const()[name = string("op_2740_begin_0"), val = tensor<int32, [4]>([0, 5, 0, 0])];
+            tensor<int32, [4]> var_2740_end_0 = const()[name = string("op_2740_end_0"), val = tensor<int32, [4]>([1, 6, 128, 256])];
+            tensor<bool, [4]> var_2740_end_mask_0 = const()[name = string("op_2740_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_2740_cast_fp16 = slice_by_index(begin = var_2740_begin_0, end = var_2740_end_0, end_mask = var_2740_end_mask_0, x = key_heads_25_cast_fp16)[name = string("op_2740_cast_fp16")];
+            tensor<int32, [4]> var_2744_begin_0 = const()[name = string("op_2744_begin_0"), val = tensor<int32, [4]>([0, 5, 0, 0])];
+            tensor<int32, [4]> var_2744_end_0 = const()[name = string("op_2744_end_0"), val = tensor<int32, [4]>([1, 6, 128, 256])];
+            tensor<bool, [4]> var_2744_end_mask_0 = const()[name = string("op_2744_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_2744_cast_fp16 = slice_by_index(begin = var_2744_begin_0, end = var_2744_end_0, end_mask = var_2744_end_mask_0, x = value_heads_25_cast_fp16)[name = string("op_2744_cast_fp16")];
+            tensor<int32, [4]> var_2756_begin_0 = const()[name = string("op_2756_begin_0"), val = tensor<int32, [4]>([0, 6, 0, 0])];
+            tensor<int32, [4]> var_2756_end_0 = const()[name = string("op_2756_end_0"), val = tensor<int32, [4]>([1, 7, 128, 256])];
+            tensor<bool, [4]> var_2756_end_mask_0 = const()[name = string("op_2756_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_2756_cast_fp16 = slice_by_index(begin = var_2756_begin_0, end = var_2756_end_0, end_mask = var_2756_end_mask_0, x = key_heads_25_cast_fp16)[name = string("op_2756_cast_fp16")];
+            tensor<int32, [4]> var_2760_begin_0 = const()[name = string("op_2760_begin_0"), val = tensor<int32, [4]>([0, 6, 0, 0])];
+            tensor<int32, [4]> var_2760_end_0 = const()[name = string("op_2760_end_0"), val = tensor<int32, [4]>([1, 7, 128, 256])];
+            tensor<bool, [4]> var_2760_end_mask_0 = const()[name = string("op_2760_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_2760_cast_fp16 = slice_by_index(begin = var_2760_begin_0, end = var_2760_end_0, end_mask = var_2760_end_mask_0, x = value_heads_25_cast_fp16)[name = string("op_2760_cast_fp16")];
+            tensor<int32, [4]> var_2772_begin_0 = const()[name = string("op_2772_begin_0"), val = tensor<int32, [4]>([0, 7, 0, 0])];
+            tensor<int32, [4]> var_2772_end_0 = const()[name = string("op_2772_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_2772_end_mask_0 = const()[name = string("op_2772_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_2772_cast_fp16 = slice_by_index(begin = var_2772_begin_0, end = var_2772_end_0, end_mask = var_2772_end_mask_0, x = key_heads_25_cast_fp16)[name = string("op_2772_cast_fp16")];
+            tensor<int32, [4]> var_2776_begin_0 = const()[name = string("op_2776_begin_0"), val = tensor<int32, [4]>([0, 7, 0, 0])];
+            tensor<int32, [4]> var_2776_end_0 = const()[name = string("op_2776_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_2776_end_mask_0 = const()[name = string("op_2776_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_2776_cast_fp16 = slice_by_index(begin = var_2776_begin_0, end = var_2776_end_0, end_mask = var_2776_end_mask_0, x = value_heads_25_cast_fp16)[name = string("op_2776_cast_fp16")];
+            bool key_heads_27_interleave_0 = const()[name = string("key_heads_27_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 256]> key_heads_27_cast_fp16 = concat(axis = var_2502, interleave = key_heads_27_interleave_0, values = (var_2660_cast_fp16, var_2660_cast_fp16, var_2676_cast_fp16, var_2676_cast_fp16, var_2692_cast_fp16, var_2692_cast_fp16, var_2708_cast_fp16, var_2708_cast_fp16, var_2724_cast_fp16, var_2724_cast_fp16, var_2740_cast_fp16, var_2740_cast_fp16, var_2756_cast_fp16, var_2756_cast_fp16, var_2772_cast_fp16, var_2772_cast_fp16))[name = string("key_heads_27_cast_fp16")];
+            bool value_heads_27_interleave_0 = const()[name = string("value_heads_27_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 256]> value_heads_27_cast_fp16 = concat(axis = var_2502, interleave = value_heads_27_interleave_0, values = (var_2664_cast_fp16, var_2664_cast_fp16, var_2680_cast_fp16, var_2680_cast_fp16, var_2696_cast_fp16, var_2696_cast_fp16, var_2712_cast_fp16, var_2712_cast_fp16, var_2728_cast_fp16, var_2728_cast_fp16, var_2744_cast_fp16, var_2744_cast_fp16, var_2760_cast_fp16, var_2760_cast_fp16, var_2776_cast_fp16, var_2776_cast_fp16))[name = string("value_heads_27_cast_fp16")];
+            fp16 var_2799_to_fp16 = const()[name = string("op_2799_to_fp16"), val = fp16(0x1.6ap-4)];
+            tensor<fp16, [1, 16, 128, 1]> var_2800_cast_fp16 = mul(x = mh_q_39_cast_fp16, y = var_2799_to_fp16)[name = string("op_2800_cast_fp16")];
+            bool mh_w_25_transpose_x_0 = const()[name = string("mh_w_25_transpose_x_0"), val = bool(true)];
+            bool mh_w_25_transpose_y_0 = const()[name = string("mh_w_25_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 1, 256]> mh_w_25_cast_fp16 = matmul(transpose_x = mh_w_25_transpose_x_0, transpose_y = mh_w_25_transpose_y_0, x = var_2800_cast_fp16, y = key_heads_27_cast_fp16)[name = string("mh_w_25_cast_fp16")];
+            tensor<fp16, [1, 16, 1, 256]> mh_w_27_cast_fp16 = add(x = mh_w_25_cast_fp16, y = var_487_cast_fp16)[name = string("mh_w_27_cast_fp16")];
+            tensor<fp16, [1, 16, 1, 256]> var_2812_cast_fp16 = softmax(axis = var_2484, x = mh_w_27_cast_fp16)[name = string("op_2812_cast_fp16")];
+            bool attn_13_transpose_x_0 = const()[name = string("attn_13_transpose_x_0"), val = bool(false)];
+            bool attn_13_transpose_y_0 = const()[name = string("attn_13_transpose_y_0"), val = bool(true)];
+            tensor<fp16, [1, 16, 128, 1]> attn_13_cast_fp16 = matmul(transpose_x = attn_13_transpose_x_0, transpose_y = attn_13_transpose_y_0, x = value_heads_27_cast_fp16, y = var_2812_cast_fp16)[name = string("attn_13_cast_fp16")];
+            tensor<int32, [4]> var_2817 = const()[name = string("op_2817"), val = tensor<int32, [4]>([1, -1, 1, 1])];
+            tensor<fp16, [1, 2048, 1, 1]> input_49_cast_fp16 = reshape(shape = var_2817, x = attn_13_cast_fp16)[name = string("input_49_cast_fp16")];
+            string obj_59_pad_type_0 = const()[name = string("obj_59_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> obj_59_strides_0 = const()[name = string("obj_59_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> obj_59_pad_0 = const()[name = string("obj_59_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> obj_59_dilations_0 = const()[name = string("obj_59_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 obj_59_groups_0 = const()[name = string("obj_59_groups_0"), val = int32(1)];
+            tensor<fp16, [2048, 2048, 1, 1]> layers_6_self_attn_o_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [2048, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(310539392))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(314733760))))[name = string("layers_6_self_attn_o_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1, 1]> obj_59_cast_fp16 = conv(bias = layers_0_self_attn_q_proj_bias_to_fp16, dilations = obj_59_dilations_0, groups = obj_59_groups_0, pad = obj_59_pad_0, pad_type = obj_59_pad_type_0, strides = obj_59_strides_0, weight = layers_6_self_attn_o_proj_weight_to_fp16_palettized, x = input_49_cast_fp16)[name = string("obj_59_cast_fp16")];
+            tensor<fp16, [1, 2048, 1, 1]> inputs_53_cast_fp16 = add(x = inputs_47_cast_fp16, y = obj_59_cast_fp16)[name = string("inputs_53_cast_fp16")];
+            tensor<fp16, [1, 2048, 1, 1]> inputs_sq_55_cast_fp16 = mul(x = inputs_53_cast_fp16, y = inputs_53_cast_fp16)[name = string("inputs_sq_55_cast_fp16")];
+            tensor<int32, [1]> variance_55_axes_0 = const()[name = string("variance_55_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_55_keep_dims_0 = const()[name = string("variance_55_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 1, 1, 1]> variance_55_cast_fp16 = reduce_mean(axes = variance_55_axes_0, keep_dims = variance_55_keep_dims_0, x = inputs_sq_55_cast_fp16)[name = string("variance_55_cast_fp16")];
+            fp16 var_2835_to_fp16 = const()[name = string("op_2835_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 1, 1]> var_2836_cast_fp16 = add(x = variance_55_cast_fp16, y = var_2835_to_fp16)[name = string("op_2836_cast_fp16")];
+            fp32 var_2837_epsilon_0 = const()[name = string("op_2837_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 1, 1, 1]> var_2837_cast_fp16 = rsqrt(epsilon = var_2837_epsilon_0, x = var_2836_cast_fp16)[name = string("op_2837_cast_fp16")];
+            tensor<fp16, [1, 2048, 1, 1]> hidden_states_67_cast_fp16 = mul(x = inputs_53_cast_fp16, y = var_2837_cast_fp16)[name = string("hidden_states_67_cast_fp16")];
+            tensor<fp16, [1, 2048, 1, 1]> w_55_to_fp16 = const()[name = string("w_55_to_fp16"), val = tensor<fp16, [1, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(314734336)))];
+            tensor<fp16, [1, 2048, 1, 1]> input_51_cast_fp16 = mul(x = w_55_to_fp16, y = hidden_states_67_cast_fp16)[name = string("input_51_cast_fp16")];
+            string input_53_pad_type_0 = const()[name = string("input_53_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> input_53_strides_0 = const()[name = string("input_53_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> input_53_pad_0 = const()[name = string("input_53_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> input_53_dilations_0 = const()[name = string("input_53_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 input_53_groups_0 = const()[name = string("input_53_groups_0"), val = int32(1)];
+            tensor<fp16, [6144, 2048, 1, 1]> layers_6_mlp_gate_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [6144, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(314738496))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(327321472))))[name = string("layers_6_mlp_gate_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 6144, 1, 1]> input_53_cast_fp16 = conv(dilations = input_53_dilations_0, groups = input_53_groups_0, pad = input_53_pad_0, pad_type = input_53_pad_type_0, strides = input_53_strides_0, weight = layers_6_mlp_gate_proj_weight_to_fp16_palettized, x = input_51_cast_fp16)[name = string("input_53_cast_fp16")];
+            tensor<fp16, [1, 6144, 1, 1]> var_2851_cast_fp16 = silu(x = input_53_cast_fp16)[name = string("op_2851_cast_fp16")];
+            string var_2857_pad_type_0 = const()[name = string("op_2857_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_2857_strides_0 = const()[name = string("op_2857_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_2857_pad_0 = const()[name = string("op_2857_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_2857_dilations_0 = const()[name = string("op_2857_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_2857_groups_0 = const()[name = string("op_2857_groups_0"), val = int32(1)];
+            tensor<fp16, [6144, 2048, 1, 1]> layers_6_mlp_up_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [6144, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(327322048))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(339905024))))[name = string("layers_6_mlp_up_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 6144, 1, 1]> var_2857_cast_fp16 = conv(dilations = var_2857_dilations_0, groups = var_2857_groups_0, pad = var_2857_pad_0, pad_type = var_2857_pad_type_0, strides = var_2857_strides_0, weight = layers_6_mlp_up_proj_weight_to_fp16_palettized, x = input_51_cast_fp16)[name = string("op_2857_cast_fp16")];
+            tensor<fp16, [1, 6144, 1, 1]> input_55_cast_fp16 = mul(x = var_2851_cast_fp16, y = var_2857_cast_fp16)[name = string("input_55_cast_fp16")];
+            string hidden_states_69_pad_type_0 = const()[name = string("hidden_states_69_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> hidden_states_69_strides_0 = const()[name = string("hidden_states_69_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> hidden_states_69_pad_0 = const()[name = string("hidden_states_69_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> hidden_states_69_dilations_0 = const()[name = string("hidden_states_69_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 hidden_states_69_groups_0 = const()[name = string("hidden_states_69_groups_0"), val = int32(1)];
+            tensor<fp16, [2048, 6144, 1, 1]> layers_6_mlp_down_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [2048, 6144, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(339905600))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(352488576))))[name = string("layers_6_mlp_down_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1, 1]> hidden_states_69_cast_fp16 = conv(dilations = hidden_states_69_dilations_0, groups = hidden_states_69_groups_0, pad = hidden_states_69_pad_0, pad_type = hidden_states_69_pad_type_0, strides = hidden_states_69_strides_0, weight = layers_6_mlp_down_proj_weight_to_fp16_palettized, x = input_55_cast_fp16)[name = string("hidden_states_69_cast_fp16")];
+            tensor<fp16, [1, 2048, 1, 1]> inputs_55_cast_fp16 = add(x = inputs_53_cast_fp16, y = hidden_states_69_cast_fp16)[name = string("inputs_55_cast_fp16")];
+            int32 var_2871 = const()[name = string("op_2871"), val = int32(3)];
+            int32 var_2881 = const()[name = string("op_2881"), val = int32(-2)];
+            int32 var_2889 = const()[name = string("op_2889"), val = int32(1)];
+            tensor<fp16, [1, 2048, 1, 1]> inputs_sq_57_cast_fp16 = mul(x = inputs_55_cast_fp16, y = inputs_55_cast_fp16)[name = string("inputs_sq_57_cast_fp16")];
+            tensor<int32, [1]> variance_57_axes_0 = const()[name = string("variance_57_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_57_keep_dims_0 = const()[name = string("variance_57_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 1, 1, 1]> variance_57_cast_fp16 = reduce_mean(axes = variance_57_axes_0, keep_dims = variance_57_keep_dims_0, x = inputs_sq_57_cast_fp16)[name = string("variance_57_cast_fp16")];
+            fp16 var_2901_to_fp16 = const()[name = string("op_2901_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 1, 1]> var_2902_cast_fp16 = add(x = variance_57_cast_fp16, y = var_2901_to_fp16)[name = string("op_2902_cast_fp16")];
+            fp32 var_2903_epsilon_0 = const()[name = string("op_2903_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 1, 1, 1]> var_2903_cast_fp16 = rsqrt(epsilon = var_2903_epsilon_0, x = var_2902_cast_fp16)[name = string("op_2903_cast_fp16")];
+            tensor<fp16, [1, 2048, 1, 1]> hidden_states_71_cast_fp16 = mul(x = inputs_55_cast_fp16, y = var_2903_cast_fp16)[name = string("hidden_states_71_cast_fp16")];
+            tensor<fp16, [1, 2048, 1, 1]> w_57_to_fp16 = const()[name = string("w_57_to_fp16"), val = tensor<fp16, [1, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(352489152)))];
+            tensor<fp16, [1, 2048, 1, 1]> obj_61_cast_fp16 = mul(x = w_57_to_fp16, y = hidden_states_71_cast_fp16)[name = string("obj_61_cast_fp16")];
+            string query_43_pad_type_0 = const()[name = string("query_43_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> query_43_strides_0 = const()[name = string("query_43_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> query_43_pad_0 = const()[name = string("query_43_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> query_43_dilations_0 = const()[name = string("query_43_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 query_43_groups_0 = const()[name = string("query_43_groups_0"), val = int32(1)];
+            tensor<fp16, [2048, 2048, 1, 1]> layers_7_self_attn_q_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [2048, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(352493312))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(356687680))))[name = string("layers_7_self_attn_q_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1, 1]> query_43_cast_fp16 = conv(bias = layers_0_self_attn_q_proj_bias_to_fp16, dilations = query_43_dilations_0, groups = query_43_groups_0, pad = query_43_pad_0, pad_type = query_43_pad_type_0, strides = query_43_strides_0, weight = layers_7_self_attn_q_proj_weight_to_fp16_palettized, x = obj_61_cast_fp16)[name = string("query_43_cast_fp16")];
+            string current_key_29_pad_type_0 = const()[name = string("current_key_29_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> current_key_29_strides_0 = const()[name = string("current_key_29_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> current_key_29_pad_0 = const()[name = string("current_key_29_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> current_key_29_dilations_0 = const()[name = string("current_key_29_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 current_key_29_groups_0 = const()[name = string("current_key_29_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 2048, 1, 1]> layers_7_self_attn_k_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(356688256))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(358785472))))[name = string("layers_7_self_attn_k_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> current_key_29_cast_fp16 = conv(dilations = current_key_29_dilations_0, groups = current_key_29_groups_0, pad = current_key_29_pad_0, pad_type = current_key_29_pad_type_0, strides = current_key_29_strides_0, weight = layers_7_self_attn_k_proj_weight_to_fp16_palettized, x = obj_61_cast_fp16)[name = string("current_key_29_cast_fp16")];
+            string current_value_15_pad_type_0 = const()[name = string("current_value_15_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> current_value_15_strides_0 = const()[name = string("current_value_15_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> current_value_15_pad_0 = const()[name = string("current_value_15_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> current_value_15_dilations_0 = const()[name = string("current_value_15_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 current_value_15_groups_0 = const()[name = string("current_value_15_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 2048, 1, 1]> layers_7_self_attn_v_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(358786048))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(360883264))))[name = string("layers_7_self_attn_v_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> current_value_15_cast_fp16 = conv(bias = layers_0_self_attn_v_proj_bias_to_fp16, dilations = current_value_15_dilations_0, groups = current_value_15_groups_0, pad = current_value_15_pad_0, pad_type = current_value_15_pad_type_0, strides = current_value_15_strides_0, weight = layers_7_self_attn_v_proj_weight_to_fp16_palettized, x = obj_61_cast_fp16)[name = string("current_value_15_cast_fp16")];
+            tensor<int32, [4]> var_2940 = const()[name = string("op_2940"), val = tensor<int32, [4]>([16, 128, 1, 1])];
+            tensor<fp16, [16, 128, 1, 1]> inputs_57_cast_fp16 = reshape(shape = var_2940, x = query_43_cast_fp16)[name = string("inputs_57_cast_fp16")];
+            tensor<fp16, [16, 128, 1, 1]> inputs_sq_59_cast_fp16 = mul(x = inputs_57_cast_fp16, y = inputs_57_cast_fp16)[name = string("inputs_sq_59_cast_fp16")];
+            tensor<int32, [1]> variance_59_axes_0 = const()[name = string("variance_59_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_59_keep_dims_0 = const()[name = string("variance_59_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [16, 1, 1, 1]> variance_59_cast_fp16 = reduce_mean(axes = variance_59_axes_0, keep_dims = variance_59_keep_dims_0, x = inputs_sq_59_cast_fp16)[name = string("variance_59_cast_fp16")];
+            fp16 var_2946_to_fp16 = const()[name = string("op_2946_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [16, 1, 1, 1]> var_2947_cast_fp16 = add(x = variance_59_cast_fp16, y = var_2946_to_fp16)[name = string("op_2947_cast_fp16")];
+            fp32 var_2948_epsilon_0 = const()[name = string("op_2948_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [16, 1, 1, 1]> var_2948_cast_fp16 = rsqrt(epsilon = var_2948_epsilon_0, x = var_2947_cast_fp16)[name = string("op_2948_cast_fp16")];
+            tensor<fp16, [16, 128, 1, 1]> hidden_states_73_cast_fp16 = mul(x = inputs_57_cast_fp16, y = var_2948_cast_fp16)[name = string("hidden_states_73_cast_fp16")];
+            tensor<fp16, [1, 128, 1, 1]> w_59_to_fp16 = const()[name = string("w_59_to_fp16"), val = tensor<fp16, [1, 128, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(360883840)))];
+            tensor<fp16, [16, 128, 1, 1]> query_normed_15_cast_fp16 = mul(x = w_59_to_fp16, y = hidden_states_73_cast_fp16)[name = string("query_normed_15_cast_fp16")];
+            tensor<int32, [4]> var_2956 = const()[name = string("op_2956"), val = tensor<int32, [4]>([8, 128, 1, 1])];
+            tensor<fp16, [8, 128, 1, 1]> inputs_59_cast_fp16 = reshape(shape = var_2956, x = current_key_29_cast_fp16)[name = string("inputs_59_cast_fp16")];
+            tensor<fp16, [8, 128, 1, 1]> inputs_sq_61_cast_fp16 = mul(x = inputs_59_cast_fp16, y = inputs_59_cast_fp16)[name = string("inputs_sq_61_cast_fp16")];
+            tensor<int32, [1]> variance_61_axes_0 = const()[name = string("variance_61_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_61_keep_dims_0 = const()[name = string("variance_61_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [8, 1, 1, 1]> variance_61_cast_fp16 = reduce_mean(axes = variance_61_axes_0, keep_dims = variance_61_keep_dims_0, x = inputs_sq_61_cast_fp16)[name = string("variance_61_cast_fp16")];
+            fp16 var_2962_to_fp16 = const()[name = string("op_2962_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [8, 1, 1, 1]> var_2963_cast_fp16 = add(x = variance_61_cast_fp16, y = var_2962_to_fp16)[name = string("op_2963_cast_fp16")];
+            fp32 var_2964_epsilon_0 = const()[name = string("op_2964_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [8, 1, 1, 1]> var_2964_cast_fp16 = rsqrt(epsilon = var_2964_epsilon_0, x = var_2963_cast_fp16)[name = string("op_2964_cast_fp16")];
+            tensor<fp16, [8, 128, 1, 1]> hidden_states_75_cast_fp16 = mul(x = inputs_59_cast_fp16, y = var_2964_cast_fp16)[name = string("hidden_states_75_cast_fp16")];
+            tensor<fp16, [1, 128, 1, 1]> w_61_to_fp16 = const()[name = string("w_61_to_fp16"), val = tensor<fp16, [1, 128, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(360884160)))];
+            tensor<fp16, [8, 128, 1, 1]> current_key_normed_15_cast_fp16 = mul(x = w_61_to_fp16, y = hidden_states_75_cast_fp16)[name = string("current_key_normed_15_cast_fp16")];
+            tensor<int32, [4]> var_2982 = const()[name = string("op_2982"), val = tensor<int32, [4]>([1, 16, 128, -1])];
+            tensor<fp16, [1, 16, 128, 1]> mh_q_43_cast_fp16 = reshape(shape = var_2982, x = query_normed_15_cast_fp16)[name = string("mh_q_43_cast_fp16")];
+            tensor<int32, [4]> var_2984 = const()[name = string("op_2984"), val = tensor<int32, [4]>([1, 8, 128, -1])];
+            tensor<fp16, [1, 8, 128, 1]> mh_k_29_cast_fp16 = reshape(shape = var_2984, x = current_key_normed_15_cast_fp16)[name = string("mh_k_29_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> var_2988_cast_fp16 = mul(x = mh_q_43_cast_fp16, y = cos_1_cast_fp16)[name = string("op_2988_cast_fp16")];
+            tensor<int32, [4]> var_2993_begin_0 = const()[name = string("op_2993_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_2993_end_0 = const()[name = string("op_2993_end_0"), val = tensor<int32, [4]>([1, 16, 64, 1])];
+            tensor<bool, [4]> var_2993_end_mask_0 = const()[name = string("op_2993_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 16, 64, 1]> var_2993_cast_fp16 = slice_by_index(begin = var_2993_begin_0, end = var_2993_end_0, end_mask = var_2993_end_mask_0, x = mh_q_43_cast_fp16)[name = string("op_2993_cast_fp16")];
+            tensor<int32, [4]> var_2999_begin_0 = const()[name = string("op_2999_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_2999_end_0 = const()[name = string("op_2999_end_0"), val = tensor<int32, [4]>([1, 16, 128, 1])];
+            tensor<bool, [4]> var_2999_end_mask_0 = const()[name = string("op_2999_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 16, 64, 1]> var_2999_cast_fp16 = slice_by_index(begin = var_2999_begin_0, end = var_2999_end_0, end_mask = var_2999_end_mask_0, x = mh_q_43_cast_fp16)[name = string("op_2999_cast_fp16")];
+            fp16 const_178_promoted_to_fp16 = const()[name = string("const_178_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 16, 64, 1]> var_3001_cast_fp16 = mul(x = var_2999_cast_fp16, y = const_178_promoted_to_fp16)[name = string("op_3001_cast_fp16")];
+            bool var_3003_interleave_0 = const()[name = string("op_3003_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 1]> var_3003_cast_fp16 = concat(axis = var_2881, interleave = var_3003_interleave_0, values = (var_3001_cast_fp16, var_2993_cast_fp16))[name = string("op_3003_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> var_3004_cast_fp16 = mul(x = var_3003_cast_fp16, y = sin_1_cast_fp16)[name = string("op_3004_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> mh_q_45_cast_fp16 = add(x = var_2988_cast_fp16, y = var_3004_cast_fp16)[name = string("mh_q_45_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> var_3006_cast_fp16 = mul(x = mh_k_29_cast_fp16, y = cos_1_cast_fp16)[name = string("op_3006_cast_fp16")];
+            tensor<int32, [4]> var_3011_begin_0 = const()[name = string("op_3011_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_3011_end_0 = const()[name = string("op_3011_end_0"), val = tensor<int32, [4]>([1, 8, 64, 1])];
+            tensor<bool, [4]> var_3011_end_mask_0 = const()[name = string("op_3011_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 8, 64, 1]> var_3011_cast_fp16 = slice_by_index(begin = var_3011_begin_0, end = var_3011_end_0, end_mask = var_3011_end_mask_0, x = mh_k_29_cast_fp16)[name = string("op_3011_cast_fp16")];
+            tensor<int32, [4]> var_3017_begin_0 = const()[name = string("op_3017_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_3017_end_0 = const()[name = string("op_3017_end_0"), val = tensor<int32, [4]>([1, 8, 128, 1])];
+            tensor<bool, [4]> var_3017_end_mask_0 = const()[name = string("op_3017_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 8, 64, 1]> var_3017_cast_fp16 = slice_by_index(begin = var_3017_begin_0, end = var_3017_end_0, end_mask = var_3017_end_mask_0, x = mh_k_29_cast_fp16)[name = string("op_3017_cast_fp16")];
+            fp16 const_181_promoted_to_fp16 = const()[name = string("const_181_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 64, 1]> var_3019_cast_fp16 = mul(x = var_3017_cast_fp16, y = const_181_promoted_to_fp16)[name = string("op_3019_cast_fp16")];
+            bool var_3021_interleave_0 = const()[name = string("op_3021_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 128, 1]> var_3021_cast_fp16 = concat(axis = var_2881, interleave = var_3021_interleave_0, values = (var_3019_cast_fp16, var_3011_cast_fp16))[name = string("op_3021_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> var_3022_cast_fp16 = mul(x = var_3021_cast_fp16, y = sin_1_cast_fp16)[name = string("op_3022_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> mh_k_31_cast_fp16 = add(x = var_3006_cast_fp16, y = var_3022_cast_fp16)[name = string("mh_k_31_cast_fp16")];
+            tensor<int32, [4]> var_3026 = const()[name = string("op_3026"), val = tensor<int32, [4]>([1, 1024, 1, 1])];
+            tensor<fp16, [1, 1024, 1, 1]> current_key_31_cast_fp16 = reshape(shape = var_3026, x = mh_k_31_cast_fp16)[name = string("current_key_31_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_3033_cast_fp16 = mul(x = var_101_cast_fp16_7, y = var_323_cast_fp16)[name = string("op_3033_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_3034_cast_fp16 = mul(x = current_key_31_cast_fp16, y = var_321_cast_fp16)[name = string("op_3034_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> key_45_cast_fp16 = add(x = var_3033_cast_fp16, y = var_3034_cast_fp16)[name = string("key_45_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_3037_cast_fp16 = mul(x = var_132_cast_fp16_7, y = var_323_cast_fp16)[name = string("op_3037_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_3038_cast_fp16 = mul(x = current_value_15_cast_fp16, y = var_321_cast_fp16)[name = string("op_3038_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> value_29_cast_fp16 = add(x = var_3037_cast_fp16, y = var_3038_cast_fp16)[name = string("value_29_cast_fp16")];
+            tensor<int32, [4]> var_3042 = const()[name = string("op_3042"), val = tensor<int32, [4]>([1, 8, 128, 256])];
+            tensor<fp16, [1, 8, 128, 256]> key_heads_29_cast_fp16 = reshape(shape = var_3042, x = key_45_cast_fp16)[name = string("key_heads_29_cast_fp16")];
+            tensor<int32, [4]> var_3044 = const()[name = string("op_3044"), val = tensor<int32, [4]>([1, 8, 128, 256])];
+            tensor<fp16, [1, 8, 128, 256]> value_heads_29_cast_fp16 = reshape(shape = var_3044, x = value_29_cast_fp16)[name = string("value_heads_29_cast_fp16")];
+            tensor<int32, [4]> var_3047_begin_0 = const()[name = string("op_3047_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_3047_end_0 = const()[name = string("op_3047_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_3047_end_mask_0 = const()[name = string("op_3047_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_3047_cast_fp16 = slice_by_index(begin = var_3047_begin_0, end = var_3047_end_0, end_mask = var_3047_end_mask_0, x = key_heads_29_cast_fp16)[name = string("op_3047_cast_fp16")];
+            tensor<int32, [4]> var_3051_begin_0 = const()[name = string("op_3051_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_3051_end_0 = const()[name = string("op_3051_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_3051_end_mask_0 = const()[name = string("op_3051_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_3051_cast_fp16 = slice_by_index(begin = var_3051_begin_0, end = var_3051_end_0, end_mask = var_3051_end_mask_0, x = value_heads_29_cast_fp16)[name = string("op_3051_cast_fp16")];
+            tensor<int32, [4]> var_3063_begin_0 = const()[name = string("op_3063_begin_0"), val = tensor<int32, [4]>([0, 1, 0, 0])];
+            tensor<int32, [4]> var_3063_end_0 = const()[name = string("op_3063_end_0"), val = tensor<int32, [4]>([1, 2, 128, 256])];
+            tensor<bool, [4]> var_3063_end_mask_0 = const()[name = string("op_3063_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_3063_cast_fp16 = slice_by_index(begin = var_3063_begin_0, end = var_3063_end_0, end_mask = var_3063_end_mask_0, x = key_heads_29_cast_fp16)[name = string("op_3063_cast_fp16")];
+            tensor<int32, [4]> var_3067_begin_0 = const()[name = string("op_3067_begin_0"), val = tensor<int32, [4]>([0, 1, 0, 0])];
+            tensor<int32, [4]> var_3067_end_0 = const()[name = string("op_3067_end_0"), val = tensor<int32, [4]>([1, 2, 128, 256])];
+            tensor<bool, [4]> var_3067_end_mask_0 = const()[name = string("op_3067_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_3067_cast_fp16 = slice_by_index(begin = var_3067_begin_0, end = var_3067_end_0, end_mask = var_3067_end_mask_0, x = value_heads_29_cast_fp16)[name = string("op_3067_cast_fp16")];
+            tensor<int32, [4]> var_3079_begin_0 = const()[name = string("op_3079_begin_0"), val = tensor<int32, [4]>([0, 2, 0, 0])];
+            tensor<int32, [4]> var_3079_end_0 = const()[name = string("op_3079_end_0"), val = tensor<int32, [4]>([1, 3, 128, 256])];
+            tensor<bool, [4]> var_3079_end_mask_0 = const()[name = string("op_3079_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_3079_cast_fp16 = slice_by_index(begin = var_3079_begin_0, end = var_3079_end_0, end_mask = var_3079_end_mask_0, x = key_heads_29_cast_fp16)[name = string("op_3079_cast_fp16")];
+            tensor<int32, [4]> var_3083_begin_0 = const()[name = string("op_3083_begin_0"), val = tensor<int32, [4]>([0, 2, 0, 0])];
+            tensor<int32, [4]> var_3083_end_0 = const()[name = string("op_3083_end_0"), val = tensor<int32, [4]>([1, 3, 128, 256])];
+            tensor<bool, [4]> var_3083_end_mask_0 = const()[name = string("op_3083_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_3083_cast_fp16 = slice_by_index(begin = var_3083_begin_0, end = var_3083_end_0, end_mask = var_3083_end_mask_0, x = value_heads_29_cast_fp16)[name = string("op_3083_cast_fp16")];
+            tensor<int32, [4]> var_3095_begin_0 = const()[name = string("op_3095_begin_0"), val = tensor<int32, [4]>([0, 3, 0, 0])];
+            tensor<int32, [4]> var_3095_end_0 = const()[name = string("op_3095_end_0"), val = tensor<int32, [4]>([1, 4, 128, 256])];
+            tensor<bool, [4]> var_3095_end_mask_0 = const()[name = string("op_3095_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_3095_cast_fp16 = slice_by_index(begin = var_3095_begin_0, end = var_3095_end_0, end_mask = var_3095_end_mask_0, x = key_heads_29_cast_fp16)[name = string("op_3095_cast_fp16")];
+            tensor<int32, [4]> var_3099_begin_0 = const()[name = string("op_3099_begin_0"), val = tensor<int32, [4]>([0, 3, 0, 0])];
+            tensor<int32, [4]> var_3099_end_0 = const()[name = string("op_3099_end_0"), val = tensor<int32, [4]>([1, 4, 128, 256])];
+            tensor<bool, [4]> var_3099_end_mask_0 = const()[name = string("op_3099_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_3099_cast_fp16 = slice_by_index(begin = var_3099_begin_0, end = var_3099_end_0, end_mask = var_3099_end_mask_0, x = value_heads_29_cast_fp16)[name = string("op_3099_cast_fp16")];
+            tensor<int32, [4]> var_3111_begin_0 = const()[name = string("op_3111_begin_0"), val = tensor<int32, [4]>([0, 4, 0, 0])];
+            tensor<int32, [4]> var_3111_end_0 = const()[name = string("op_3111_end_0"), val = tensor<int32, [4]>([1, 5, 128, 256])];
+            tensor<bool, [4]> var_3111_end_mask_0 = const()[name = string("op_3111_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_3111_cast_fp16 = slice_by_index(begin = var_3111_begin_0, end = var_3111_end_0, end_mask = var_3111_end_mask_0, x = key_heads_29_cast_fp16)[name = string("op_3111_cast_fp16")];
+            tensor<int32, [4]> var_3115_begin_0 = const()[name = string("op_3115_begin_0"), val = tensor<int32, [4]>([0, 4, 0, 0])];
+            tensor<int32, [4]> var_3115_end_0 = const()[name = string("op_3115_end_0"), val = tensor<int32, [4]>([1, 5, 128, 256])];
+            tensor<bool, [4]> var_3115_end_mask_0 = const()[name = string("op_3115_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_3115_cast_fp16 = slice_by_index(begin = var_3115_begin_0, end = var_3115_end_0, end_mask = var_3115_end_mask_0, x = value_heads_29_cast_fp16)[name = string("op_3115_cast_fp16")];
+            tensor<int32, [4]> var_3127_begin_0 = const()[name = string("op_3127_begin_0"), val = tensor<int32, [4]>([0, 5, 0, 0])];
+            tensor<int32, [4]> var_3127_end_0 = const()[name = string("op_3127_end_0"), val = tensor<int32, [4]>([1, 6, 128, 256])];
+            tensor<bool, [4]> var_3127_end_mask_0 = const()[name = string("op_3127_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_3127_cast_fp16 = slice_by_index(begin = var_3127_begin_0, end = var_3127_end_0, end_mask = var_3127_end_mask_0, x = key_heads_29_cast_fp16)[name = string("op_3127_cast_fp16")];
+            tensor<int32, [4]> var_3131_begin_0 = const()[name = string("op_3131_begin_0"), val = tensor<int32, [4]>([0, 5, 0, 0])];
+            tensor<int32, [4]> var_3131_end_0 = const()[name = string("op_3131_end_0"), val = tensor<int32, [4]>([1, 6, 128, 256])];
+            tensor<bool, [4]> var_3131_end_mask_0 = const()[name = string("op_3131_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_3131_cast_fp16 = slice_by_index(begin = var_3131_begin_0, end = var_3131_end_0, end_mask = var_3131_end_mask_0, x = value_heads_29_cast_fp16)[name = string("op_3131_cast_fp16")];
+            tensor<int32, [4]> var_3143_begin_0 = const()[name = string("op_3143_begin_0"), val = tensor<int32, [4]>([0, 6, 0, 0])];
+            tensor<int32, [4]> var_3143_end_0 = const()[name = string("op_3143_end_0"), val = tensor<int32, [4]>([1, 7, 128, 256])];
+            tensor<bool, [4]> var_3143_end_mask_0 = const()[name = string("op_3143_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_3143_cast_fp16 = slice_by_index(begin = var_3143_begin_0, end = var_3143_end_0, end_mask = var_3143_end_mask_0, x = key_heads_29_cast_fp16)[name = string("op_3143_cast_fp16")];
+            tensor<int32, [4]> var_3147_begin_0 = const()[name = string("op_3147_begin_0"), val = tensor<int32, [4]>([0, 6, 0, 0])];
+            tensor<int32, [4]> var_3147_end_0 = const()[name = string("op_3147_end_0"), val = tensor<int32, [4]>([1, 7, 128, 256])];
+            tensor<bool, [4]> var_3147_end_mask_0 = const()[name = string("op_3147_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_3147_cast_fp16 = slice_by_index(begin = var_3147_begin_0, end = var_3147_end_0, end_mask = var_3147_end_mask_0, x = value_heads_29_cast_fp16)[name = string("op_3147_cast_fp16")];
+            tensor<int32, [4]> var_3159_begin_0 = const()[name = string("op_3159_begin_0"), val = tensor<int32, [4]>([0, 7, 0, 0])];
+            tensor<int32, [4]> var_3159_end_0 = const()[name = string("op_3159_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_3159_end_mask_0 = const()[name = string("op_3159_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_3159_cast_fp16 = slice_by_index(begin = var_3159_begin_0, end = var_3159_end_0, end_mask = var_3159_end_mask_0, x = key_heads_29_cast_fp16)[name = string("op_3159_cast_fp16")];
+            tensor<int32, [4]> var_3163_begin_0 = const()[name = string("op_3163_begin_0"), val = tensor<int32, [4]>([0, 7, 0, 0])];
+            tensor<int32, [4]> var_3163_end_0 = const()[name = string("op_3163_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_3163_end_mask_0 = const()[name = string("op_3163_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_3163_cast_fp16 = slice_by_index(begin = var_3163_begin_0, end = var_3163_end_0, end_mask = var_3163_end_mask_0, x = value_heads_29_cast_fp16)[name = string("op_3163_cast_fp16")];
+            bool key_heads_31_interleave_0 = const()[name = string("key_heads_31_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 256]> key_heads_31_cast_fp16 = concat(axis = var_2889, interleave = key_heads_31_interleave_0, values = (var_3047_cast_fp16, var_3047_cast_fp16, var_3063_cast_fp16, var_3063_cast_fp16, var_3079_cast_fp16, var_3079_cast_fp16, var_3095_cast_fp16, var_3095_cast_fp16, var_3111_cast_fp16, var_3111_cast_fp16, var_3127_cast_fp16, var_3127_cast_fp16, var_3143_cast_fp16, var_3143_cast_fp16, var_3159_cast_fp16, var_3159_cast_fp16))[name = string("key_heads_31_cast_fp16")];
+            bool value_heads_31_interleave_0 = const()[name = string("value_heads_31_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 256]> value_heads_31_cast_fp16 = concat(axis = var_2889, interleave = value_heads_31_interleave_0, values = (var_3051_cast_fp16, var_3051_cast_fp16, var_3067_cast_fp16, var_3067_cast_fp16, var_3083_cast_fp16, var_3083_cast_fp16, var_3099_cast_fp16, var_3099_cast_fp16, var_3115_cast_fp16, var_3115_cast_fp16, var_3131_cast_fp16, var_3131_cast_fp16, var_3147_cast_fp16, var_3147_cast_fp16, var_3163_cast_fp16, var_3163_cast_fp16))[name = string("value_heads_31_cast_fp16")];
+            fp16 var_3186_to_fp16 = const()[name = string("op_3186_to_fp16"), val = fp16(0x1.6ap-4)];
+            tensor<fp16, [1, 16, 128, 1]> var_3187_cast_fp16 = mul(x = mh_q_45_cast_fp16, y = var_3186_to_fp16)[name = string("op_3187_cast_fp16")];
+            bool mh_w_29_transpose_x_0 = const()[name = string("mh_w_29_transpose_x_0"), val = bool(true)];
+            bool mh_w_29_transpose_y_0 = const()[name = string("mh_w_29_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 1, 256]> mh_w_29_cast_fp16 = matmul(transpose_x = mh_w_29_transpose_x_0, transpose_y = mh_w_29_transpose_y_0, x = var_3187_cast_fp16, y = key_heads_31_cast_fp16)[name = string("mh_w_29_cast_fp16")];
+            tensor<fp16, [1, 16, 1, 256]> mh_w_31_cast_fp16 = add(x = mh_w_29_cast_fp16, y = var_487_cast_fp16)[name = string("mh_w_31_cast_fp16")];
+            tensor<fp16, [1, 16, 1, 256]> var_3199_cast_fp16 = softmax(axis = var_2871, x = mh_w_31_cast_fp16)[name = string("op_3199_cast_fp16")];
+            bool attn_15_transpose_x_0 = const()[name = string("attn_15_transpose_x_0"), val = bool(false)];
+            bool attn_15_transpose_y_0 = const()[name = string("attn_15_transpose_y_0"), val = bool(true)];
+            tensor<fp16, [1, 16, 128, 1]> attn_15_cast_fp16 = matmul(transpose_x = attn_15_transpose_x_0, transpose_y = attn_15_transpose_y_0, x = value_heads_31_cast_fp16, y = var_3199_cast_fp16)[name = string("attn_15_cast_fp16")];
+            tensor<int32, [4]> var_3204 = const()[name = string("op_3204"), val = tensor<int32, [4]>([1, -1, 1, 1])];
+            tensor<fp16, [1, 2048, 1, 1]> input_57_cast_fp16 = reshape(shape = var_3204, x = attn_15_cast_fp16)[name = string("input_57_cast_fp16")];
+            string obj_67_pad_type_0 = const()[name = string("obj_67_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> obj_67_strides_0 = const()[name = string("obj_67_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> obj_67_pad_0 = const()[name = string("obj_67_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> obj_67_dilations_0 = const()[name = string("obj_67_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 obj_67_groups_0 = const()[name = string("obj_67_groups_0"), val = int32(1)];
+            tensor<fp16, [2048, 2048, 1, 1]> layers_7_self_attn_o_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [2048, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(360884480))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(365078848))))[name = string("layers_7_self_attn_o_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1, 1]> obj_67_cast_fp16 = conv(bias = layers_0_self_attn_q_proj_bias_to_fp16, dilations = obj_67_dilations_0, groups = obj_67_groups_0, pad = obj_67_pad_0, pad_type = obj_67_pad_type_0, strides = obj_67_strides_0, weight = layers_7_self_attn_o_proj_weight_to_fp16_palettized, x = input_57_cast_fp16)[name = string("obj_67_cast_fp16")];
+            tensor<fp16, [1, 2048, 1, 1]> inputs_61_cast_fp16 = add(x = inputs_55_cast_fp16, y = obj_67_cast_fp16)[name = string("inputs_61_cast_fp16")];
+            tensor<fp16, [1, 2048, 1, 1]> inputs_sq_63_cast_fp16 = mul(x = inputs_61_cast_fp16, y = inputs_61_cast_fp16)[name = string("inputs_sq_63_cast_fp16")];
+            tensor<int32, [1]> variance_63_axes_0 = const()[name = string("variance_63_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_63_keep_dims_0 = const()[name = string("variance_63_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 1, 1, 1]> variance_63_cast_fp16 = reduce_mean(axes = variance_63_axes_0, keep_dims = variance_63_keep_dims_0, x = inputs_sq_63_cast_fp16)[name = string("variance_63_cast_fp16")];
+            fp16 var_3222_to_fp16 = const()[name = string("op_3222_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 1, 1]> var_3223_cast_fp16 = add(x = variance_63_cast_fp16, y = var_3222_to_fp16)[name = string("op_3223_cast_fp16")];
+            fp32 var_3224_epsilon_0 = const()[name = string("op_3224_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 1, 1, 1]> var_3224_cast_fp16 = rsqrt(epsilon = var_3224_epsilon_0, x = var_3223_cast_fp16)[name = string("op_3224_cast_fp16")];
+            tensor<fp16, [1, 2048, 1, 1]> hidden_states_77_cast_fp16 = mul(x = inputs_61_cast_fp16, y = var_3224_cast_fp16)[name = string("hidden_states_77_cast_fp16")];
+            tensor<fp16, [1, 2048, 1, 1]> w_63_to_fp16 = const()[name = string("w_63_to_fp16"), val = tensor<fp16, [1, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(365079424)))];
+            tensor<fp16, [1, 2048, 1, 1]> input_59_cast_fp16 = mul(x = w_63_to_fp16, y = hidden_states_77_cast_fp16)[name = string("input_59_cast_fp16")];
+            string input_61_pad_type_0 = const()[name = string("input_61_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> input_61_strides_0 = const()[name = string("input_61_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> input_61_pad_0 = const()[name = string("input_61_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> input_61_dilations_0 = const()[name = string("input_61_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 input_61_groups_0 = const()[name = string("input_61_groups_0"), val = int32(1)];
+            tensor<fp16, [6144, 2048, 1, 1]> layers_7_mlp_gate_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [6144, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(365083584))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(377666560))))[name = string("layers_7_mlp_gate_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 6144, 1, 1]> input_61_cast_fp16 = conv(dilations = input_61_dilations_0, groups = input_61_groups_0, pad = input_61_pad_0, pad_type = input_61_pad_type_0, strides = input_61_strides_0, weight = layers_7_mlp_gate_proj_weight_to_fp16_palettized, x = input_59_cast_fp16)[name = string("input_61_cast_fp16")];
+            tensor<fp16, [1, 6144, 1, 1]> var_3238_cast_fp16 = silu(x = input_61_cast_fp16)[name = string("op_3238_cast_fp16")];
+            string var_3244_pad_type_0 = const()[name = string("op_3244_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_3244_strides_0 = const()[name = string("op_3244_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_3244_pad_0 = const()[name = string("op_3244_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_3244_dilations_0 = const()[name = string("op_3244_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_3244_groups_0 = const()[name = string("op_3244_groups_0"), val = int32(1)];
+            tensor<fp16, [6144, 2048, 1, 1]> layers_7_mlp_up_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [6144, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(377667136))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(390250112))))[name = string("layers_7_mlp_up_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 6144, 1, 1]> var_3244_cast_fp16 = conv(dilations = var_3244_dilations_0, groups = var_3244_groups_0, pad = var_3244_pad_0, pad_type = var_3244_pad_type_0, strides = var_3244_strides_0, weight = layers_7_mlp_up_proj_weight_to_fp16_palettized, x = input_59_cast_fp16)[name = string("op_3244_cast_fp16")];
+            tensor<fp16, [1, 6144, 1, 1]> input_63_cast_fp16 = mul(x = var_3238_cast_fp16, y = var_3244_cast_fp16)[name = string("input_63_cast_fp16")];
+            string hidden_states_79_pad_type_0 = const()[name = string("hidden_states_79_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> hidden_states_79_strides_0 = const()[name = string("hidden_states_79_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> hidden_states_79_pad_0 = const()[name = string("hidden_states_79_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> hidden_states_79_dilations_0 = const()[name = string("hidden_states_79_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 hidden_states_79_groups_0 = const()[name = string("hidden_states_79_groups_0"), val = int32(1)];
+            tensor<fp16, [2048, 6144, 1, 1]> layers_7_mlp_down_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [2048, 6144, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(390250688))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(402833664))))[name = string("layers_7_mlp_down_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1, 1]> hidden_states_79_cast_fp16 = conv(dilations = hidden_states_79_dilations_0, groups = hidden_states_79_groups_0, pad = hidden_states_79_pad_0, pad_type = hidden_states_79_pad_type_0, strides = hidden_states_79_strides_0, weight = layers_7_mlp_down_proj_weight_to_fp16_palettized, x = input_63_cast_fp16)[name = string("hidden_states_79_cast_fp16")];
+            tensor<fp16, [1, 2048, 1, 1]> inputs_63_cast_fp16 = add(x = inputs_61_cast_fp16, y = hidden_states_79_cast_fp16)[name = string("inputs_63_cast_fp16")];
+            int32 var_3258 = const()[name = string("op_3258"), val = int32(3)];
+            int32 var_3268 = const()[name = string("op_3268"), val = int32(-2)];
+            int32 var_3276 = const()[name = string("op_3276"), val = int32(1)];
+            tensor<fp16, [1, 2048, 1, 1]> inputs_sq_65_cast_fp16 = mul(x = inputs_63_cast_fp16, y = inputs_63_cast_fp16)[name = string("inputs_sq_65_cast_fp16")];
+            tensor<int32, [1]> variance_65_axes_0 = const()[name = string("variance_65_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_65_keep_dims_0 = const()[name = string("variance_65_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 1, 1, 1]> variance_65_cast_fp16 = reduce_mean(axes = variance_65_axes_0, keep_dims = variance_65_keep_dims_0, x = inputs_sq_65_cast_fp16)[name = string("variance_65_cast_fp16")];
+            fp16 var_3288_to_fp16 = const()[name = string("op_3288_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 1, 1]> var_3289_cast_fp16 = add(x = variance_65_cast_fp16, y = var_3288_to_fp16)[name = string("op_3289_cast_fp16")];
+            fp32 var_3290_epsilon_0 = const()[name = string("op_3290_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 1, 1, 1]> var_3290_cast_fp16 = rsqrt(epsilon = var_3290_epsilon_0, x = var_3289_cast_fp16)[name = string("op_3290_cast_fp16")];
+            tensor<fp16, [1, 2048, 1, 1]> hidden_states_81_cast_fp16 = mul(x = inputs_63_cast_fp16, y = var_3290_cast_fp16)[name = string("hidden_states_81_cast_fp16")];
+            tensor<fp16, [1, 2048, 1, 1]> w_65_to_fp16 = const()[name = string("w_65_to_fp16"), val = tensor<fp16, [1, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(402834240)))];
+            tensor<fp16, [1, 2048, 1, 1]> obj_69_cast_fp16 = mul(x = w_65_to_fp16, y = hidden_states_81_cast_fp16)[name = string("obj_69_cast_fp16")];
+            string query_49_pad_type_0 = const()[name = string("query_49_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> query_49_strides_0 = const()[name = string("query_49_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> query_49_pad_0 = const()[name = string("query_49_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> query_49_dilations_0 = const()[name = string("query_49_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 query_49_groups_0 = const()[name = string("query_49_groups_0"), val = int32(1)];
+            tensor<fp16, [2048, 2048, 1, 1]> layers_8_self_attn_q_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [2048, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(402838400))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(407032768))))[name = string("layers_8_self_attn_q_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1, 1]> query_49_cast_fp16 = conv(bias = layers_0_self_attn_q_proj_bias_to_fp16, dilations = query_49_dilations_0, groups = query_49_groups_0, pad = query_49_pad_0, pad_type = query_49_pad_type_0, strides = query_49_strides_0, weight = layers_8_self_attn_q_proj_weight_to_fp16_palettized, x = obj_69_cast_fp16)[name = string("query_49_cast_fp16")];
+            string current_key_33_pad_type_0 = const()[name = string("current_key_33_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> current_key_33_strides_0 = const()[name = string("current_key_33_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> current_key_33_pad_0 = const()[name = string("current_key_33_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> current_key_33_dilations_0 = const()[name = string("current_key_33_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 current_key_33_groups_0 = const()[name = string("current_key_33_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 2048, 1, 1]> layers_8_self_attn_k_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(407033344))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(409130560))))[name = string("layers_8_self_attn_k_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> current_key_33_cast_fp16 = conv(dilations = current_key_33_dilations_0, groups = current_key_33_groups_0, pad = current_key_33_pad_0, pad_type = current_key_33_pad_type_0, strides = current_key_33_strides_0, weight = layers_8_self_attn_k_proj_weight_to_fp16_palettized, x = obj_69_cast_fp16)[name = string("current_key_33_cast_fp16")];
+            string current_value_17_pad_type_0 = const()[name = string("current_value_17_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> current_value_17_strides_0 = const()[name = string("current_value_17_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> current_value_17_pad_0 = const()[name = string("current_value_17_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> current_value_17_dilations_0 = const()[name = string("current_value_17_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 current_value_17_groups_0 = const()[name = string("current_value_17_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 2048, 1, 1]> layers_8_self_attn_v_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(409131136))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(411228352))))[name = string("layers_8_self_attn_v_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> current_value_17_cast_fp16 = conv(bias = layers_0_self_attn_v_proj_bias_to_fp16, dilations = current_value_17_dilations_0, groups = current_value_17_groups_0, pad = current_value_17_pad_0, pad_type = current_value_17_pad_type_0, strides = current_value_17_strides_0, weight = layers_8_self_attn_v_proj_weight_to_fp16_palettized, x = obj_69_cast_fp16)[name = string("current_value_17_cast_fp16")];
+            tensor<int32, [4]> var_3327 = const()[name = string("op_3327"), val = tensor<int32, [4]>([16, 128, 1, 1])];
+            tensor<fp16, [16, 128, 1, 1]> inputs_65_cast_fp16 = reshape(shape = var_3327, x = query_49_cast_fp16)[name = string("inputs_65_cast_fp16")];
+            tensor<fp16, [16, 128, 1, 1]> inputs_sq_67_cast_fp16 = mul(x = inputs_65_cast_fp16, y = inputs_65_cast_fp16)[name = string("inputs_sq_67_cast_fp16")];
+            tensor<int32, [1]> variance_67_axes_0 = const()[name = string("variance_67_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_67_keep_dims_0 = const()[name = string("variance_67_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [16, 1, 1, 1]> variance_67_cast_fp16 = reduce_mean(axes = variance_67_axes_0, keep_dims = variance_67_keep_dims_0, x = inputs_sq_67_cast_fp16)[name = string("variance_67_cast_fp16")];
+            fp16 var_3333_to_fp16 = const()[name = string("op_3333_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [16, 1, 1, 1]> var_3334_cast_fp16 = add(x = variance_67_cast_fp16, y = var_3333_to_fp16)[name = string("op_3334_cast_fp16")];
+            fp32 var_3335_epsilon_0 = const()[name = string("op_3335_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [16, 1, 1, 1]> var_3335_cast_fp16 = rsqrt(epsilon = var_3335_epsilon_0, x = var_3334_cast_fp16)[name = string("op_3335_cast_fp16")];
+            tensor<fp16, [16, 128, 1, 1]> hidden_states_83_cast_fp16 = mul(x = inputs_65_cast_fp16, y = var_3335_cast_fp16)[name = string("hidden_states_83_cast_fp16")];
+            tensor<fp16, [1, 128, 1, 1]> w_67_to_fp16 = const()[name = string("w_67_to_fp16"), val = tensor<fp16, [1, 128, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(411228928)))];
+            tensor<fp16, [16, 128, 1, 1]> query_normed_17_cast_fp16 = mul(x = w_67_to_fp16, y = hidden_states_83_cast_fp16)[name = string("query_normed_17_cast_fp16")];
+            tensor<int32, [4]> var_3343 = const()[name = string("op_3343"), val = tensor<int32, [4]>([8, 128, 1, 1])];
+            tensor<fp16, [8, 128, 1, 1]> inputs_67_cast_fp16 = reshape(shape = var_3343, x = current_key_33_cast_fp16)[name = string("inputs_67_cast_fp16")];
+            tensor<fp16, [8, 128, 1, 1]> inputs_sq_69_cast_fp16 = mul(x = inputs_67_cast_fp16, y = inputs_67_cast_fp16)[name = string("inputs_sq_69_cast_fp16")];
+            tensor<int32, [1]> variance_69_axes_0 = const()[name = string("variance_69_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_69_keep_dims_0 = const()[name = string("variance_69_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [8, 1, 1, 1]> variance_69_cast_fp16 = reduce_mean(axes = variance_69_axes_0, keep_dims = variance_69_keep_dims_0, x = inputs_sq_69_cast_fp16)[name = string("variance_69_cast_fp16")];
+            fp16 var_3349_to_fp16 = const()[name = string("op_3349_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [8, 1, 1, 1]> var_3350_cast_fp16 = add(x = variance_69_cast_fp16, y = var_3349_to_fp16)[name = string("op_3350_cast_fp16")];
+            fp32 var_3351_epsilon_0 = const()[name = string("op_3351_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [8, 1, 1, 1]> var_3351_cast_fp16 = rsqrt(epsilon = var_3351_epsilon_0, x = var_3350_cast_fp16)[name = string("op_3351_cast_fp16")];
+            tensor<fp16, [8, 128, 1, 1]> hidden_states_85_cast_fp16 = mul(x = inputs_67_cast_fp16, y = var_3351_cast_fp16)[name = string("hidden_states_85_cast_fp16")];
+            tensor<fp16, [1, 128, 1, 1]> w_69_to_fp16 = const()[name = string("w_69_to_fp16"), val = tensor<fp16, [1, 128, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(411229248)))];
+            tensor<fp16, [8, 128, 1, 1]> current_key_normed_17_cast_fp16 = mul(x = w_69_to_fp16, y = hidden_states_85_cast_fp16)[name = string("current_key_normed_17_cast_fp16")];
+            tensor<int32, [4]> var_3369 = const()[name = string("op_3369"), val = tensor<int32, [4]>([1, 16, 128, -1])];
+            tensor<fp16, [1, 16, 128, 1]> mh_q_49_cast_fp16 = reshape(shape = var_3369, x = query_normed_17_cast_fp16)[name = string("mh_q_49_cast_fp16")];
+            tensor<int32, [4]> var_3371 = const()[name = string("op_3371"), val = tensor<int32, [4]>([1, 8, 128, -1])];
+            tensor<fp16, [1, 8, 128, 1]> mh_k_33_cast_fp16 = reshape(shape = var_3371, x = current_key_normed_17_cast_fp16)[name = string("mh_k_33_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> var_3375_cast_fp16 = mul(x = mh_q_49_cast_fp16, y = cos_1_cast_fp16)[name = string("op_3375_cast_fp16")];
+            tensor<int32, [4]> var_3380_begin_0 = const()[name = string("op_3380_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_3380_end_0 = const()[name = string("op_3380_end_0"), val = tensor<int32, [4]>([1, 16, 64, 1])];
+            tensor<bool, [4]> var_3380_end_mask_0 = const()[name = string("op_3380_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 16, 64, 1]> var_3380_cast_fp16 = slice_by_index(begin = var_3380_begin_0, end = var_3380_end_0, end_mask = var_3380_end_mask_0, x = mh_q_49_cast_fp16)[name = string("op_3380_cast_fp16")];
+            tensor<int32, [4]> var_3386_begin_0 = const()[name = string("op_3386_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_3386_end_0 = const()[name = string("op_3386_end_0"), val = tensor<int32, [4]>([1, 16, 128, 1])];
+            tensor<bool, [4]> var_3386_end_mask_0 = const()[name = string("op_3386_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 16, 64, 1]> var_3386_cast_fp16 = slice_by_index(begin = var_3386_begin_0, end = var_3386_end_0, end_mask = var_3386_end_mask_0, x = mh_q_49_cast_fp16)[name = string("op_3386_cast_fp16")];
+            fp16 const_201_promoted_to_fp16 = const()[name = string("const_201_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 16, 64, 1]> var_3388_cast_fp16 = mul(x = var_3386_cast_fp16, y = const_201_promoted_to_fp16)[name = string("op_3388_cast_fp16")];
+            bool var_3390_interleave_0 = const()[name = string("op_3390_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 1]> var_3390_cast_fp16 = concat(axis = var_3268, interleave = var_3390_interleave_0, values = (var_3388_cast_fp16, var_3380_cast_fp16))[name = string("op_3390_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> var_3391_cast_fp16 = mul(x = var_3390_cast_fp16, y = sin_1_cast_fp16)[name = string("op_3391_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> mh_q_51_cast_fp16 = add(x = var_3375_cast_fp16, y = var_3391_cast_fp16)[name = string("mh_q_51_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> var_3393_cast_fp16 = mul(x = mh_k_33_cast_fp16, y = cos_1_cast_fp16)[name = string("op_3393_cast_fp16")];
+            tensor<int32, [4]> var_3398_begin_0 = const()[name = string("op_3398_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_3398_end_0 = const()[name = string("op_3398_end_0"), val = tensor<int32, [4]>([1, 8, 64, 1])];
+            tensor<bool, [4]> var_3398_end_mask_0 = const()[name = string("op_3398_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 8, 64, 1]> var_3398_cast_fp16 = slice_by_index(begin = var_3398_begin_0, end = var_3398_end_0, end_mask = var_3398_end_mask_0, x = mh_k_33_cast_fp16)[name = string("op_3398_cast_fp16")];
+            tensor<int32, [4]> var_3404_begin_0 = const()[name = string("op_3404_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_3404_end_0 = const()[name = string("op_3404_end_0"), val = tensor<int32, [4]>([1, 8, 128, 1])];
+            tensor<bool, [4]> var_3404_end_mask_0 = const()[name = string("op_3404_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 8, 64, 1]> var_3404_cast_fp16 = slice_by_index(begin = var_3404_begin_0, end = var_3404_end_0, end_mask = var_3404_end_mask_0, x = mh_k_33_cast_fp16)[name = string("op_3404_cast_fp16")];
+            fp16 const_204_promoted_to_fp16 = const()[name = string("const_204_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 64, 1]> var_3406_cast_fp16 = mul(x = var_3404_cast_fp16, y = const_204_promoted_to_fp16)[name = string("op_3406_cast_fp16")];
+            bool var_3408_interleave_0 = const()[name = string("op_3408_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 128, 1]> var_3408_cast_fp16 = concat(axis = var_3268, interleave = var_3408_interleave_0, values = (var_3406_cast_fp16, var_3398_cast_fp16))[name = string("op_3408_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> var_3409_cast_fp16 = mul(x = var_3408_cast_fp16, y = sin_1_cast_fp16)[name = string("op_3409_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> mh_k_35_cast_fp16 = add(x = var_3393_cast_fp16, y = var_3409_cast_fp16)[name = string("mh_k_35_cast_fp16")];
+            tensor<int32, [4]> var_3413 = const()[name = string("op_3413"), val = tensor<int32, [4]>([1, 1024, 1, 1])];
+            tensor<fp16, [1, 1024, 1, 1]> current_key_35_cast_fp16 = reshape(shape = var_3413, x = mh_k_35_cast_fp16)[name = string("current_key_35_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_3420_cast_fp16 = mul(x = var_101_cast_fp16_8, y = var_323_cast_fp16)[name = string("op_3420_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_3421_cast_fp16 = mul(x = current_key_35_cast_fp16, y = var_321_cast_fp16)[name = string("op_3421_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> key_51_cast_fp16 = add(x = var_3420_cast_fp16, y = var_3421_cast_fp16)[name = string("key_51_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_3424_cast_fp16 = mul(x = var_132_cast_fp16_8, y = var_323_cast_fp16)[name = string("op_3424_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_3425_cast_fp16 = mul(x = current_value_17_cast_fp16, y = var_321_cast_fp16)[name = string("op_3425_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> value_33_cast_fp16 = add(x = var_3424_cast_fp16, y = var_3425_cast_fp16)[name = string("value_33_cast_fp16")];
+            tensor<int32, [4]> var_3429 = const()[name = string("op_3429"), val = tensor<int32, [4]>([1, 8, 128, 256])];
+            tensor<fp16, [1, 8, 128, 256]> key_heads_33_cast_fp16 = reshape(shape = var_3429, x = key_51_cast_fp16)[name = string("key_heads_33_cast_fp16")];
+            tensor<int32, [4]> var_3431 = const()[name = string("op_3431"), val = tensor<int32, [4]>([1, 8, 128, 256])];
+            tensor<fp16, [1, 8, 128, 256]> value_heads_33_cast_fp16 = reshape(shape = var_3431, x = value_33_cast_fp16)[name = string("value_heads_33_cast_fp16")];
+            tensor<int32, [4]> var_3434_begin_0 = const()[name = string("op_3434_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_3434_end_0 = const()[name = string("op_3434_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_3434_end_mask_0 = const()[name = string("op_3434_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_3434_cast_fp16 = slice_by_index(begin = var_3434_begin_0, end = var_3434_end_0, end_mask = var_3434_end_mask_0, x = key_heads_33_cast_fp16)[name = string("op_3434_cast_fp16")];
+            tensor<int32, [4]> var_3438_begin_0 = const()[name = string("op_3438_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_3438_end_0 = const()[name = string("op_3438_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_3438_end_mask_0 = const()[name = string("op_3438_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_3438_cast_fp16 = slice_by_index(begin = var_3438_begin_0, end = var_3438_end_0, end_mask = var_3438_end_mask_0, x = value_heads_33_cast_fp16)[name = string("op_3438_cast_fp16")];
+            tensor<int32, [4]> var_3450_begin_0 = const()[name = string("op_3450_begin_0"), val = tensor<int32, [4]>([0, 1, 0, 0])];
+            tensor<int32, [4]> var_3450_end_0 = const()[name = string("op_3450_end_0"), val = tensor<int32, [4]>([1, 2, 128, 256])];
+            tensor<bool, [4]> var_3450_end_mask_0 = const()[name = string("op_3450_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_3450_cast_fp16 = slice_by_index(begin = var_3450_begin_0, end = var_3450_end_0, end_mask = var_3450_end_mask_0, x = key_heads_33_cast_fp16)[name = string("op_3450_cast_fp16")];
+            tensor<int32, [4]> var_3454_begin_0 = const()[name = string("op_3454_begin_0"), val = tensor<int32, [4]>([0, 1, 0, 0])];
+            tensor<int32, [4]> var_3454_end_0 = const()[name = string("op_3454_end_0"), val = tensor<int32, [4]>([1, 2, 128, 256])];
+            tensor<bool, [4]> var_3454_end_mask_0 = const()[name = string("op_3454_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_3454_cast_fp16 = slice_by_index(begin = var_3454_begin_0, end = var_3454_end_0, end_mask = var_3454_end_mask_0, x = value_heads_33_cast_fp16)[name = string("op_3454_cast_fp16")];
+            tensor<int32, [4]> var_3466_begin_0 = const()[name = string("op_3466_begin_0"), val = tensor<int32, [4]>([0, 2, 0, 0])];
+            tensor<int32, [4]> var_3466_end_0 = const()[name = string("op_3466_end_0"), val = tensor<int32, [4]>([1, 3, 128, 256])];
+            tensor<bool, [4]> var_3466_end_mask_0 = const()[name = string("op_3466_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_3466_cast_fp16 = slice_by_index(begin = var_3466_begin_0, end = var_3466_end_0, end_mask = var_3466_end_mask_0, x = key_heads_33_cast_fp16)[name = string("op_3466_cast_fp16")];
+            tensor<int32, [4]> var_3470_begin_0 = const()[name = string("op_3470_begin_0"), val = tensor<int32, [4]>([0, 2, 0, 0])];
+            tensor<int32, [4]> var_3470_end_0 = const()[name = string("op_3470_end_0"), val = tensor<int32, [4]>([1, 3, 128, 256])];
+            tensor<bool, [4]> var_3470_end_mask_0 = const()[name = string("op_3470_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_3470_cast_fp16 = slice_by_index(begin = var_3470_begin_0, end = var_3470_end_0, end_mask = var_3470_end_mask_0, x = value_heads_33_cast_fp16)[name = string("op_3470_cast_fp16")];
+            tensor<int32, [4]> var_3482_begin_0 = const()[name = string("op_3482_begin_0"), val = tensor<int32, [4]>([0, 3, 0, 0])];
+            tensor<int32, [4]> var_3482_end_0 = const()[name = string("op_3482_end_0"), val = tensor<int32, [4]>([1, 4, 128, 256])];
+            tensor<bool, [4]> var_3482_end_mask_0 = const()[name = string("op_3482_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_3482_cast_fp16 = slice_by_index(begin = var_3482_begin_0, end = var_3482_end_0, end_mask = var_3482_end_mask_0, x = key_heads_33_cast_fp16)[name = string("op_3482_cast_fp16")];
+            tensor<int32, [4]> var_3486_begin_0 = const()[name = string("op_3486_begin_0"), val = tensor<int32, [4]>([0, 3, 0, 0])];
+            tensor<int32, [4]> var_3486_end_0 = const()[name = string("op_3486_end_0"), val = tensor<int32, [4]>([1, 4, 128, 256])];
+            tensor<bool, [4]> var_3486_end_mask_0 = const()[name = string("op_3486_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_3486_cast_fp16 = slice_by_index(begin = var_3486_begin_0, end = var_3486_end_0, end_mask = var_3486_end_mask_0, x = value_heads_33_cast_fp16)[name = string("op_3486_cast_fp16")];
+            tensor<int32, [4]> var_3498_begin_0 = const()[name = string("op_3498_begin_0"), val = tensor<int32, [4]>([0, 4, 0, 0])];
+            tensor<int32, [4]> var_3498_end_0 = const()[name = string("op_3498_end_0"), val = tensor<int32, [4]>([1, 5, 128, 256])];
+            tensor<bool, [4]> var_3498_end_mask_0 = const()[name = string("op_3498_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_3498_cast_fp16 = slice_by_index(begin = var_3498_begin_0, end = var_3498_end_0, end_mask = var_3498_end_mask_0, x = key_heads_33_cast_fp16)[name = string("op_3498_cast_fp16")];
+            tensor<int32, [4]> var_3502_begin_0 = const()[name = string("op_3502_begin_0"), val = tensor<int32, [4]>([0, 4, 0, 0])];
+            tensor<int32, [4]> var_3502_end_0 = const()[name = string("op_3502_end_0"), val = tensor<int32, [4]>([1, 5, 128, 256])];
+            tensor<bool, [4]> var_3502_end_mask_0 = const()[name = string("op_3502_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_3502_cast_fp16 = slice_by_index(begin = var_3502_begin_0, end = var_3502_end_0, end_mask = var_3502_end_mask_0, x = value_heads_33_cast_fp16)[name = string("op_3502_cast_fp16")];
+            tensor<int32, [4]> var_3514_begin_0 = const()[name = string("op_3514_begin_0"), val = tensor<int32, [4]>([0, 5, 0, 0])];
+            tensor<int32, [4]> var_3514_end_0 = const()[name = string("op_3514_end_0"), val = tensor<int32, [4]>([1, 6, 128, 256])];
+            tensor<bool, [4]> var_3514_end_mask_0 = const()[name = string("op_3514_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_3514_cast_fp16 = slice_by_index(begin = var_3514_begin_0, end = var_3514_end_0, end_mask = var_3514_end_mask_0, x = key_heads_33_cast_fp16)[name = string("op_3514_cast_fp16")];
+            tensor<int32, [4]> var_3518_begin_0 = const()[name = string("op_3518_begin_0"), val = tensor<int32, [4]>([0, 5, 0, 0])];
+            tensor<int32, [4]> var_3518_end_0 = const()[name = string("op_3518_end_0"), val = tensor<int32, [4]>([1, 6, 128, 256])];
+            tensor<bool, [4]> var_3518_end_mask_0 = const()[name = string("op_3518_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_3518_cast_fp16 = slice_by_index(begin = var_3518_begin_0, end = var_3518_end_0, end_mask = var_3518_end_mask_0, x = value_heads_33_cast_fp16)[name = string("op_3518_cast_fp16")];
+            tensor<int32, [4]> var_3530_begin_0 = const()[name = string("op_3530_begin_0"), val = tensor<int32, [4]>([0, 6, 0, 0])];
+            tensor<int32, [4]> var_3530_end_0 = const()[name = string("op_3530_end_0"), val = tensor<int32, [4]>([1, 7, 128, 256])];
+            tensor<bool, [4]> var_3530_end_mask_0 = const()[name = string("op_3530_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_3530_cast_fp16 = slice_by_index(begin = var_3530_begin_0, end = var_3530_end_0, end_mask = var_3530_end_mask_0, x = key_heads_33_cast_fp16)[name = string("op_3530_cast_fp16")];
+            tensor<int32, [4]> var_3534_begin_0 = const()[name = string("op_3534_begin_0"), val = tensor<int32, [4]>([0, 6, 0, 0])];
+            tensor<int32, [4]> var_3534_end_0 = const()[name = string("op_3534_end_0"), val = tensor<int32, [4]>([1, 7, 128, 256])];
+            tensor<bool, [4]> var_3534_end_mask_0 = const()[name = string("op_3534_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_3534_cast_fp16 = slice_by_index(begin = var_3534_begin_0, end = var_3534_end_0, end_mask = var_3534_end_mask_0, x = value_heads_33_cast_fp16)[name = string("op_3534_cast_fp16")];
+            tensor<int32, [4]> var_3546_begin_0 = const()[name = string("op_3546_begin_0"), val = tensor<int32, [4]>([0, 7, 0, 0])];
+            tensor<int32, [4]> var_3546_end_0 = const()[name = string("op_3546_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_3546_end_mask_0 = const()[name = string("op_3546_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_3546_cast_fp16 = slice_by_index(begin = var_3546_begin_0, end = var_3546_end_0, end_mask = var_3546_end_mask_0, x = key_heads_33_cast_fp16)[name = string("op_3546_cast_fp16")];
+            tensor<int32, [4]> var_3550_begin_0 = const()[name = string("op_3550_begin_0"), val = tensor<int32, [4]>([0, 7, 0, 0])];
+            tensor<int32, [4]> var_3550_end_0 = const()[name = string("op_3550_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_3550_end_mask_0 = const()[name = string("op_3550_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_3550_cast_fp16 = slice_by_index(begin = var_3550_begin_0, end = var_3550_end_0, end_mask = var_3550_end_mask_0, x = value_heads_33_cast_fp16)[name = string("op_3550_cast_fp16")];
+            bool key_heads_35_interleave_0 = const()[name = string("key_heads_35_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 256]> key_heads_35_cast_fp16 = concat(axis = var_3276, interleave = key_heads_35_interleave_0, values = (var_3434_cast_fp16, var_3434_cast_fp16, var_3450_cast_fp16, var_3450_cast_fp16, var_3466_cast_fp16, var_3466_cast_fp16, var_3482_cast_fp16, var_3482_cast_fp16, var_3498_cast_fp16, var_3498_cast_fp16, var_3514_cast_fp16, var_3514_cast_fp16, var_3530_cast_fp16, var_3530_cast_fp16, var_3546_cast_fp16, var_3546_cast_fp16))[name = string("key_heads_35_cast_fp16")];
+            bool value_heads_35_interleave_0 = const()[name = string("value_heads_35_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 256]> value_heads_35_cast_fp16 = concat(axis = var_3276, interleave = value_heads_35_interleave_0, values = (var_3438_cast_fp16, var_3438_cast_fp16, var_3454_cast_fp16, var_3454_cast_fp16, var_3470_cast_fp16, var_3470_cast_fp16, var_3486_cast_fp16, var_3486_cast_fp16, var_3502_cast_fp16, var_3502_cast_fp16, var_3518_cast_fp16, var_3518_cast_fp16, var_3534_cast_fp16, var_3534_cast_fp16, var_3550_cast_fp16, var_3550_cast_fp16))[name = string("value_heads_35_cast_fp16")];
+            fp16 var_3573_to_fp16 = const()[name = string("op_3573_to_fp16"), val = fp16(0x1.6ap-4)];
+            tensor<fp16, [1, 16, 128, 1]> var_3574_cast_fp16 = mul(x = mh_q_51_cast_fp16, y = var_3573_to_fp16)[name = string("op_3574_cast_fp16")];
+            bool mh_w_33_transpose_x_0 = const()[name = string("mh_w_33_transpose_x_0"), val = bool(true)];
+            bool mh_w_33_transpose_y_0 = const()[name = string("mh_w_33_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 1, 256]> mh_w_33_cast_fp16 = matmul(transpose_x = mh_w_33_transpose_x_0, transpose_y = mh_w_33_transpose_y_0, x = var_3574_cast_fp16, y = key_heads_35_cast_fp16)[name = string("mh_w_33_cast_fp16")];
+            tensor<fp16, [1, 16, 1, 256]> mh_w_35_cast_fp16 = add(x = mh_w_33_cast_fp16, y = var_487_cast_fp16)[name = string("mh_w_35_cast_fp16")];
+            tensor<fp16, [1, 16, 1, 256]> var_3586_cast_fp16 = softmax(axis = var_3258, x = mh_w_35_cast_fp16)[name = string("op_3586_cast_fp16")];
+            bool attn_17_transpose_x_0 = const()[name = string("attn_17_transpose_x_0"), val = bool(false)];
+            bool attn_17_transpose_y_0 = const()[name = string("attn_17_transpose_y_0"), val = bool(true)];
+            tensor<fp16, [1, 16, 128, 1]> attn_17_cast_fp16 = matmul(transpose_x = attn_17_transpose_x_0, transpose_y = attn_17_transpose_y_0, x = value_heads_35_cast_fp16, y = var_3586_cast_fp16)[name = string("attn_17_cast_fp16")];
+            tensor<int32, [4]> var_3591 = const()[name = string("op_3591"), val = tensor<int32, [4]>([1, -1, 1, 1])];
+            tensor<fp16, [1, 2048, 1, 1]> input_65_cast_fp16 = reshape(shape = var_3591, x = attn_17_cast_fp16)[name = string("input_65_cast_fp16")];
+            string obj_75_pad_type_0 = const()[name = string("obj_75_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> obj_75_strides_0 = const()[name = string("obj_75_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> obj_75_pad_0 = const()[name = string("obj_75_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> obj_75_dilations_0 = const()[name = string("obj_75_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 obj_75_groups_0 = const()[name = string("obj_75_groups_0"), val = int32(1)];
+            tensor<fp16, [2048, 2048, 1, 1]> layers_8_self_attn_o_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [2048, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(411229568))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(415423936))))[name = string("layers_8_self_attn_o_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1, 1]> obj_75_cast_fp16 = conv(bias = layers_0_self_attn_q_proj_bias_to_fp16, dilations = obj_75_dilations_0, groups = obj_75_groups_0, pad = obj_75_pad_0, pad_type = obj_75_pad_type_0, strides = obj_75_strides_0, weight = layers_8_self_attn_o_proj_weight_to_fp16_palettized, x = input_65_cast_fp16)[name = string("obj_75_cast_fp16")];
+            tensor<fp16, [1, 2048, 1, 1]> inputs_69_cast_fp16 = add(x = inputs_63_cast_fp16, y = obj_75_cast_fp16)[name = string("inputs_69_cast_fp16")];
+            tensor<fp16, [1, 2048, 1, 1]> inputs_sq_71_cast_fp16 = mul(x = inputs_69_cast_fp16, y = inputs_69_cast_fp16)[name = string("inputs_sq_71_cast_fp16")];
+            tensor<int32, [1]> variance_71_axes_0 = const()[name = string("variance_71_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_71_keep_dims_0 = const()[name = string("variance_71_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 1, 1, 1]> variance_71_cast_fp16 = reduce_mean(axes = variance_71_axes_0, keep_dims = variance_71_keep_dims_0, x = inputs_sq_71_cast_fp16)[name = string("variance_71_cast_fp16")];
+            fp16 var_3609_to_fp16 = const()[name = string("op_3609_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 1, 1]> var_3610_cast_fp16 = add(x = variance_71_cast_fp16, y = var_3609_to_fp16)[name = string("op_3610_cast_fp16")];
+            fp32 var_3611_epsilon_0 = const()[name = string("op_3611_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 1, 1, 1]> var_3611_cast_fp16 = rsqrt(epsilon = var_3611_epsilon_0, x = var_3610_cast_fp16)[name = string("op_3611_cast_fp16")];
+            tensor<fp16, [1, 2048, 1, 1]> hidden_states_87_cast_fp16 = mul(x = inputs_69_cast_fp16, y = var_3611_cast_fp16)[name = string("hidden_states_87_cast_fp16")];
+            tensor<fp16, [1, 2048, 1, 1]> w_71_to_fp16 = const()[name = string("w_71_to_fp16"), val = tensor<fp16, [1, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(415424512)))];
+            tensor<fp16, [1, 2048, 1, 1]> input_67_cast_fp16 = mul(x = w_71_to_fp16, y = hidden_states_87_cast_fp16)[name = string("input_67_cast_fp16")];
+            string input_69_pad_type_0 = const()[name = string("input_69_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> input_69_strides_0 = const()[name = string("input_69_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> input_69_pad_0 = const()[name = string("input_69_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> input_69_dilations_0 = const()[name = string("input_69_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 input_69_groups_0 = const()[name = string("input_69_groups_0"), val = int32(1)];
+            tensor<fp16, [6144, 2048, 1, 1]> layers_8_mlp_gate_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [6144, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(415428672))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(428011648))))[name = string("layers_8_mlp_gate_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 6144, 1, 1]> input_69_cast_fp16 = conv(dilations = input_69_dilations_0, groups = input_69_groups_0, pad = input_69_pad_0, pad_type = input_69_pad_type_0, strides = input_69_strides_0, weight = layers_8_mlp_gate_proj_weight_to_fp16_palettized, x = input_67_cast_fp16)[name = string("input_69_cast_fp16")];
+            tensor<fp16, [1, 6144, 1, 1]> var_3625_cast_fp16 = silu(x = input_69_cast_fp16)[name = string("op_3625_cast_fp16")];
+            string var_3631_pad_type_0 = const()[name = string("op_3631_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_3631_strides_0 = const()[name = string("op_3631_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_3631_pad_0 = const()[name = string("op_3631_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_3631_dilations_0 = const()[name = string("op_3631_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_3631_groups_0 = const()[name = string("op_3631_groups_0"), val = int32(1)];
+            tensor<fp16, [6144, 2048, 1, 1]> layers_8_mlp_up_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [6144, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(428012224))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(440595200))))[name = string("layers_8_mlp_up_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 6144, 1, 1]> var_3631_cast_fp16 = conv(dilations = var_3631_dilations_0, groups = var_3631_groups_0, pad = var_3631_pad_0, pad_type = var_3631_pad_type_0, strides = var_3631_strides_0, weight = layers_8_mlp_up_proj_weight_to_fp16_palettized, x = input_67_cast_fp16)[name = string("op_3631_cast_fp16")];
+            tensor<fp16, [1, 6144, 1, 1]> input_71_cast_fp16 = mul(x = var_3625_cast_fp16, y = var_3631_cast_fp16)[name = string("input_71_cast_fp16")];
+            string hidden_states_89_pad_type_0 = const()[name = string("hidden_states_89_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> hidden_states_89_strides_0 = const()[name = string("hidden_states_89_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> hidden_states_89_pad_0 = const()[name = string("hidden_states_89_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> hidden_states_89_dilations_0 = const()[name = string("hidden_states_89_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 hidden_states_89_groups_0 = const()[name = string("hidden_states_89_groups_0"), val = int32(1)];
+            tensor<fp16, [2048, 6144, 1, 1]> layers_8_mlp_down_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [2048, 6144, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(440595776))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(453178752))))[name = string("layers_8_mlp_down_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1, 1]> hidden_states_89_cast_fp16 = conv(dilations = hidden_states_89_dilations_0, groups = hidden_states_89_groups_0, pad = hidden_states_89_pad_0, pad_type = hidden_states_89_pad_type_0, strides = hidden_states_89_strides_0, weight = layers_8_mlp_down_proj_weight_to_fp16_palettized, x = input_71_cast_fp16)[name = string("hidden_states_89_cast_fp16")];
+            tensor<fp16, [1, 2048, 1, 1]> inputs_71_cast_fp16 = add(x = inputs_69_cast_fp16, y = hidden_states_89_cast_fp16)[name = string("inputs_71_cast_fp16")];
+            int32 var_3645 = const()[name = string("op_3645"), val = int32(3)];
+            int32 var_3655 = const()[name = string("op_3655"), val = int32(-2)];
+            int32 var_3663 = const()[name = string("op_3663"), val = int32(1)];
+            tensor<fp16, [1, 2048, 1, 1]> inputs_sq_73_cast_fp16 = mul(x = inputs_71_cast_fp16, y = inputs_71_cast_fp16)[name = string("inputs_sq_73_cast_fp16")];
+            tensor<int32, [1]> variance_73_axes_0 = const()[name = string("variance_73_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_73_keep_dims_0 = const()[name = string("variance_73_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 1, 1, 1]> variance_73_cast_fp16 = reduce_mean(axes = variance_73_axes_0, keep_dims = variance_73_keep_dims_0, x = inputs_sq_73_cast_fp16)[name = string("variance_73_cast_fp16")];
+            fp16 var_3675_to_fp16 = const()[name = string("op_3675_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 1, 1]> var_3676_cast_fp16 = add(x = variance_73_cast_fp16, y = var_3675_to_fp16)[name = string("op_3676_cast_fp16")];
+            fp32 var_3677_epsilon_0 = const()[name = string("op_3677_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 1, 1, 1]> var_3677_cast_fp16 = rsqrt(epsilon = var_3677_epsilon_0, x = var_3676_cast_fp16)[name = string("op_3677_cast_fp16")];
+            tensor<fp16, [1, 2048, 1, 1]> hidden_states_91_cast_fp16 = mul(x = inputs_71_cast_fp16, y = var_3677_cast_fp16)[name = string("hidden_states_91_cast_fp16")];
+            tensor<fp16, [1, 2048, 1, 1]> w_73_to_fp16 = const()[name = string("w_73_to_fp16"), val = tensor<fp16, [1, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(453179328)))];
+            tensor<fp16, [1, 2048, 1, 1]> obj_77_cast_fp16 = mul(x = w_73_to_fp16, y = hidden_states_91_cast_fp16)[name = string("obj_77_cast_fp16")];
+            string query_55_pad_type_0 = const()[name = string("query_55_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> query_55_strides_0 = const()[name = string("query_55_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> query_55_pad_0 = const()[name = string("query_55_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> query_55_dilations_0 = const()[name = string("query_55_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 query_55_groups_0 = const()[name = string("query_55_groups_0"), val = int32(1)];
+            tensor<fp16, [2048, 2048, 1, 1]> layers_9_self_attn_q_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [2048, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(453183488))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(457377856))))[name = string("layers_9_self_attn_q_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1, 1]> query_55_cast_fp16 = conv(bias = layers_0_self_attn_q_proj_bias_to_fp16, dilations = query_55_dilations_0, groups = query_55_groups_0, pad = query_55_pad_0, pad_type = query_55_pad_type_0, strides = query_55_strides_0, weight = layers_9_self_attn_q_proj_weight_to_fp16_palettized, x = obj_77_cast_fp16)[name = string("query_55_cast_fp16")];
+            string current_key_37_pad_type_0 = const()[name = string("current_key_37_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> current_key_37_strides_0 = const()[name = string("current_key_37_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> current_key_37_pad_0 = const()[name = string("current_key_37_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> current_key_37_dilations_0 = const()[name = string("current_key_37_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 current_key_37_groups_0 = const()[name = string("current_key_37_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 2048, 1, 1]> layers_9_self_attn_k_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(457378432))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(459475648))))[name = string("layers_9_self_attn_k_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> current_key_37_cast_fp16 = conv(dilations = current_key_37_dilations_0, groups = current_key_37_groups_0, pad = current_key_37_pad_0, pad_type = current_key_37_pad_type_0, strides = current_key_37_strides_0, weight = layers_9_self_attn_k_proj_weight_to_fp16_palettized, x = obj_77_cast_fp16)[name = string("current_key_37_cast_fp16")];
+            string current_value_19_pad_type_0 = const()[name = string("current_value_19_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> current_value_19_strides_0 = const()[name = string("current_value_19_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> current_value_19_pad_0 = const()[name = string("current_value_19_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> current_value_19_dilations_0 = const()[name = string("current_value_19_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 current_value_19_groups_0 = const()[name = string("current_value_19_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 2048, 1, 1]> layers_9_self_attn_v_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(459476224))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(461573440))))[name = string("layers_9_self_attn_v_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> current_value_19_cast_fp16 = conv(bias = layers_0_self_attn_v_proj_bias_to_fp16, dilations = current_value_19_dilations_0, groups = current_value_19_groups_0, pad = current_value_19_pad_0, pad_type = current_value_19_pad_type_0, strides = current_value_19_strides_0, weight = layers_9_self_attn_v_proj_weight_to_fp16_palettized, x = obj_77_cast_fp16)[name = string("current_value_19_cast_fp16")];
+            tensor<int32, [4]> var_3714 = const()[name = string("op_3714"), val = tensor<int32, [4]>([16, 128, 1, 1])];
+            tensor<fp16, [16, 128, 1, 1]> inputs_73_cast_fp16 = reshape(shape = var_3714, x = query_55_cast_fp16)[name = string("inputs_73_cast_fp16")];
+            tensor<fp16, [16, 128, 1, 1]> inputs_sq_75_cast_fp16 = mul(x = inputs_73_cast_fp16, y = inputs_73_cast_fp16)[name = string("inputs_sq_75_cast_fp16")];
+            tensor<int32, [1]> variance_75_axes_0 = const()[name = string("variance_75_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_75_keep_dims_0 = const()[name = string("variance_75_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [16, 1, 1, 1]> variance_75_cast_fp16 = reduce_mean(axes = variance_75_axes_0, keep_dims = variance_75_keep_dims_0, x = inputs_sq_75_cast_fp16)[name = string("variance_75_cast_fp16")];
+            fp16 var_3720_to_fp16 = const()[name = string("op_3720_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [16, 1, 1, 1]> var_3721_cast_fp16 = add(x = variance_75_cast_fp16, y = var_3720_to_fp16)[name = string("op_3721_cast_fp16")];
+            fp32 var_3722_epsilon_0 = const()[name = string("op_3722_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [16, 1, 1, 1]> var_3722_cast_fp16 = rsqrt(epsilon = var_3722_epsilon_0, x = var_3721_cast_fp16)[name = string("op_3722_cast_fp16")];
+            tensor<fp16, [16, 128, 1, 1]> hidden_states_93_cast_fp16 = mul(x = inputs_73_cast_fp16, y = var_3722_cast_fp16)[name = string("hidden_states_93_cast_fp16")];
+            tensor<fp16, [1, 128, 1, 1]> w_75_to_fp16 = const()[name = string("w_75_to_fp16"), val = tensor<fp16, [1, 128, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(461574016)))];
+            tensor<fp16, [16, 128, 1, 1]> query_normed_19_cast_fp16 = mul(x = w_75_to_fp16, y = hidden_states_93_cast_fp16)[name = string("query_normed_19_cast_fp16")];
+            tensor<int32, [4]> var_3730 = const()[name = string("op_3730"), val = tensor<int32, [4]>([8, 128, 1, 1])];
+            tensor<fp16, [8, 128, 1, 1]> inputs_75_cast_fp16 = reshape(shape = var_3730, x = current_key_37_cast_fp16)[name = string("inputs_75_cast_fp16")];
+            tensor<fp16, [8, 128, 1, 1]> inputs_sq_77_cast_fp16 = mul(x = inputs_75_cast_fp16, y = inputs_75_cast_fp16)[name = string("inputs_sq_77_cast_fp16")];
+            tensor<int32, [1]> variance_77_axes_0 = const()[name = string("variance_77_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_77_keep_dims_0 = const()[name = string("variance_77_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [8, 1, 1, 1]> variance_77_cast_fp16 = reduce_mean(axes = variance_77_axes_0, keep_dims = variance_77_keep_dims_0, x = inputs_sq_77_cast_fp16)[name = string("variance_77_cast_fp16")];
+            fp16 var_3736_to_fp16 = const()[name = string("op_3736_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [8, 1, 1, 1]> var_3737_cast_fp16 = add(x = variance_77_cast_fp16, y = var_3736_to_fp16)[name = string("op_3737_cast_fp16")];
+            fp32 var_3738_epsilon_0 = const()[name = string("op_3738_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [8, 1, 1, 1]> var_3738_cast_fp16 = rsqrt(epsilon = var_3738_epsilon_0, x = var_3737_cast_fp16)[name = string("op_3738_cast_fp16")];
+            tensor<fp16, [8, 128, 1, 1]> hidden_states_95_cast_fp16 = mul(x = inputs_75_cast_fp16, y = var_3738_cast_fp16)[name = string("hidden_states_95_cast_fp16")];
+            tensor<fp16, [1, 128, 1, 1]> w_77_to_fp16 = const()[name = string("w_77_to_fp16"), val = tensor<fp16, [1, 128, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(461574336)))];
+            tensor<fp16, [8, 128, 1, 1]> current_key_normed_19_cast_fp16 = mul(x = w_77_to_fp16, y = hidden_states_95_cast_fp16)[name = string("current_key_normed_19_cast_fp16")];
+            tensor<int32, [4]> var_3756 = const()[name = string("op_3756"), val = tensor<int32, [4]>([1, 16, 128, -1])];
+            tensor<fp16, [1, 16, 128, 1]> mh_q_55_cast_fp16 = reshape(shape = var_3756, x = query_normed_19_cast_fp16)[name = string("mh_q_55_cast_fp16")];
+            tensor<int32, [4]> var_3758 = const()[name = string("op_3758"), val = tensor<int32, [4]>([1, 8, 128, -1])];
+            tensor<fp16, [1, 8, 128, 1]> mh_k_37_cast_fp16 = reshape(shape = var_3758, x = current_key_normed_19_cast_fp16)[name = string("mh_k_37_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> var_3762_cast_fp16 = mul(x = mh_q_55_cast_fp16, y = cos_1_cast_fp16)[name = string("op_3762_cast_fp16")];
+            tensor<int32, [4]> var_3767_begin_0 = const()[name = string("op_3767_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_3767_end_0 = const()[name = string("op_3767_end_0"), val = tensor<int32, [4]>([1, 16, 64, 1])];
+            tensor<bool, [4]> var_3767_end_mask_0 = const()[name = string("op_3767_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 16, 64, 1]> var_3767_cast_fp16 = slice_by_index(begin = var_3767_begin_0, end = var_3767_end_0, end_mask = var_3767_end_mask_0, x = mh_q_55_cast_fp16)[name = string("op_3767_cast_fp16")];
+            tensor<int32, [4]> var_3773_begin_0 = const()[name = string("op_3773_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_3773_end_0 = const()[name = string("op_3773_end_0"), val = tensor<int32, [4]>([1, 16, 128, 1])];
+            tensor<bool, [4]> var_3773_end_mask_0 = const()[name = string("op_3773_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 16, 64, 1]> var_3773_cast_fp16 = slice_by_index(begin = var_3773_begin_0, end = var_3773_end_0, end_mask = var_3773_end_mask_0, x = mh_q_55_cast_fp16)[name = string("op_3773_cast_fp16")];
+            fp16 const_224_promoted_to_fp16 = const()[name = string("const_224_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 16, 64, 1]> var_3775_cast_fp16 = mul(x = var_3773_cast_fp16, y = const_224_promoted_to_fp16)[name = string("op_3775_cast_fp16")];
+            bool var_3777_interleave_0 = const()[name = string("op_3777_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 1]> var_3777_cast_fp16 = concat(axis = var_3655, interleave = var_3777_interleave_0, values = (var_3775_cast_fp16, var_3767_cast_fp16))[name = string("op_3777_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> var_3778_cast_fp16 = mul(x = var_3777_cast_fp16, y = sin_1_cast_fp16)[name = string("op_3778_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> mh_q_57_cast_fp16 = add(x = var_3762_cast_fp16, y = var_3778_cast_fp16)[name = string("mh_q_57_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> var_3780_cast_fp16 = mul(x = mh_k_37_cast_fp16, y = cos_1_cast_fp16)[name = string("op_3780_cast_fp16")];
+            tensor<int32, [4]> var_3785_begin_0 = const()[name = string("op_3785_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_3785_end_0 = const()[name = string("op_3785_end_0"), val = tensor<int32, [4]>([1, 8, 64, 1])];
+            tensor<bool, [4]> var_3785_end_mask_0 = const()[name = string("op_3785_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 8, 64, 1]> var_3785_cast_fp16 = slice_by_index(begin = var_3785_begin_0, end = var_3785_end_0, end_mask = var_3785_end_mask_0, x = mh_k_37_cast_fp16)[name = string("op_3785_cast_fp16")];
+            tensor<int32, [4]> var_3791_begin_0 = const()[name = string("op_3791_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_3791_end_0 = const()[name = string("op_3791_end_0"), val = tensor<int32, [4]>([1, 8, 128, 1])];
+            tensor<bool, [4]> var_3791_end_mask_0 = const()[name = string("op_3791_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 8, 64, 1]> var_3791_cast_fp16 = slice_by_index(begin = var_3791_begin_0, end = var_3791_end_0, end_mask = var_3791_end_mask_0, x = mh_k_37_cast_fp16)[name = string("op_3791_cast_fp16")];
+            fp16 const_227_promoted_to_fp16 = const()[name = string("const_227_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 64, 1]> var_3793_cast_fp16 = mul(x = var_3791_cast_fp16, y = const_227_promoted_to_fp16)[name = string("op_3793_cast_fp16")];
+            bool var_3795_interleave_0 = const()[name = string("op_3795_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 128, 1]> var_3795_cast_fp16 = concat(axis = var_3655, interleave = var_3795_interleave_0, values = (var_3793_cast_fp16, var_3785_cast_fp16))[name = string("op_3795_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> var_3796_cast_fp16 = mul(x = var_3795_cast_fp16, y = sin_1_cast_fp16)[name = string("op_3796_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> mh_k_39_cast_fp16 = add(x = var_3780_cast_fp16, y = var_3796_cast_fp16)[name = string("mh_k_39_cast_fp16")];
+            tensor<int32, [4]> var_3800 = const()[name = string("op_3800"), val = tensor<int32, [4]>([1, 1024, 1, 1])];
+            tensor<fp16, [1, 1024, 1, 1]> current_key_39_cast_fp16 = reshape(shape = var_3800, x = mh_k_39_cast_fp16)[name = string("current_key_39_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_3807_cast_fp16 = mul(x = var_101_cast_fp16_9, y = var_323_cast_fp16)[name = string("op_3807_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_3808_cast_fp16 = mul(x = current_key_39_cast_fp16, y = var_321_cast_fp16)[name = string("op_3808_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> key_57_cast_fp16 = add(x = var_3807_cast_fp16, y = var_3808_cast_fp16)[name = string("key_57_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_3811_cast_fp16 = mul(x = var_132_cast_fp16_9, y = var_323_cast_fp16)[name = string("op_3811_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_3812_cast_fp16 = mul(x = current_value_19_cast_fp16, y = var_321_cast_fp16)[name = string("op_3812_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> value_37_cast_fp16 = add(x = var_3811_cast_fp16, y = var_3812_cast_fp16)[name = string("value_37_cast_fp16")];
+            tensor<int32, [4]> var_3816 = const()[name = string("op_3816"), val = tensor<int32, [4]>([1, 8, 128, 256])];
+            tensor<fp16, [1, 8, 128, 256]> key_heads_37_cast_fp16 = reshape(shape = var_3816, x = key_57_cast_fp16)[name = string("key_heads_37_cast_fp16")];
+            tensor<int32, [4]> var_3818 = const()[name = string("op_3818"), val = tensor<int32, [4]>([1, 8, 128, 256])];
+            tensor<fp16, [1, 8, 128, 256]> value_heads_37_cast_fp16 = reshape(shape = var_3818, x = value_37_cast_fp16)[name = string("value_heads_37_cast_fp16")];
+            tensor<int32, [4]> var_3821_begin_0 = const()[name = string("op_3821_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_3821_end_0 = const()[name = string("op_3821_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_3821_end_mask_0 = const()[name = string("op_3821_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_3821_cast_fp16 = slice_by_index(begin = var_3821_begin_0, end = var_3821_end_0, end_mask = var_3821_end_mask_0, x = key_heads_37_cast_fp16)[name = string("op_3821_cast_fp16")];
+            tensor<int32, [4]> var_3825_begin_0 = const()[name = string("op_3825_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_3825_end_0 = const()[name = string("op_3825_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_3825_end_mask_0 = const()[name = string("op_3825_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_3825_cast_fp16 = slice_by_index(begin = var_3825_begin_0, end = var_3825_end_0, end_mask = var_3825_end_mask_0, x = value_heads_37_cast_fp16)[name = string("op_3825_cast_fp16")];
+            tensor<int32, [4]> var_3837_begin_0 = const()[name = string("op_3837_begin_0"), val = tensor<int32, [4]>([0, 1, 0, 0])];
+            tensor<int32, [4]> var_3837_end_0 = const()[name = string("op_3837_end_0"), val = tensor<int32, [4]>([1, 2, 128, 256])];
+            tensor<bool, [4]> var_3837_end_mask_0 = const()[name = string("op_3837_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_3837_cast_fp16 = slice_by_index(begin = var_3837_begin_0, end = var_3837_end_0, end_mask = var_3837_end_mask_0, x = key_heads_37_cast_fp16)[name = string("op_3837_cast_fp16")];
+            tensor<int32, [4]> var_3841_begin_0 = const()[name = string("op_3841_begin_0"), val = tensor<int32, [4]>([0, 1, 0, 0])];
+            tensor<int32, [4]> var_3841_end_0 = const()[name = string("op_3841_end_0"), val = tensor<int32, [4]>([1, 2, 128, 256])];
+            tensor<bool, [4]> var_3841_end_mask_0 = const()[name = string("op_3841_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_3841_cast_fp16 = slice_by_index(begin = var_3841_begin_0, end = var_3841_end_0, end_mask = var_3841_end_mask_0, x = value_heads_37_cast_fp16)[name = string("op_3841_cast_fp16")];
+            tensor<int32, [4]> var_3853_begin_0 = const()[name = string("op_3853_begin_0"), val = tensor<int32, [4]>([0, 2, 0, 0])];
+            tensor<int32, [4]> var_3853_end_0 = const()[name = string("op_3853_end_0"), val = tensor<int32, [4]>([1, 3, 128, 256])];
+            tensor<bool, [4]> var_3853_end_mask_0 = const()[name = string("op_3853_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_3853_cast_fp16 = slice_by_index(begin = var_3853_begin_0, end = var_3853_end_0, end_mask = var_3853_end_mask_0, x = key_heads_37_cast_fp16)[name = string("op_3853_cast_fp16")];
+            tensor<int32, [4]> var_3857_begin_0 = const()[name = string("op_3857_begin_0"), val = tensor<int32, [4]>([0, 2, 0, 0])];
+            tensor<int32, [4]> var_3857_end_0 = const()[name = string("op_3857_end_0"), val = tensor<int32, [4]>([1, 3, 128, 256])];
+            tensor<bool, [4]> var_3857_end_mask_0 = const()[name = string("op_3857_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_3857_cast_fp16 = slice_by_index(begin = var_3857_begin_0, end = var_3857_end_0, end_mask = var_3857_end_mask_0, x = value_heads_37_cast_fp16)[name = string("op_3857_cast_fp16")];
+            tensor<int32, [4]> var_3869_begin_0 = const()[name = string("op_3869_begin_0"), val = tensor<int32, [4]>([0, 3, 0, 0])];
+            tensor<int32, [4]> var_3869_end_0 = const()[name = string("op_3869_end_0"), val = tensor<int32, [4]>([1, 4, 128, 256])];
+            tensor<bool, [4]> var_3869_end_mask_0 = const()[name = string("op_3869_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_3869_cast_fp16 = slice_by_index(begin = var_3869_begin_0, end = var_3869_end_0, end_mask = var_3869_end_mask_0, x = key_heads_37_cast_fp16)[name = string("op_3869_cast_fp16")];
+            tensor<int32, [4]> var_3873_begin_0 = const()[name = string("op_3873_begin_0"), val = tensor<int32, [4]>([0, 3, 0, 0])];
+            tensor<int32, [4]> var_3873_end_0 = const()[name = string("op_3873_end_0"), val = tensor<int32, [4]>([1, 4, 128, 256])];
+            tensor<bool, [4]> var_3873_end_mask_0 = const()[name = string("op_3873_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_3873_cast_fp16 = slice_by_index(begin = var_3873_begin_0, end = var_3873_end_0, end_mask = var_3873_end_mask_0, x = value_heads_37_cast_fp16)[name = string("op_3873_cast_fp16")];
+            tensor<int32, [4]> var_3885_begin_0 = const()[name = string("op_3885_begin_0"), val = tensor<int32, [4]>([0, 4, 0, 0])];
+            tensor<int32, [4]> var_3885_end_0 = const()[name = string("op_3885_end_0"), val = tensor<int32, [4]>([1, 5, 128, 256])];
+            tensor<bool, [4]> var_3885_end_mask_0 = const()[name = string("op_3885_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_3885_cast_fp16 = slice_by_index(begin = var_3885_begin_0, end = var_3885_end_0, end_mask = var_3885_end_mask_0, x = key_heads_37_cast_fp16)[name = string("op_3885_cast_fp16")];
+            tensor<int32, [4]> var_3889_begin_0 = const()[name = string("op_3889_begin_0"), val = tensor<int32, [4]>([0, 4, 0, 0])];
+            tensor<int32, [4]> var_3889_end_0 = const()[name = string("op_3889_end_0"), val = tensor<int32, [4]>([1, 5, 128, 256])];
+            tensor<bool, [4]> var_3889_end_mask_0 = const()[name = string("op_3889_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_3889_cast_fp16 = slice_by_index(begin = var_3889_begin_0, end = var_3889_end_0, end_mask = var_3889_end_mask_0, x = value_heads_37_cast_fp16)[name = string("op_3889_cast_fp16")];
+            tensor<int32, [4]> var_3901_begin_0 = const()[name = string("op_3901_begin_0"), val = tensor<int32, [4]>([0, 5, 0, 0])];
+            tensor<int32, [4]> var_3901_end_0 = const()[name = string("op_3901_end_0"), val = tensor<int32, [4]>([1, 6, 128, 256])];
+            tensor<bool, [4]> var_3901_end_mask_0 = const()[name = string("op_3901_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_3901_cast_fp16 = slice_by_index(begin = var_3901_begin_0, end = var_3901_end_0, end_mask = var_3901_end_mask_0, x = key_heads_37_cast_fp16)[name = string("op_3901_cast_fp16")];
+            tensor<int32, [4]> var_3905_begin_0 = const()[name = string("op_3905_begin_0"), val = tensor<int32, [4]>([0, 5, 0, 0])];
+            tensor<int32, [4]> var_3905_end_0 = const()[name = string("op_3905_end_0"), val = tensor<int32, [4]>([1, 6, 128, 256])];
+            tensor<bool, [4]> var_3905_end_mask_0 = const()[name = string("op_3905_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_3905_cast_fp16 = slice_by_index(begin = var_3905_begin_0, end = var_3905_end_0, end_mask = var_3905_end_mask_0, x = value_heads_37_cast_fp16)[name = string("op_3905_cast_fp16")];
+            tensor<int32, [4]> var_3917_begin_0 = const()[name = string("op_3917_begin_0"), val = tensor<int32, [4]>([0, 6, 0, 0])];
+            tensor<int32, [4]> var_3917_end_0 = const()[name = string("op_3917_end_0"), val = tensor<int32, [4]>([1, 7, 128, 256])];
+            tensor<bool, [4]> var_3917_end_mask_0 = const()[name = string("op_3917_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_3917_cast_fp16 = slice_by_index(begin = var_3917_begin_0, end = var_3917_end_0, end_mask = var_3917_end_mask_0, x = key_heads_37_cast_fp16)[name = string("op_3917_cast_fp16")];
+            tensor<int32, [4]> var_3921_begin_0 = const()[name = string("op_3921_begin_0"), val = tensor<int32, [4]>([0, 6, 0, 0])];
+            tensor<int32, [4]> var_3921_end_0 = const()[name = string("op_3921_end_0"), val = tensor<int32, [4]>([1, 7, 128, 256])];
+            tensor<bool, [4]> var_3921_end_mask_0 = const()[name = string("op_3921_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_3921_cast_fp16 = slice_by_index(begin = var_3921_begin_0, end = var_3921_end_0, end_mask = var_3921_end_mask_0, x = value_heads_37_cast_fp16)[name = string("op_3921_cast_fp16")];
+            tensor<int32, [4]> var_3933_begin_0 = const()[name = string("op_3933_begin_0"), val = tensor<int32, [4]>([0, 7, 0, 0])];
+            tensor<int32, [4]> var_3933_end_0 = const()[name = string("op_3933_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_3933_end_mask_0 = const()[name = string("op_3933_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_3933_cast_fp16 = slice_by_index(begin = var_3933_begin_0, end = var_3933_end_0, end_mask = var_3933_end_mask_0, x = key_heads_37_cast_fp16)[name = string("op_3933_cast_fp16")];
+            tensor<int32, [4]> var_3937_begin_0 = const()[name = string("op_3937_begin_0"), val = tensor<int32, [4]>([0, 7, 0, 0])];
+            tensor<int32, [4]> var_3937_end_0 = const()[name = string("op_3937_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_3937_end_mask_0 = const()[name = string("op_3937_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_3937_cast_fp16 = slice_by_index(begin = var_3937_begin_0, end = var_3937_end_0, end_mask = var_3937_end_mask_0, x = value_heads_37_cast_fp16)[name = string("op_3937_cast_fp16")];
+            bool key_heads_39_interleave_0 = const()[name = string("key_heads_39_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 256]> key_heads_39_cast_fp16 = concat(axis = var_3663, interleave = key_heads_39_interleave_0, values = (var_3821_cast_fp16, var_3821_cast_fp16, var_3837_cast_fp16, var_3837_cast_fp16, var_3853_cast_fp16, var_3853_cast_fp16, var_3869_cast_fp16, var_3869_cast_fp16, var_3885_cast_fp16, var_3885_cast_fp16, var_3901_cast_fp16, var_3901_cast_fp16, var_3917_cast_fp16, var_3917_cast_fp16, var_3933_cast_fp16, var_3933_cast_fp16))[name = string("key_heads_39_cast_fp16")];
+            bool value_heads_39_interleave_0 = const()[name = string("value_heads_39_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 256]> value_heads_39_cast_fp16 = concat(axis = var_3663, interleave = value_heads_39_interleave_0, values = (var_3825_cast_fp16, var_3825_cast_fp16, var_3841_cast_fp16, var_3841_cast_fp16, var_3857_cast_fp16, var_3857_cast_fp16, var_3873_cast_fp16, var_3873_cast_fp16, var_3889_cast_fp16, var_3889_cast_fp16, var_3905_cast_fp16, var_3905_cast_fp16, var_3921_cast_fp16, var_3921_cast_fp16, var_3937_cast_fp16, var_3937_cast_fp16))[name = string("value_heads_39_cast_fp16")];
+            fp16 var_3960_to_fp16 = const()[name = string("op_3960_to_fp16"), val = fp16(0x1.6ap-4)];
+            tensor<fp16, [1, 16, 128, 1]> var_3961_cast_fp16 = mul(x = mh_q_57_cast_fp16, y = var_3960_to_fp16)[name = string("op_3961_cast_fp16")];
+            bool mh_w_37_transpose_x_0 = const()[name = string("mh_w_37_transpose_x_0"), val = bool(true)];
+            bool mh_w_37_transpose_y_0 = const()[name = string("mh_w_37_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 1, 256]> mh_w_37_cast_fp16 = matmul(transpose_x = mh_w_37_transpose_x_0, transpose_y = mh_w_37_transpose_y_0, x = var_3961_cast_fp16, y = key_heads_39_cast_fp16)[name = string("mh_w_37_cast_fp16")];
+            tensor<fp16, [1, 16, 1, 256]> mh_w_39_cast_fp16 = add(x = mh_w_37_cast_fp16, y = var_487_cast_fp16)[name = string("mh_w_39_cast_fp16")];
+            tensor<fp16, [1, 16, 1, 256]> var_3973_cast_fp16 = softmax(axis = var_3645, x = mh_w_39_cast_fp16)[name = string("op_3973_cast_fp16")];
+            bool attn_19_transpose_x_0 = const()[name = string("attn_19_transpose_x_0"), val = bool(false)];
+            bool attn_19_transpose_y_0 = const()[name = string("attn_19_transpose_y_0"), val = bool(true)];
+            tensor<fp16, [1, 16, 128, 1]> attn_19_cast_fp16 = matmul(transpose_x = attn_19_transpose_x_0, transpose_y = attn_19_transpose_y_0, x = value_heads_39_cast_fp16, y = var_3973_cast_fp16)[name = string("attn_19_cast_fp16")];
+            tensor<int32, [4]> var_3978 = const()[name = string("op_3978"), val = tensor<int32, [4]>([1, -1, 1, 1])];
+            tensor<fp16, [1, 2048, 1, 1]> input_73_cast_fp16 = reshape(shape = var_3978, x = attn_19_cast_fp16)[name = string("input_73_cast_fp16")];
+            string obj_83_pad_type_0 = const()[name = string("obj_83_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> obj_83_strides_0 = const()[name = string("obj_83_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> obj_83_pad_0 = const()[name = string("obj_83_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> obj_83_dilations_0 = const()[name = string("obj_83_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 obj_83_groups_0 = const()[name = string("obj_83_groups_0"), val = int32(1)];
+            tensor<fp16, [2048, 2048, 1, 1]> layers_9_self_attn_o_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [2048, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(461574656))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(465769024))))[name = string("layers_9_self_attn_o_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1, 1]> obj_83_cast_fp16 = conv(bias = layers_0_self_attn_q_proj_bias_to_fp16, dilations = obj_83_dilations_0, groups = obj_83_groups_0, pad = obj_83_pad_0, pad_type = obj_83_pad_type_0, strides = obj_83_strides_0, weight = layers_9_self_attn_o_proj_weight_to_fp16_palettized, x = input_73_cast_fp16)[name = string("obj_83_cast_fp16")];
+            tensor<fp16, [1, 2048, 1, 1]> inputs_77_cast_fp16 = add(x = inputs_71_cast_fp16, y = obj_83_cast_fp16)[name = string("inputs_77_cast_fp16")];
+            tensor<fp16, [1, 2048, 1, 1]> inputs_sq_79_cast_fp16 = mul(x = inputs_77_cast_fp16, y = inputs_77_cast_fp16)[name = string("inputs_sq_79_cast_fp16")];
+            tensor<int32, [1]> variance_79_axes_0 = const()[name = string("variance_79_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_79_keep_dims_0 = const()[name = string("variance_79_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 1, 1, 1]> variance_79_cast_fp16 = reduce_mean(axes = variance_79_axes_0, keep_dims = variance_79_keep_dims_0, x = inputs_sq_79_cast_fp16)[name = string("variance_79_cast_fp16")];
+            fp16 var_3996_to_fp16 = const()[name = string("op_3996_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 1, 1]> var_3997_cast_fp16 = add(x = variance_79_cast_fp16, y = var_3996_to_fp16)[name = string("op_3997_cast_fp16")];
+            fp32 var_3998_epsilon_0 = const()[name = string("op_3998_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 1, 1, 1]> var_3998_cast_fp16 = rsqrt(epsilon = var_3998_epsilon_0, x = var_3997_cast_fp16)[name = string("op_3998_cast_fp16")];
+            tensor<fp16, [1, 2048, 1, 1]> hidden_states_97_cast_fp16 = mul(x = inputs_77_cast_fp16, y = var_3998_cast_fp16)[name = string("hidden_states_97_cast_fp16")];
+            tensor<fp16, [1, 2048, 1, 1]> w_79_to_fp16 = const()[name = string("w_79_to_fp16"), val = tensor<fp16, [1, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(465769600)))];
+            tensor<fp16, [1, 2048, 1, 1]> input_75_cast_fp16 = mul(x = w_79_to_fp16, y = hidden_states_97_cast_fp16)[name = string("input_75_cast_fp16")];
+            string input_77_pad_type_0 = const()[name = string("input_77_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> input_77_strides_0 = const()[name = string("input_77_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> input_77_pad_0 = const()[name = string("input_77_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> input_77_dilations_0 = const()[name = string("input_77_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 input_77_groups_0 = const()[name = string("input_77_groups_0"), val = int32(1)];
+            tensor<fp16, [6144, 2048, 1, 1]> layers_9_mlp_gate_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [6144, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(465773760))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(478356736))))[name = string("layers_9_mlp_gate_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 6144, 1, 1]> input_77_cast_fp16 = conv(dilations = input_77_dilations_0, groups = input_77_groups_0, pad = input_77_pad_0, pad_type = input_77_pad_type_0, strides = input_77_strides_0, weight = layers_9_mlp_gate_proj_weight_to_fp16_palettized, x = input_75_cast_fp16)[name = string("input_77_cast_fp16")];
+            tensor<fp16, [1, 6144, 1, 1]> var_4012_cast_fp16 = silu(x = input_77_cast_fp16)[name = string("op_4012_cast_fp16")];
+            string var_4018_pad_type_0 = const()[name = string("op_4018_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_4018_strides_0 = const()[name = string("op_4018_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_4018_pad_0 = const()[name = string("op_4018_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_4018_dilations_0 = const()[name = string("op_4018_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_4018_groups_0 = const()[name = string("op_4018_groups_0"), val = int32(1)];
+            tensor<fp16, [6144, 2048, 1, 1]> layers_9_mlp_up_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [6144, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(478357312))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(490940288))))[name = string("layers_9_mlp_up_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 6144, 1, 1]> var_4018_cast_fp16 = conv(dilations = var_4018_dilations_0, groups = var_4018_groups_0, pad = var_4018_pad_0, pad_type = var_4018_pad_type_0, strides = var_4018_strides_0, weight = layers_9_mlp_up_proj_weight_to_fp16_palettized, x = input_75_cast_fp16)[name = string("op_4018_cast_fp16")];
+            tensor<fp16, [1, 6144, 1, 1]> input_79_cast_fp16 = mul(x = var_4012_cast_fp16, y = var_4018_cast_fp16)[name = string("input_79_cast_fp16")];
+            string hidden_states_99_pad_type_0 = const()[name = string("hidden_states_99_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> hidden_states_99_strides_0 = const()[name = string("hidden_states_99_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> hidden_states_99_pad_0 = const()[name = string("hidden_states_99_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> hidden_states_99_dilations_0 = const()[name = string("hidden_states_99_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 hidden_states_99_groups_0 = const()[name = string("hidden_states_99_groups_0"), val = int32(1)];
+            tensor<fp16, [2048, 6144, 1, 1]> layers_9_mlp_down_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [2048, 6144, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(490940864))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(503523840))))[name = string("layers_9_mlp_down_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1, 1]> hidden_states_99_cast_fp16 = conv(dilations = hidden_states_99_dilations_0, groups = hidden_states_99_groups_0, pad = hidden_states_99_pad_0, pad_type = hidden_states_99_pad_type_0, strides = hidden_states_99_strides_0, weight = layers_9_mlp_down_proj_weight_to_fp16_palettized, x = input_79_cast_fp16)[name = string("hidden_states_99_cast_fp16")];
+            tensor<fp16, [1, 2048, 1, 1]> inputs_79_cast_fp16 = add(x = inputs_77_cast_fp16, y = hidden_states_99_cast_fp16)[name = string("inputs_79_cast_fp16")];
+            int32 var_4032 = const()[name = string("op_4032"), val = int32(3)];
+            int32 var_4042 = const()[name = string("op_4042"), val = int32(-2)];
+            int32 var_4050 = const()[name = string("op_4050"), val = int32(1)];
+            tensor<fp16, [1, 2048, 1, 1]> inputs_sq_81_cast_fp16 = mul(x = inputs_79_cast_fp16, y = inputs_79_cast_fp16)[name = string("inputs_sq_81_cast_fp16")];
+            tensor<int32, [1]> variance_81_axes_0 = const()[name = string("variance_81_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_81_keep_dims_0 = const()[name = string("variance_81_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 1, 1, 1]> variance_81_cast_fp16 = reduce_mean(axes = variance_81_axes_0, keep_dims = variance_81_keep_dims_0, x = inputs_sq_81_cast_fp16)[name = string("variance_81_cast_fp16")];
+            fp16 var_4062_to_fp16 = const()[name = string("op_4062_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 1, 1]> var_4063_cast_fp16 = add(x = variance_81_cast_fp16, y = var_4062_to_fp16)[name = string("op_4063_cast_fp16")];
+            fp32 var_4064_epsilon_0 = const()[name = string("op_4064_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 1, 1, 1]> var_4064_cast_fp16 = rsqrt(epsilon = var_4064_epsilon_0, x = var_4063_cast_fp16)[name = string("op_4064_cast_fp16")];
+            tensor<fp16, [1, 2048, 1, 1]> hidden_states_101_cast_fp16 = mul(x = inputs_79_cast_fp16, y = var_4064_cast_fp16)[name = string("hidden_states_101_cast_fp16")];
+            tensor<fp16, [1, 2048, 1, 1]> w_81_to_fp16 = const()[name = string("w_81_to_fp16"), val = tensor<fp16, [1, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(503524416)))];
+            tensor<fp16, [1, 2048, 1, 1]> obj_85_cast_fp16 = mul(x = w_81_to_fp16, y = hidden_states_101_cast_fp16)[name = string("obj_85_cast_fp16")];
+            string query_61_pad_type_0 = const()[name = string("query_61_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> query_61_strides_0 = const()[name = string("query_61_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> query_61_pad_0 = const()[name = string("query_61_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> query_61_dilations_0 = const()[name = string("query_61_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 query_61_groups_0 = const()[name = string("query_61_groups_0"), val = int32(1)];
+            tensor<fp16, [2048, 2048, 1, 1]> layers_10_self_attn_q_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [2048, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(503528576))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(507722944))))[name = string("layers_10_self_attn_q_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1, 1]> query_61_cast_fp16 = conv(bias = layers_0_self_attn_q_proj_bias_to_fp16, dilations = query_61_dilations_0, groups = query_61_groups_0, pad = query_61_pad_0, pad_type = query_61_pad_type_0, strides = query_61_strides_0, weight = layers_10_self_attn_q_proj_weight_to_fp16_palettized, x = obj_85_cast_fp16)[name = string("query_61_cast_fp16")];
+            string current_key_41_pad_type_0 = const()[name = string("current_key_41_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> current_key_41_strides_0 = const()[name = string("current_key_41_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> current_key_41_pad_0 = const()[name = string("current_key_41_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> current_key_41_dilations_0 = const()[name = string("current_key_41_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 current_key_41_groups_0 = const()[name = string("current_key_41_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 2048, 1, 1]> layers_10_self_attn_k_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(507723520))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(509820736))))[name = string("layers_10_self_attn_k_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> current_key_41_cast_fp16 = conv(dilations = current_key_41_dilations_0, groups = current_key_41_groups_0, pad = current_key_41_pad_0, pad_type = current_key_41_pad_type_0, strides = current_key_41_strides_0, weight = layers_10_self_attn_k_proj_weight_to_fp16_palettized, x = obj_85_cast_fp16)[name = string("current_key_41_cast_fp16")];
+            string current_value_21_pad_type_0 = const()[name = string("current_value_21_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> current_value_21_strides_0 = const()[name = string("current_value_21_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> current_value_21_pad_0 = const()[name = string("current_value_21_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> current_value_21_dilations_0 = const()[name = string("current_value_21_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 current_value_21_groups_0 = const()[name = string("current_value_21_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 2048, 1, 1]> layers_10_self_attn_v_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(509821312))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(511918528))))[name = string("layers_10_self_attn_v_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> current_value_21_cast_fp16 = conv(bias = layers_0_self_attn_v_proj_bias_to_fp16, dilations = current_value_21_dilations_0, groups = current_value_21_groups_0, pad = current_value_21_pad_0, pad_type = current_value_21_pad_type_0, strides = current_value_21_strides_0, weight = layers_10_self_attn_v_proj_weight_to_fp16_palettized, x = obj_85_cast_fp16)[name = string("current_value_21_cast_fp16")];
+            tensor<int32, [4]> var_4101 = const()[name = string("op_4101"), val = tensor<int32, [4]>([16, 128, 1, 1])];
+            tensor<fp16, [16, 128, 1, 1]> inputs_81_cast_fp16 = reshape(shape = var_4101, x = query_61_cast_fp16)[name = string("inputs_81_cast_fp16")];
+            tensor<fp16, [16, 128, 1, 1]> inputs_sq_83_cast_fp16 = mul(x = inputs_81_cast_fp16, y = inputs_81_cast_fp16)[name = string("inputs_sq_83_cast_fp16")];
+            tensor<int32, [1]> variance_83_axes_0 = const()[name = string("variance_83_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_83_keep_dims_0 = const()[name = string("variance_83_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [16, 1, 1, 1]> variance_83_cast_fp16 = reduce_mean(axes = variance_83_axes_0, keep_dims = variance_83_keep_dims_0, x = inputs_sq_83_cast_fp16)[name = string("variance_83_cast_fp16")];
+            fp16 var_4107_to_fp16 = const()[name = string("op_4107_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [16, 1, 1, 1]> var_4108_cast_fp16 = add(x = variance_83_cast_fp16, y = var_4107_to_fp16)[name = string("op_4108_cast_fp16")];
+            fp32 var_4109_epsilon_0 = const()[name = string("op_4109_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [16, 1, 1, 1]> var_4109_cast_fp16 = rsqrt(epsilon = var_4109_epsilon_0, x = var_4108_cast_fp16)[name = string("op_4109_cast_fp16")];
+            tensor<fp16, [16, 128, 1, 1]> hidden_states_103_cast_fp16 = mul(x = inputs_81_cast_fp16, y = var_4109_cast_fp16)[name = string("hidden_states_103_cast_fp16")];
+            tensor<fp16, [1, 128, 1, 1]> w_83_to_fp16 = const()[name = string("w_83_to_fp16"), val = tensor<fp16, [1, 128, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(511919104)))];
+            tensor<fp16, [16, 128, 1, 1]> query_normed_21_cast_fp16 = mul(x = w_83_to_fp16, y = hidden_states_103_cast_fp16)[name = string("query_normed_21_cast_fp16")];
+            tensor<int32, [4]> var_4117 = const()[name = string("op_4117"), val = tensor<int32, [4]>([8, 128, 1, 1])];
+            tensor<fp16, [8, 128, 1, 1]> inputs_83_cast_fp16 = reshape(shape = var_4117, x = current_key_41_cast_fp16)[name = string("inputs_83_cast_fp16")];
+            tensor<fp16, [8, 128, 1, 1]> inputs_sq_85_cast_fp16 = mul(x = inputs_83_cast_fp16, y = inputs_83_cast_fp16)[name = string("inputs_sq_85_cast_fp16")];
+            tensor<int32, [1]> variance_85_axes_0 = const()[name = string("variance_85_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_85_keep_dims_0 = const()[name = string("variance_85_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [8, 1, 1, 1]> variance_85_cast_fp16 = reduce_mean(axes = variance_85_axes_0, keep_dims = variance_85_keep_dims_0, x = inputs_sq_85_cast_fp16)[name = string("variance_85_cast_fp16")];
+            fp16 var_4123_to_fp16 = const()[name = string("op_4123_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [8, 1, 1, 1]> var_4124_cast_fp16 = add(x = variance_85_cast_fp16, y = var_4123_to_fp16)[name = string("op_4124_cast_fp16")];
+            fp32 var_4125_epsilon_0 = const()[name = string("op_4125_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [8, 1, 1, 1]> var_4125_cast_fp16 = rsqrt(epsilon = var_4125_epsilon_0, x = var_4124_cast_fp16)[name = string("op_4125_cast_fp16")];
+            tensor<fp16, [8, 128, 1, 1]> hidden_states_105_cast_fp16 = mul(x = inputs_83_cast_fp16, y = var_4125_cast_fp16)[name = string("hidden_states_105_cast_fp16")];
+            tensor<fp16, [1, 128, 1, 1]> w_85_to_fp16 = const()[name = string("w_85_to_fp16"), val = tensor<fp16, [1, 128, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(511919424)))];
+            tensor<fp16, [8, 128, 1, 1]> current_key_normed_21_cast_fp16 = mul(x = w_85_to_fp16, y = hidden_states_105_cast_fp16)[name = string("current_key_normed_21_cast_fp16")];
+            tensor<int32, [4]> var_4143 = const()[name = string("op_4143"), val = tensor<int32, [4]>([1, 16, 128, -1])];
+            tensor<fp16, [1, 16, 128, 1]> mh_q_61_cast_fp16 = reshape(shape = var_4143, x = query_normed_21_cast_fp16)[name = string("mh_q_61_cast_fp16")];
+            tensor<int32, [4]> var_4145 = const()[name = string("op_4145"), val = tensor<int32, [4]>([1, 8, 128, -1])];
+            tensor<fp16, [1, 8, 128, 1]> mh_k_41_cast_fp16 = reshape(shape = var_4145, x = current_key_normed_21_cast_fp16)[name = string("mh_k_41_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> var_4149_cast_fp16 = mul(x = mh_q_61_cast_fp16, y = cos_1_cast_fp16)[name = string("op_4149_cast_fp16")];
+            tensor<int32, [4]> var_4154_begin_0 = const()[name = string("op_4154_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_4154_end_0 = const()[name = string("op_4154_end_0"), val = tensor<int32, [4]>([1, 16, 64, 1])];
+            tensor<bool, [4]> var_4154_end_mask_0 = const()[name = string("op_4154_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 16, 64, 1]> var_4154_cast_fp16 = slice_by_index(begin = var_4154_begin_0, end = var_4154_end_0, end_mask = var_4154_end_mask_0, x = mh_q_61_cast_fp16)[name = string("op_4154_cast_fp16")];
+            tensor<int32, [4]> var_4160_begin_0 = const()[name = string("op_4160_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_4160_end_0 = const()[name = string("op_4160_end_0"), val = tensor<int32, [4]>([1, 16, 128, 1])];
+            tensor<bool, [4]> var_4160_end_mask_0 = const()[name = string("op_4160_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 16, 64, 1]> var_4160_cast_fp16 = slice_by_index(begin = var_4160_begin_0, end = var_4160_end_0, end_mask = var_4160_end_mask_0, x = mh_q_61_cast_fp16)[name = string("op_4160_cast_fp16")];
+            fp16 const_247_promoted_to_fp16 = const()[name = string("const_247_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 16, 64, 1]> var_4162_cast_fp16 = mul(x = var_4160_cast_fp16, y = const_247_promoted_to_fp16)[name = string("op_4162_cast_fp16")];
+            bool var_4164_interleave_0 = const()[name = string("op_4164_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 1]> var_4164_cast_fp16 = concat(axis = var_4042, interleave = var_4164_interleave_0, values = (var_4162_cast_fp16, var_4154_cast_fp16))[name = string("op_4164_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> var_4165_cast_fp16 = mul(x = var_4164_cast_fp16, y = sin_1_cast_fp16)[name = string("op_4165_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> mh_q_63_cast_fp16 = add(x = var_4149_cast_fp16, y = var_4165_cast_fp16)[name = string("mh_q_63_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> var_4167_cast_fp16 = mul(x = mh_k_41_cast_fp16, y = cos_1_cast_fp16)[name = string("op_4167_cast_fp16")];
+            tensor<int32, [4]> var_4172_begin_0 = const()[name = string("op_4172_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_4172_end_0 = const()[name = string("op_4172_end_0"), val = tensor<int32, [4]>([1, 8, 64, 1])];
+            tensor<bool, [4]> var_4172_end_mask_0 = const()[name = string("op_4172_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 8, 64, 1]> var_4172_cast_fp16 = slice_by_index(begin = var_4172_begin_0, end = var_4172_end_0, end_mask = var_4172_end_mask_0, x = mh_k_41_cast_fp16)[name = string("op_4172_cast_fp16")];
+            tensor<int32, [4]> var_4178_begin_0 = const()[name = string("op_4178_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_4178_end_0 = const()[name = string("op_4178_end_0"), val = tensor<int32, [4]>([1, 8, 128, 1])];
+            tensor<bool, [4]> var_4178_end_mask_0 = const()[name = string("op_4178_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 8, 64, 1]> var_4178_cast_fp16 = slice_by_index(begin = var_4178_begin_0, end = var_4178_end_0, end_mask = var_4178_end_mask_0, x = mh_k_41_cast_fp16)[name = string("op_4178_cast_fp16")];
+            fp16 const_250_promoted_to_fp16 = const()[name = string("const_250_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 64, 1]> var_4180_cast_fp16 = mul(x = var_4178_cast_fp16, y = const_250_promoted_to_fp16)[name = string("op_4180_cast_fp16")];
+            bool var_4182_interleave_0 = const()[name = string("op_4182_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 128, 1]> var_4182_cast_fp16 = concat(axis = var_4042, interleave = var_4182_interleave_0, values = (var_4180_cast_fp16, var_4172_cast_fp16))[name = string("op_4182_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> var_4183_cast_fp16 = mul(x = var_4182_cast_fp16, y = sin_1_cast_fp16)[name = string("op_4183_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> mh_k_43_cast_fp16 = add(x = var_4167_cast_fp16, y = var_4183_cast_fp16)[name = string("mh_k_43_cast_fp16")];
+            tensor<int32, [4]> var_4187 = const()[name = string("op_4187"), val = tensor<int32, [4]>([1, 1024, 1, 1])];
+            tensor<fp16, [1, 1024, 1, 1]> current_key_43_cast_fp16 = reshape(shape = var_4187, x = mh_k_43_cast_fp16)[name = string("current_key_43_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_4194_cast_fp16 = mul(x = var_101_cast_fp16_10, y = var_323_cast_fp16)[name = string("op_4194_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_4195_cast_fp16 = mul(x = current_key_43_cast_fp16, y = var_321_cast_fp16)[name = string("op_4195_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> key_63_cast_fp16 = add(x = var_4194_cast_fp16, y = var_4195_cast_fp16)[name = string("key_63_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_4198_cast_fp16 = mul(x = var_132_cast_fp16_10, y = var_323_cast_fp16)[name = string("op_4198_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_4199_cast_fp16 = mul(x = current_value_21_cast_fp16, y = var_321_cast_fp16)[name = string("op_4199_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> value_41_cast_fp16 = add(x = var_4198_cast_fp16, y = var_4199_cast_fp16)[name = string("value_41_cast_fp16")];
+            tensor<int32, [4]> var_4203 = const()[name = string("op_4203"), val = tensor<int32, [4]>([1, 8, 128, 256])];
+            tensor<fp16, [1, 8, 128, 256]> key_heads_41_cast_fp16 = reshape(shape = var_4203, x = key_63_cast_fp16)[name = string("key_heads_41_cast_fp16")];
+            tensor<int32, [4]> var_4205 = const()[name = string("op_4205"), val = tensor<int32, [4]>([1, 8, 128, 256])];
+            tensor<fp16, [1, 8, 128, 256]> value_heads_41_cast_fp16 = reshape(shape = var_4205, x = value_41_cast_fp16)[name = string("value_heads_41_cast_fp16")];
+            tensor<int32, [4]> var_4208_begin_0 = const()[name = string("op_4208_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_4208_end_0 = const()[name = string("op_4208_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_4208_end_mask_0 = const()[name = string("op_4208_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_4208_cast_fp16 = slice_by_index(begin = var_4208_begin_0, end = var_4208_end_0, end_mask = var_4208_end_mask_0, x = key_heads_41_cast_fp16)[name = string("op_4208_cast_fp16")];
+            tensor<int32, [4]> var_4212_begin_0 = const()[name = string("op_4212_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_4212_end_0 = const()[name = string("op_4212_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_4212_end_mask_0 = const()[name = string("op_4212_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_4212_cast_fp16 = slice_by_index(begin = var_4212_begin_0, end = var_4212_end_0, end_mask = var_4212_end_mask_0, x = value_heads_41_cast_fp16)[name = string("op_4212_cast_fp16")];
+            tensor<int32, [4]> var_4224_begin_0 = const()[name = string("op_4224_begin_0"), val = tensor<int32, [4]>([0, 1, 0, 0])];
+            tensor<int32, [4]> var_4224_end_0 = const()[name = string("op_4224_end_0"), val = tensor<int32, [4]>([1, 2, 128, 256])];
+            tensor<bool, [4]> var_4224_end_mask_0 = const()[name = string("op_4224_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_4224_cast_fp16 = slice_by_index(begin = var_4224_begin_0, end = var_4224_end_0, end_mask = var_4224_end_mask_0, x = key_heads_41_cast_fp16)[name = string("op_4224_cast_fp16")];
+            tensor<int32, [4]> var_4228_begin_0 = const()[name = string("op_4228_begin_0"), val = tensor<int32, [4]>([0, 1, 0, 0])];
+            tensor<int32, [4]> var_4228_end_0 = const()[name = string("op_4228_end_0"), val = tensor<int32, [4]>([1, 2, 128, 256])];
+            tensor<bool, [4]> var_4228_end_mask_0 = const()[name = string("op_4228_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_4228_cast_fp16 = slice_by_index(begin = var_4228_begin_0, end = var_4228_end_0, end_mask = var_4228_end_mask_0, x = value_heads_41_cast_fp16)[name = string("op_4228_cast_fp16")];
+            tensor<int32, [4]> var_4240_begin_0 = const()[name = string("op_4240_begin_0"), val = tensor<int32, [4]>([0, 2, 0, 0])];
+            tensor<int32, [4]> var_4240_end_0 = const()[name = string("op_4240_end_0"), val = tensor<int32, [4]>([1, 3, 128, 256])];
+            tensor<bool, [4]> var_4240_end_mask_0 = const()[name = string("op_4240_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_4240_cast_fp16 = slice_by_index(begin = var_4240_begin_0, end = var_4240_end_0, end_mask = var_4240_end_mask_0, x = key_heads_41_cast_fp16)[name = string("op_4240_cast_fp16")];
+            tensor<int32, [4]> var_4244_begin_0 = const()[name = string("op_4244_begin_0"), val = tensor<int32, [4]>([0, 2, 0, 0])];
+            tensor<int32, [4]> var_4244_end_0 = const()[name = string("op_4244_end_0"), val = tensor<int32, [4]>([1, 3, 128, 256])];
+            tensor<bool, [4]> var_4244_end_mask_0 = const()[name = string("op_4244_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_4244_cast_fp16 = slice_by_index(begin = var_4244_begin_0, end = var_4244_end_0, end_mask = var_4244_end_mask_0, x = value_heads_41_cast_fp16)[name = string("op_4244_cast_fp16")];
+            tensor<int32, [4]> var_4256_begin_0 = const()[name = string("op_4256_begin_0"), val = tensor<int32, [4]>([0, 3, 0, 0])];
+            tensor<int32, [4]> var_4256_end_0 = const()[name = string("op_4256_end_0"), val = tensor<int32, [4]>([1, 4, 128, 256])];
+            tensor<bool, [4]> var_4256_end_mask_0 = const()[name = string("op_4256_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_4256_cast_fp16 = slice_by_index(begin = var_4256_begin_0, end = var_4256_end_0, end_mask = var_4256_end_mask_0, x = key_heads_41_cast_fp16)[name = string("op_4256_cast_fp16")];
+            tensor<int32, [4]> var_4260_begin_0 = const()[name = string("op_4260_begin_0"), val = tensor<int32, [4]>([0, 3, 0, 0])];
+            tensor<int32, [4]> var_4260_end_0 = const()[name = string("op_4260_end_0"), val = tensor<int32, [4]>([1, 4, 128, 256])];
+            tensor<bool, [4]> var_4260_end_mask_0 = const()[name = string("op_4260_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_4260_cast_fp16 = slice_by_index(begin = var_4260_begin_0, end = var_4260_end_0, end_mask = var_4260_end_mask_0, x = value_heads_41_cast_fp16)[name = string("op_4260_cast_fp16")];
+            tensor<int32, [4]> var_4272_begin_0 = const()[name = string("op_4272_begin_0"), val = tensor<int32, [4]>([0, 4, 0, 0])];
+            tensor<int32, [4]> var_4272_end_0 = const()[name = string("op_4272_end_0"), val = tensor<int32, [4]>([1, 5, 128, 256])];
+            tensor<bool, [4]> var_4272_end_mask_0 = const()[name = string("op_4272_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_4272_cast_fp16 = slice_by_index(begin = var_4272_begin_0, end = var_4272_end_0, end_mask = var_4272_end_mask_0, x = key_heads_41_cast_fp16)[name = string("op_4272_cast_fp16")];
+            tensor<int32, [4]> var_4276_begin_0 = const()[name = string("op_4276_begin_0"), val = tensor<int32, [4]>([0, 4, 0, 0])];
+            tensor<int32, [4]> var_4276_end_0 = const()[name = string("op_4276_end_0"), val = tensor<int32, [4]>([1, 5, 128, 256])];
+            tensor<bool, [4]> var_4276_end_mask_0 = const()[name = string("op_4276_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_4276_cast_fp16 = slice_by_index(begin = var_4276_begin_0, end = var_4276_end_0, end_mask = var_4276_end_mask_0, x = value_heads_41_cast_fp16)[name = string("op_4276_cast_fp16")];
+            tensor<int32, [4]> var_4288_begin_0 = const()[name = string("op_4288_begin_0"), val = tensor<int32, [4]>([0, 5, 0, 0])];
+            tensor<int32, [4]> var_4288_end_0 = const()[name = string("op_4288_end_0"), val = tensor<int32, [4]>([1, 6, 128, 256])];
+            tensor<bool, [4]> var_4288_end_mask_0 = const()[name = string("op_4288_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_4288_cast_fp16 = slice_by_index(begin = var_4288_begin_0, end = var_4288_end_0, end_mask = var_4288_end_mask_0, x = key_heads_41_cast_fp16)[name = string("op_4288_cast_fp16")];
+            tensor<int32, [4]> var_4292_begin_0 = const()[name = string("op_4292_begin_0"), val = tensor<int32, [4]>([0, 5, 0, 0])];
+            tensor<int32, [4]> var_4292_end_0 = const()[name = string("op_4292_end_0"), val = tensor<int32, [4]>([1, 6, 128, 256])];
+            tensor<bool, [4]> var_4292_end_mask_0 = const()[name = string("op_4292_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_4292_cast_fp16 = slice_by_index(begin = var_4292_begin_0, end = var_4292_end_0, end_mask = var_4292_end_mask_0, x = value_heads_41_cast_fp16)[name = string("op_4292_cast_fp16")];
+            tensor<int32, [4]> var_4304_begin_0 = const()[name = string("op_4304_begin_0"), val = tensor<int32, [4]>([0, 6, 0, 0])];
+            tensor<int32, [4]> var_4304_end_0 = const()[name = string("op_4304_end_0"), val = tensor<int32, [4]>([1, 7, 128, 256])];
+            tensor<bool, [4]> var_4304_end_mask_0 = const()[name = string("op_4304_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_4304_cast_fp16 = slice_by_index(begin = var_4304_begin_0, end = var_4304_end_0, end_mask = var_4304_end_mask_0, x = key_heads_41_cast_fp16)[name = string("op_4304_cast_fp16")];
+            tensor<int32, [4]> var_4308_begin_0 = const()[name = string("op_4308_begin_0"), val = tensor<int32, [4]>([0, 6, 0, 0])];
+            tensor<int32, [4]> var_4308_end_0 = const()[name = string("op_4308_end_0"), val = tensor<int32, [4]>([1, 7, 128, 256])];
+            tensor<bool, [4]> var_4308_end_mask_0 = const()[name = string("op_4308_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_4308_cast_fp16 = slice_by_index(begin = var_4308_begin_0, end = var_4308_end_0, end_mask = var_4308_end_mask_0, x = value_heads_41_cast_fp16)[name = string("op_4308_cast_fp16")];
+            tensor<int32, [4]> var_4320_begin_0 = const()[name = string("op_4320_begin_0"), val = tensor<int32, [4]>([0, 7, 0, 0])];
+            tensor<int32, [4]> var_4320_end_0 = const()[name = string("op_4320_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_4320_end_mask_0 = const()[name = string("op_4320_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_4320_cast_fp16 = slice_by_index(begin = var_4320_begin_0, end = var_4320_end_0, end_mask = var_4320_end_mask_0, x = key_heads_41_cast_fp16)[name = string("op_4320_cast_fp16")];
+            tensor<int32, [4]> var_4324_begin_0 = const()[name = string("op_4324_begin_0"), val = tensor<int32, [4]>([0, 7, 0, 0])];
+            tensor<int32, [4]> var_4324_end_0 = const()[name = string("op_4324_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_4324_end_mask_0 = const()[name = string("op_4324_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_4324_cast_fp16 = slice_by_index(begin = var_4324_begin_0, end = var_4324_end_0, end_mask = var_4324_end_mask_0, x = value_heads_41_cast_fp16)[name = string("op_4324_cast_fp16")];
+            bool key_heads_43_interleave_0 = const()[name = string("key_heads_43_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 256]> key_heads_43_cast_fp16 = concat(axis = var_4050, interleave = key_heads_43_interleave_0, values = (var_4208_cast_fp16, var_4208_cast_fp16, var_4224_cast_fp16, var_4224_cast_fp16, var_4240_cast_fp16, var_4240_cast_fp16, var_4256_cast_fp16, var_4256_cast_fp16, var_4272_cast_fp16, var_4272_cast_fp16, var_4288_cast_fp16, var_4288_cast_fp16, var_4304_cast_fp16, var_4304_cast_fp16, var_4320_cast_fp16, var_4320_cast_fp16))[name = string("key_heads_43_cast_fp16")];
+            bool value_heads_43_interleave_0 = const()[name = string("value_heads_43_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 256]> value_heads_43_cast_fp16 = concat(axis = var_4050, interleave = value_heads_43_interleave_0, values = (var_4212_cast_fp16, var_4212_cast_fp16, var_4228_cast_fp16, var_4228_cast_fp16, var_4244_cast_fp16, var_4244_cast_fp16, var_4260_cast_fp16, var_4260_cast_fp16, var_4276_cast_fp16, var_4276_cast_fp16, var_4292_cast_fp16, var_4292_cast_fp16, var_4308_cast_fp16, var_4308_cast_fp16, var_4324_cast_fp16, var_4324_cast_fp16))[name = string("value_heads_43_cast_fp16")];
+            fp16 var_4347_to_fp16 = const()[name = string("op_4347_to_fp16"), val = fp16(0x1.6ap-4)];
+            tensor<fp16, [1, 16, 128, 1]> var_4348_cast_fp16 = mul(x = mh_q_63_cast_fp16, y = var_4347_to_fp16)[name = string("op_4348_cast_fp16")];
+            bool mh_w_41_transpose_x_0 = const()[name = string("mh_w_41_transpose_x_0"), val = bool(true)];
+            bool mh_w_41_transpose_y_0 = const()[name = string("mh_w_41_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 1, 256]> mh_w_41_cast_fp16 = matmul(transpose_x = mh_w_41_transpose_x_0, transpose_y = mh_w_41_transpose_y_0, x = var_4348_cast_fp16, y = key_heads_43_cast_fp16)[name = string("mh_w_41_cast_fp16")];
+            tensor<fp16, [1, 16, 1, 256]> mh_w_43_cast_fp16 = add(x = mh_w_41_cast_fp16, y = var_487_cast_fp16)[name = string("mh_w_43_cast_fp16")];
+            tensor<fp16, [1, 16, 1, 256]> var_4360_cast_fp16 = softmax(axis = var_4032, x = mh_w_43_cast_fp16)[name = string("op_4360_cast_fp16")];
+            bool attn_21_transpose_x_0 = const()[name = string("attn_21_transpose_x_0"), val = bool(false)];
+            bool attn_21_transpose_y_0 = const()[name = string("attn_21_transpose_y_0"), val = bool(true)];
+            tensor<fp16, [1, 16, 128, 1]> attn_21_cast_fp16 = matmul(transpose_x = attn_21_transpose_x_0, transpose_y = attn_21_transpose_y_0, x = value_heads_43_cast_fp16, y = var_4360_cast_fp16)[name = string("attn_21_cast_fp16")];
+            tensor<int32, [4]> var_4365 = const()[name = string("op_4365"), val = tensor<int32, [4]>([1, -1, 1, 1])];
+            tensor<fp16, [1, 2048, 1, 1]> input_81_cast_fp16 = reshape(shape = var_4365, x = attn_21_cast_fp16)[name = string("input_81_cast_fp16")];
+            string obj_91_pad_type_0 = const()[name = string("obj_91_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> obj_91_strides_0 = const()[name = string("obj_91_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> obj_91_pad_0 = const()[name = string("obj_91_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> obj_91_dilations_0 = const()[name = string("obj_91_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 obj_91_groups_0 = const()[name = string("obj_91_groups_0"), val = int32(1)];
+            tensor<fp16, [2048, 2048, 1, 1]> layers_10_self_attn_o_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [2048, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(511919744))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(516114112))))[name = string("layers_10_self_attn_o_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1, 1]> obj_91_cast_fp16 = conv(bias = layers_0_self_attn_q_proj_bias_to_fp16, dilations = obj_91_dilations_0, groups = obj_91_groups_0, pad = obj_91_pad_0, pad_type = obj_91_pad_type_0, strides = obj_91_strides_0, weight = layers_10_self_attn_o_proj_weight_to_fp16_palettized, x = input_81_cast_fp16)[name = string("obj_91_cast_fp16")];
+            tensor<fp16, [1, 2048, 1, 1]> inputs_85_cast_fp16 = add(x = inputs_79_cast_fp16, y = obj_91_cast_fp16)[name = string("inputs_85_cast_fp16")];
+            tensor<fp16, [1, 2048, 1, 1]> inputs_sq_87_cast_fp16 = mul(x = inputs_85_cast_fp16, y = inputs_85_cast_fp16)[name = string("inputs_sq_87_cast_fp16")];
+            tensor<int32, [1]> variance_87_axes_0 = const()[name = string("variance_87_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_87_keep_dims_0 = const()[name = string("variance_87_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 1, 1, 1]> variance_87_cast_fp16 = reduce_mean(axes = variance_87_axes_0, keep_dims = variance_87_keep_dims_0, x = inputs_sq_87_cast_fp16)[name = string("variance_87_cast_fp16")];
+            fp16 var_4383_to_fp16 = const()[name = string("op_4383_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 1, 1]> var_4384_cast_fp16 = add(x = variance_87_cast_fp16, y = var_4383_to_fp16)[name = string("op_4384_cast_fp16")];
+            fp32 var_4385_epsilon_0 = const()[name = string("op_4385_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 1, 1, 1]> var_4385_cast_fp16 = rsqrt(epsilon = var_4385_epsilon_0, x = var_4384_cast_fp16)[name = string("op_4385_cast_fp16")];
+            tensor<fp16, [1, 2048, 1, 1]> hidden_states_107_cast_fp16 = mul(x = inputs_85_cast_fp16, y = var_4385_cast_fp16)[name = string("hidden_states_107_cast_fp16")];
+            tensor<fp16, [1, 2048, 1, 1]> w_87_to_fp16 = const()[name = string("w_87_to_fp16"), val = tensor<fp16, [1, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(516114688)))];
+            tensor<fp16, [1, 2048, 1, 1]> input_83_cast_fp16 = mul(x = w_87_to_fp16, y = hidden_states_107_cast_fp16)[name = string("input_83_cast_fp16")];
+            string input_85_pad_type_0 = const()[name = string("input_85_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> input_85_strides_0 = const()[name = string("input_85_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> input_85_pad_0 = const()[name = string("input_85_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> input_85_dilations_0 = const()[name = string("input_85_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 input_85_groups_0 = const()[name = string("input_85_groups_0"), val = int32(1)];
+            tensor<fp16, [6144, 2048, 1, 1]> layers_10_mlp_gate_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [6144, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(516118848))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(528701824))))[name = string("layers_10_mlp_gate_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 6144, 1, 1]> input_85_cast_fp16 = conv(dilations = input_85_dilations_0, groups = input_85_groups_0, pad = input_85_pad_0, pad_type = input_85_pad_type_0, strides = input_85_strides_0, weight = layers_10_mlp_gate_proj_weight_to_fp16_palettized, x = input_83_cast_fp16)[name = string("input_85_cast_fp16")];
+            tensor<fp16, [1, 6144, 1, 1]> var_4399_cast_fp16 = silu(x = input_85_cast_fp16)[name = string("op_4399_cast_fp16")];
+            string var_4405_pad_type_0 = const()[name = string("op_4405_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_4405_strides_0 = const()[name = string("op_4405_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_4405_pad_0 = const()[name = string("op_4405_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_4405_dilations_0 = const()[name = string("op_4405_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_4405_groups_0 = const()[name = string("op_4405_groups_0"), val = int32(1)];
+            tensor<fp16, [6144, 2048, 1, 1]> layers_10_mlp_up_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [6144, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(528702400))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(541285376))))[name = string("layers_10_mlp_up_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 6144, 1, 1]> var_4405_cast_fp16 = conv(dilations = var_4405_dilations_0, groups = var_4405_groups_0, pad = var_4405_pad_0, pad_type = var_4405_pad_type_0, strides = var_4405_strides_0, weight = layers_10_mlp_up_proj_weight_to_fp16_palettized, x = input_83_cast_fp16)[name = string("op_4405_cast_fp16")];
+            tensor<fp16, [1, 6144, 1, 1]> input_87_cast_fp16 = mul(x = var_4399_cast_fp16, y = var_4405_cast_fp16)[name = string("input_87_cast_fp16")];
+            string hidden_states_109_pad_type_0 = const()[name = string("hidden_states_109_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> hidden_states_109_strides_0 = const()[name = string("hidden_states_109_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> hidden_states_109_pad_0 = const()[name = string("hidden_states_109_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> hidden_states_109_dilations_0 = const()[name = string("hidden_states_109_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 hidden_states_109_groups_0 = const()[name = string("hidden_states_109_groups_0"), val = int32(1)];
+            tensor<fp16, [2048, 6144, 1, 1]> layers_10_mlp_down_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [2048, 6144, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(541285952))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(553868928))))[name = string("layers_10_mlp_down_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1, 1]> hidden_states_109_cast_fp16 = conv(dilations = hidden_states_109_dilations_0, groups = hidden_states_109_groups_0, pad = hidden_states_109_pad_0, pad_type = hidden_states_109_pad_type_0, strides = hidden_states_109_strides_0, weight = layers_10_mlp_down_proj_weight_to_fp16_palettized, x = input_87_cast_fp16)[name = string("hidden_states_109_cast_fp16")];
+            tensor<fp16, [1, 2048, 1, 1]> inputs_87_cast_fp16 = add(x = inputs_85_cast_fp16, y = hidden_states_109_cast_fp16)[name = string("inputs_87_cast_fp16")];
+            int32 var_4419 = const()[name = string("op_4419"), val = int32(3)];
+            int32 var_4429 = const()[name = string("op_4429"), val = int32(-2)];
+            int32 var_4437 = const()[name = string("op_4437"), val = int32(1)];
+            tensor<fp16, [1, 2048, 1, 1]> inputs_sq_89_cast_fp16 = mul(x = inputs_87_cast_fp16, y = inputs_87_cast_fp16)[name = string("inputs_sq_89_cast_fp16")];
+            tensor<int32, [1]> variance_89_axes_0 = const()[name = string("variance_89_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_89_keep_dims_0 = const()[name = string("variance_89_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 1, 1, 1]> variance_89_cast_fp16 = reduce_mean(axes = variance_89_axes_0, keep_dims = variance_89_keep_dims_0, x = inputs_sq_89_cast_fp16)[name = string("variance_89_cast_fp16")];
+            fp16 var_4449_to_fp16 = const()[name = string("op_4449_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 1, 1]> var_4450_cast_fp16 = add(x = variance_89_cast_fp16, y = var_4449_to_fp16)[name = string("op_4450_cast_fp16")];
+            fp32 var_4451_epsilon_0 = const()[name = string("op_4451_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 1, 1, 1]> var_4451_cast_fp16 = rsqrt(epsilon = var_4451_epsilon_0, x = var_4450_cast_fp16)[name = string("op_4451_cast_fp16")];
+            tensor<fp16, [1, 2048, 1, 1]> hidden_states_111_cast_fp16 = mul(x = inputs_87_cast_fp16, y = var_4451_cast_fp16)[name = string("hidden_states_111_cast_fp16")];
+            tensor<fp16, [1, 2048, 1, 1]> w_89_to_fp16 = const()[name = string("w_89_to_fp16"), val = tensor<fp16, [1, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(553869504)))];
+            tensor<fp16, [1, 2048, 1, 1]> obj_93_cast_fp16 = mul(x = w_89_to_fp16, y = hidden_states_111_cast_fp16)[name = string("obj_93_cast_fp16")];
+            string query_67_pad_type_0 = const()[name = string("query_67_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> query_67_strides_0 = const()[name = string("query_67_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> query_67_pad_0 = const()[name = string("query_67_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> query_67_dilations_0 = const()[name = string("query_67_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 query_67_groups_0 = const()[name = string("query_67_groups_0"), val = int32(1)];
+            tensor<fp16, [2048, 2048, 1, 1]> layers_11_self_attn_q_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [2048, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(553873664))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(558068032))))[name = string("layers_11_self_attn_q_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1, 1]> query_67_cast_fp16 = conv(bias = layers_0_self_attn_q_proj_bias_to_fp16, dilations = query_67_dilations_0, groups = query_67_groups_0, pad = query_67_pad_0, pad_type = query_67_pad_type_0, strides = query_67_strides_0, weight = layers_11_self_attn_q_proj_weight_to_fp16_palettized, x = obj_93_cast_fp16)[name = string("query_67_cast_fp16")];
+            string current_key_45_pad_type_0 = const()[name = string("current_key_45_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> current_key_45_strides_0 = const()[name = string("current_key_45_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> current_key_45_pad_0 = const()[name = string("current_key_45_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> current_key_45_dilations_0 = const()[name = string("current_key_45_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 current_key_45_groups_0 = const()[name = string("current_key_45_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 2048, 1, 1]> layers_11_self_attn_k_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(558068608))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(560165824))))[name = string("layers_11_self_attn_k_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> current_key_45_cast_fp16 = conv(dilations = current_key_45_dilations_0, groups = current_key_45_groups_0, pad = current_key_45_pad_0, pad_type = current_key_45_pad_type_0, strides = current_key_45_strides_0, weight = layers_11_self_attn_k_proj_weight_to_fp16_palettized, x = obj_93_cast_fp16)[name = string("current_key_45_cast_fp16")];
+            string current_value_23_pad_type_0 = const()[name = string("current_value_23_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> current_value_23_strides_0 = const()[name = string("current_value_23_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> current_value_23_pad_0 = const()[name = string("current_value_23_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> current_value_23_dilations_0 = const()[name = string("current_value_23_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 current_value_23_groups_0 = const()[name = string("current_value_23_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 2048, 1, 1]> layers_11_self_attn_v_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(560166400))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(562263616))))[name = string("layers_11_self_attn_v_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> current_value_23_cast_fp16 = conv(bias = layers_0_self_attn_v_proj_bias_to_fp16, dilations = current_value_23_dilations_0, groups = current_value_23_groups_0, pad = current_value_23_pad_0, pad_type = current_value_23_pad_type_0, strides = current_value_23_strides_0, weight = layers_11_self_attn_v_proj_weight_to_fp16_palettized, x = obj_93_cast_fp16)[name = string("current_value_23_cast_fp16")];
+            tensor<int32, [4]> var_4488 = const()[name = string("op_4488"), val = tensor<int32, [4]>([16, 128, 1, 1])];
+            tensor<fp16, [16, 128, 1, 1]> inputs_89_cast_fp16 = reshape(shape = var_4488, x = query_67_cast_fp16)[name = string("inputs_89_cast_fp16")];
+            tensor<fp16, [16, 128, 1, 1]> inputs_sq_91_cast_fp16 = mul(x = inputs_89_cast_fp16, y = inputs_89_cast_fp16)[name = string("inputs_sq_91_cast_fp16")];
+            tensor<int32, [1]> variance_91_axes_0 = const()[name = string("variance_91_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_91_keep_dims_0 = const()[name = string("variance_91_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [16, 1, 1, 1]> variance_91_cast_fp16 = reduce_mean(axes = variance_91_axes_0, keep_dims = variance_91_keep_dims_0, x = inputs_sq_91_cast_fp16)[name = string("variance_91_cast_fp16")];
+            fp16 var_4494_to_fp16 = const()[name = string("op_4494_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [16, 1, 1, 1]> var_4495_cast_fp16 = add(x = variance_91_cast_fp16, y = var_4494_to_fp16)[name = string("op_4495_cast_fp16")];
+            fp32 var_4496_epsilon_0 = const()[name = string("op_4496_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [16, 1, 1, 1]> var_4496_cast_fp16 = rsqrt(epsilon = var_4496_epsilon_0, x = var_4495_cast_fp16)[name = string("op_4496_cast_fp16")];
+            tensor<fp16, [16, 128, 1, 1]> hidden_states_113_cast_fp16 = mul(x = inputs_89_cast_fp16, y = var_4496_cast_fp16)[name = string("hidden_states_113_cast_fp16")];
+            tensor<fp16, [1, 128, 1, 1]> w_91_to_fp16 = const()[name = string("w_91_to_fp16"), val = tensor<fp16, [1, 128, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(562264192)))];
+            tensor<fp16, [16, 128, 1, 1]> query_normed_23_cast_fp16 = mul(x = w_91_to_fp16, y = hidden_states_113_cast_fp16)[name = string("query_normed_23_cast_fp16")];
+            tensor<int32, [4]> var_4504 = const()[name = string("op_4504"), val = tensor<int32, [4]>([8, 128, 1, 1])];
+            tensor<fp16, [8, 128, 1, 1]> inputs_91_cast_fp16 = reshape(shape = var_4504, x = current_key_45_cast_fp16)[name = string("inputs_91_cast_fp16")];
+            tensor<fp16, [8, 128, 1, 1]> inputs_sq_93_cast_fp16 = mul(x = inputs_91_cast_fp16, y = inputs_91_cast_fp16)[name = string("inputs_sq_93_cast_fp16")];
+            tensor<int32, [1]> variance_93_axes_0 = const()[name = string("variance_93_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_93_keep_dims_0 = const()[name = string("variance_93_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [8, 1, 1, 1]> variance_93_cast_fp16 = reduce_mean(axes = variance_93_axes_0, keep_dims = variance_93_keep_dims_0, x = inputs_sq_93_cast_fp16)[name = string("variance_93_cast_fp16")];
+            fp16 var_4510_to_fp16 = const()[name = string("op_4510_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [8, 1, 1, 1]> var_4511_cast_fp16 = add(x = variance_93_cast_fp16, y = var_4510_to_fp16)[name = string("op_4511_cast_fp16")];
+            fp32 var_4512_epsilon_0 = const()[name = string("op_4512_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [8, 1, 1, 1]> var_4512_cast_fp16 = rsqrt(epsilon = var_4512_epsilon_0, x = var_4511_cast_fp16)[name = string("op_4512_cast_fp16")];
+            tensor<fp16, [8, 128, 1, 1]> hidden_states_115_cast_fp16 = mul(x = inputs_91_cast_fp16, y = var_4512_cast_fp16)[name = string("hidden_states_115_cast_fp16")];
+            tensor<fp16, [1, 128, 1, 1]> w_93_to_fp16 = const()[name = string("w_93_to_fp16"), val = tensor<fp16, [1, 128, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(562264512)))];
+            tensor<fp16, [8, 128, 1, 1]> current_key_normed_23_cast_fp16 = mul(x = w_93_to_fp16, y = hidden_states_115_cast_fp16)[name = string("current_key_normed_23_cast_fp16")];
+            tensor<int32, [4]> var_4530 = const()[name = string("op_4530"), val = tensor<int32, [4]>([1, 16, 128, -1])];
+            tensor<fp16, [1, 16, 128, 1]> mh_q_67_cast_fp16 = reshape(shape = var_4530, x = query_normed_23_cast_fp16)[name = string("mh_q_67_cast_fp16")];
+            tensor<int32, [4]> var_4532 = const()[name = string("op_4532"), val = tensor<int32, [4]>([1, 8, 128, -1])];
+            tensor<fp16, [1, 8, 128, 1]> mh_k_45_cast_fp16 = reshape(shape = var_4532, x = current_key_normed_23_cast_fp16)[name = string("mh_k_45_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> var_4536_cast_fp16 = mul(x = mh_q_67_cast_fp16, y = cos_1_cast_fp16)[name = string("op_4536_cast_fp16")];
+            tensor<int32, [4]> var_4541_begin_0 = const()[name = string("op_4541_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_4541_end_0 = const()[name = string("op_4541_end_0"), val = tensor<int32, [4]>([1, 16, 64, 1])];
+            tensor<bool, [4]> var_4541_end_mask_0 = const()[name = string("op_4541_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 16, 64, 1]> var_4541_cast_fp16 = slice_by_index(begin = var_4541_begin_0, end = var_4541_end_0, end_mask = var_4541_end_mask_0, x = mh_q_67_cast_fp16)[name = string("op_4541_cast_fp16")];
+            tensor<int32, [4]> var_4547_begin_0 = const()[name = string("op_4547_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_4547_end_0 = const()[name = string("op_4547_end_0"), val = tensor<int32, [4]>([1, 16, 128, 1])];
+            tensor<bool, [4]> var_4547_end_mask_0 = const()[name = string("op_4547_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 16, 64, 1]> var_4547_cast_fp16 = slice_by_index(begin = var_4547_begin_0, end = var_4547_end_0, end_mask = var_4547_end_mask_0, x = mh_q_67_cast_fp16)[name = string("op_4547_cast_fp16")];
+            fp16 const_270_promoted_to_fp16 = const()[name = string("const_270_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 16, 64, 1]> var_4549_cast_fp16 = mul(x = var_4547_cast_fp16, y = const_270_promoted_to_fp16)[name = string("op_4549_cast_fp16")];
+            bool var_4551_interleave_0 = const()[name = string("op_4551_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 1]> var_4551_cast_fp16 = concat(axis = var_4429, interleave = var_4551_interleave_0, values = (var_4549_cast_fp16, var_4541_cast_fp16))[name = string("op_4551_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> var_4552_cast_fp16 = mul(x = var_4551_cast_fp16, y = sin_1_cast_fp16)[name = string("op_4552_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> mh_q_69_cast_fp16 = add(x = var_4536_cast_fp16, y = var_4552_cast_fp16)[name = string("mh_q_69_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> var_4554_cast_fp16 = mul(x = mh_k_45_cast_fp16, y = cos_1_cast_fp16)[name = string("op_4554_cast_fp16")];
+            tensor<int32, [4]> var_4559_begin_0 = const()[name = string("op_4559_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_4559_end_0 = const()[name = string("op_4559_end_0"), val = tensor<int32, [4]>([1, 8, 64, 1])];
+            tensor<bool, [4]> var_4559_end_mask_0 = const()[name = string("op_4559_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 8, 64, 1]> var_4559_cast_fp16 = slice_by_index(begin = var_4559_begin_0, end = var_4559_end_0, end_mask = var_4559_end_mask_0, x = mh_k_45_cast_fp16)[name = string("op_4559_cast_fp16")];
+            tensor<int32, [4]> var_4565_begin_0 = const()[name = string("op_4565_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_4565_end_0 = const()[name = string("op_4565_end_0"), val = tensor<int32, [4]>([1, 8, 128, 1])];
+            tensor<bool, [4]> var_4565_end_mask_0 = const()[name = string("op_4565_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 8, 64, 1]> var_4565_cast_fp16 = slice_by_index(begin = var_4565_begin_0, end = var_4565_end_0, end_mask = var_4565_end_mask_0, x = mh_k_45_cast_fp16)[name = string("op_4565_cast_fp16")];
+            fp16 const_273_promoted_to_fp16 = const()[name = string("const_273_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 64, 1]> var_4567_cast_fp16 = mul(x = var_4565_cast_fp16, y = const_273_promoted_to_fp16)[name = string("op_4567_cast_fp16")];
+            bool var_4569_interleave_0 = const()[name = string("op_4569_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 128, 1]> var_4569_cast_fp16 = concat(axis = var_4429, interleave = var_4569_interleave_0, values = (var_4567_cast_fp16, var_4559_cast_fp16))[name = string("op_4569_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> var_4570_cast_fp16 = mul(x = var_4569_cast_fp16, y = sin_1_cast_fp16)[name = string("op_4570_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> mh_k_47_cast_fp16 = add(x = var_4554_cast_fp16, y = var_4570_cast_fp16)[name = string("mh_k_47_cast_fp16")];
+            tensor<int32, [4]> var_4574 = const()[name = string("op_4574"), val = tensor<int32, [4]>([1, 1024, 1, 1])];
+            tensor<fp16, [1, 1024, 1, 1]> current_key_47_cast_fp16 = reshape(shape = var_4574, x = mh_k_47_cast_fp16)[name = string("current_key_47_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_4581_cast_fp16 = mul(x = var_101_cast_fp16_11, y = var_323_cast_fp16)[name = string("op_4581_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_4582_cast_fp16 = mul(x = current_key_47_cast_fp16, y = var_321_cast_fp16)[name = string("op_4582_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> key_69_cast_fp16 = add(x = var_4581_cast_fp16, y = var_4582_cast_fp16)[name = string("key_69_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_4585_cast_fp16 = mul(x = var_132_cast_fp16_11, y = var_323_cast_fp16)[name = string("op_4585_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_4586_cast_fp16 = mul(x = current_value_23_cast_fp16, y = var_321_cast_fp16)[name = string("op_4586_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> value_45_cast_fp16 = add(x = var_4585_cast_fp16, y = var_4586_cast_fp16)[name = string("value_45_cast_fp16")];
+            tensor<int32, [4]> var_4590 = const()[name = string("op_4590"), val = tensor<int32, [4]>([1, 8, 128, 256])];
+            tensor<fp16, [1, 8, 128, 256]> key_heads_45_cast_fp16 = reshape(shape = var_4590, x = key_69_cast_fp16)[name = string("key_heads_45_cast_fp16")];
+            tensor<int32, [4]> var_4592 = const()[name = string("op_4592"), val = tensor<int32, [4]>([1, 8, 128, 256])];
+            tensor<fp16, [1, 8, 128, 256]> value_heads_45_cast_fp16 = reshape(shape = var_4592, x = value_45_cast_fp16)[name = string("value_heads_45_cast_fp16")];
+            tensor<int32, [4]> var_4595_begin_0 = const()[name = string("op_4595_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_4595_end_0 = const()[name = string("op_4595_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_4595_end_mask_0 = const()[name = string("op_4595_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_4595_cast_fp16 = slice_by_index(begin = var_4595_begin_0, end = var_4595_end_0, end_mask = var_4595_end_mask_0, x = key_heads_45_cast_fp16)[name = string("op_4595_cast_fp16")];
+            tensor<int32, [4]> var_4599_begin_0 = const()[name = string("op_4599_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_4599_end_0 = const()[name = string("op_4599_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_4599_end_mask_0 = const()[name = string("op_4599_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_4599_cast_fp16 = slice_by_index(begin = var_4599_begin_0, end = var_4599_end_0, end_mask = var_4599_end_mask_0, x = value_heads_45_cast_fp16)[name = string("op_4599_cast_fp16")];
+            tensor<int32, [4]> var_4611_begin_0 = const()[name = string("op_4611_begin_0"), val = tensor<int32, [4]>([0, 1, 0, 0])];
+            tensor<int32, [4]> var_4611_end_0 = const()[name = string("op_4611_end_0"), val = tensor<int32, [4]>([1, 2, 128, 256])];
+            tensor<bool, [4]> var_4611_end_mask_0 = const()[name = string("op_4611_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_4611_cast_fp16 = slice_by_index(begin = var_4611_begin_0, end = var_4611_end_0, end_mask = var_4611_end_mask_0, x = key_heads_45_cast_fp16)[name = string("op_4611_cast_fp16")];
+            tensor<int32, [4]> var_4615_begin_0 = const()[name = string("op_4615_begin_0"), val = tensor<int32, [4]>([0, 1, 0, 0])];
+            tensor<int32, [4]> var_4615_end_0 = const()[name = string("op_4615_end_0"), val = tensor<int32, [4]>([1, 2, 128, 256])];
+            tensor<bool, [4]> var_4615_end_mask_0 = const()[name = string("op_4615_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_4615_cast_fp16 = slice_by_index(begin = var_4615_begin_0, end = var_4615_end_0, end_mask = var_4615_end_mask_0, x = value_heads_45_cast_fp16)[name = string("op_4615_cast_fp16")];
+            tensor<int32, [4]> var_4627_begin_0 = const()[name = string("op_4627_begin_0"), val = tensor<int32, [4]>([0, 2, 0, 0])];
+            tensor<int32, [4]> var_4627_end_0 = const()[name = string("op_4627_end_0"), val = tensor<int32, [4]>([1, 3, 128, 256])];
+            tensor<bool, [4]> var_4627_end_mask_0 = const()[name = string("op_4627_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_4627_cast_fp16 = slice_by_index(begin = var_4627_begin_0, end = var_4627_end_0, end_mask = var_4627_end_mask_0, x = key_heads_45_cast_fp16)[name = string("op_4627_cast_fp16")];
+            tensor<int32, [4]> var_4631_begin_0 = const()[name = string("op_4631_begin_0"), val = tensor<int32, [4]>([0, 2, 0, 0])];
+            tensor<int32, [4]> var_4631_end_0 = const()[name = string("op_4631_end_0"), val = tensor<int32, [4]>([1, 3, 128, 256])];
+            tensor<bool, [4]> var_4631_end_mask_0 = const()[name = string("op_4631_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_4631_cast_fp16 = slice_by_index(begin = var_4631_begin_0, end = var_4631_end_0, end_mask = var_4631_end_mask_0, x = value_heads_45_cast_fp16)[name = string("op_4631_cast_fp16")];
+            tensor<int32, [4]> var_4643_begin_0 = const()[name = string("op_4643_begin_0"), val = tensor<int32, [4]>([0, 3, 0, 0])];
+            tensor<int32, [4]> var_4643_end_0 = const()[name = string("op_4643_end_0"), val = tensor<int32, [4]>([1, 4, 128, 256])];
+            tensor<bool, [4]> var_4643_end_mask_0 = const()[name = string("op_4643_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_4643_cast_fp16 = slice_by_index(begin = var_4643_begin_0, end = var_4643_end_0, end_mask = var_4643_end_mask_0, x = key_heads_45_cast_fp16)[name = string("op_4643_cast_fp16")];
+            tensor<int32, [4]> var_4647_begin_0 = const()[name = string("op_4647_begin_0"), val = tensor<int32, [4]>([0, 3, 0, 0])];
+            tensor<int32, [4]> var_4647_end_0 = const()[name = string("op_4647_end_0"), val = tensor<int32, [4]>([1, 4, 128, 256])];
+            tensor<bool, [4]> var_4647_end_mask_0 = const()[name = string("op_4647_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_4647_cast_fp16 = slice_by_index(begin = var_4647_begin_0, end = var_4647_end_0, end_mask = var_4647_end_mask_0, x = value_heads_45_cast_fp16)[name = string("op_4647_cast_fp16")];
+            tensor<int32, [4]> var_4659_begin_0 = const()[name = string("op_4659_begin_0"), val = tensor<int32, [4]>([0, 4, 0, 0])];
+            tensor<int32, [4]> var_4659_end_0 = const()[name = string("op_4659_end_0"), val = tensor<int32, [4]>([1, 5, 128, 256])];
+            tensor<bool, [4]> var_4659_end_mask_0 = const()[name = string("op_4659_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_4659_cast_fp16 = slice_by_index(begin = var_4659_begin_0, end = var_4659_end_0, end_mask = var_4659_end_mask_0, x = key_heads_45_cast_fp16)[name = string("op_4659_cast_fp16")];
+            tensor<int32, [4]> var_4663_begin_0 = const()[name = string("op_4663_begin_0"), val = tensor<int32, [4]>([0, 4, 0, 0])];
+            tensor<int32, [4]> var_4663_end_0 = const()[name = string("op_4663_end_0"), val = tensor<int32, [4]>([1, 5, 128, 256])];
+            tensor<bool, [4]> var_4663_end_mask_0 = const()[name = string("op_4663_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_4663_cast_fp16 = slice_by_index(begin = var_4663_begin_0, end = var_4663_end_0, end_mask = var_4663_end_mask_0, x = value_heads_45_cast_fp16)[name = string("op_4663_cast_fp16")];
+            tensor<int32, [4]> var_4675_begin_0 = const()[name = string("op_4675_begin_0"), val = tensor<int32, [4]>([0, 5, 0, 0])];
+            tensor<int32, [4]> var_4675_end_0 = const()[name = string("op_4675_end_0"), val = tensor<int32, [4]>([1, 6, 128, 256])];
+            tensor<bool, [4]> var_4675_end_mask_0 = const()[name = string("op_4675_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_4675_cast_fp16 = slice_by_index(begin = var_4675_begin_0, end = var_4675_end_0, end_mask = var_4675_end_mask_0, x = key_heads_45_cast_fp16)[name = string("op_4675_cast_fp16")];
+            tensor<int32, [4]> var_4679_begin_0 = const()[name = string("op_4679_begin_0"), val = tensor<int32, [4]>([0, 5, 0, 0])];
+            tensor<int32, [4]> var_4679_end_0 = const()[name = string("op_4679_end_0"), val = tensor<int32, [4]>([1, 6, 128, 256])];
+            tensor<bool, [4]> var_4679_end_mask_0 = const()[name = string("op_4679_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_4679_cast_fp16 = slice_by_index(begin = var_4679_begin_0, end = var_4679_end_0, end_mask = var_4679_end_mask_0, x = value_heads_45_cast_fp16)[name = string("op_4679_cast_fp16")];
+            tensor<int32, [4]> var_4691_begin_0 = const()[name = string("op_4691_begin_0"), val = tensor<int32, [4]>([0, 6, 0, 0])];
+            tensor<int32, [4]> var_4691_end_0 = const()[name = string("op_4691_end_0"), val = tensor<int32, [4]>([1, 7, 128, 256])];
+            tensor<bool, [4]> var_4691_end_mask_0 = const()[name = string("op_4691_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_4691_cast_fp16 = slice_by_index(begin = var_4691_begin_0, end = var_4691_end_0, end_mask = var_4691_end_mask_0, x = key_heads_45_cast_fp16)[name = string("op_4691_cast_fp16")];
+            tensor<int32, [4]> var_4695_begin_0 = const()[name = string("op_4695_begin_0"), val = tensor<int32, [4]>([0, 6, 0, 0])];
+            tensor<int32, [4]> var_4695_end_0 = const()[name = string("op_4695_end_0"), val = tensor<int32, [4]>([1, 7, 128, 256])];
+            tensor<bool, [4]> var_4695_end_mask_0 = const()[name = string("op_4695_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_4695_cast_fp16 = slice_by_index(begin = var_4695_begin_0, end = var_4695_end_0, end_mask = var_4695_end_mask_0, x = value_heads_45_cast_fp16)[name = string("op_4695_cast_fp16")];
+            tensor<int32, [4]> var_4707_begin_0 = const()[name = string("op_4707_begin_0"), val = tensor<int32, [4]>([0, 7, 0, 0])];
+            tensor<int32, [4]> var_4707_end_0 = const()[name = string("op_4707_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_4707_end_mask_0 = const()[name = string("op_4707_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_4707_cast_fp16 = slice_by_index(begin = var_4707_begin_0, end = var_4707_end_0, end_mask = var_4707_end_mask_0, x = key_heads_45_cast_fp16)[name = string("op_4707_cast_fp16")];
+            tensor<int32, [4]> var_4711_begin_0 = const()[name = string("op_4711_begin_0"), val = tensor<int32, [4]>([0, 7, 0, 0])];
+            tensor<int32, [4]> var_4711_end_0 = const()[name = string("op_4711_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_4711_end_mask_0 = const()[name = string("op_4711_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_4711_cast_fp16 = slice_by_index(begin = var_4711_begin_0, end = var_4711_end_0, end_mask = var_4711_end_mask_0, x = value_heads_45_cast_fp16)[name = string("op_4711_cast_fp16")];
+            bool key_heads_47_interleave_0 = const()[name = string("key_heads_47_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 256]> key_heads_47_cast_fp16 = concat(axis = var_4437, interleave = key_heads_47_interleave_0, values = (var_4595_cast_fp16, var_4595_cast_fp16, var_4611_cast_fp16, var_4611_cast_fp16, var_4627_cast_fp16, var_4627_cast_fp16, var_4643_cast_fp16, var_4643_cast_fp16, var_4659_cast_fp16, var_4659_cast_fp16, var_4675_cast_fp16, var_4675_cast_fp16, var_4691_cast_fp16, var_4691_cast_fp16, var_4707_cast_fp16, var_4707_cast_fp16))[name = string("key_heads_47_cast_fp16")];
+            bool value_heads_47_interleave_0 = const()[name = string("value_heads_47_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 256]> value_heads_47_cast_fp16 = concat(axis = var_4437, interleave = value_heads_47_interleave_0, values = (var_4599_cast_fp16, var_4599_cast_fp16, var_4615_cast_fp16, var_4615_cast_fp16, var_4631_cast_fp16, var_4631_cast_fp16, var_4647_cast_fp16, var_4647_cast_fp16, var_4663_cast_fp16, var_4663_cast_fp16, var_4679_cast_fp16, var_4679_cast_fp16, var_4695_cast_fp16, var_4695_cast_fp16, var_4711_cast_fp16, var_4711_cast_fp16))[name = string("value_heads_47_cast_fp16")];
+            fp16 var_4734_to_fp16 = const()[name = string("op_4734_to_fp16"), val = fp16(0x1.6ap-4)];
+            tensor<fp16, [1, 16, 128, 1]> var_4735_cast_fp16 = mul(x = mh_q_69_cast_fp16, y = var_4734_to_fp16)[name = string("op_4735_cast_fp16")];
+            bool mh_w_45_transpose_x_0 = const()[name = string("mh_w_45_transpose_x_0"), val = bool(true)];
+            bool mh_w_45_transpose_y_0 = const()[name = string("mh_w_45_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 1, 256]> mh_w_45_cast_fp16 = matmul(transpose_x = mh_w_45_transpose_x_0, transpose_y = mh_w_45_transpose_y_0, x = var_4735_cast_fp16, y = key_heads_47_cast_fp16)[name = string("mh_w_45_cast_fp16")];
+            tensor<fp16, [1, 16, 1, 256]> mh_w_47_cast_fp16 = add(x = mh_w_45_cast_fp16, y = var_487_cast_fp16)[name = string("mh_w_47_cast_fp16")];
+            tensor<fp16, [1, 16, 1, 256]> var_4747_cast_fp16 = softmax(axis = var_4419, x = mh_w_47_cast_fp16)[name = string("op_4747_cast_fp16")];
+            bool attn_23_transpose_x_0 = const()[name = string("attn_23_transpose_x_0"), val = bool(false)];
+            bool attn_23_transpose_y_0 = const()[name = string("attn_23_transpose_y_0"), val = bool(true)];
+            tensor<fp16, [1, 16, 128, 1]> attn_23_cast_fp16 = matmul(transpose_x = attn_23_transpose_x_0, transpose_y = attn_23_transpose_y_0, x = value_heads_47_cast_fp16, y = var_4747_cast_fp16)[name = string("attn_23_cast_fp16")];
+            tensor<int32, [4]> var_4752 = const()[name = string("op_4752"), val = tensor<int32, [4]>([1, -1, 1, 1])];
+            tensor<fp16, [1, 2048, 1, 1]> input_89_cast_fp16 = reshape(shape = var_4752, x = attn_23_cast_fp16)[name = string("input_89_cast_fp16")];
+            string obj_99_pad_type_0 = const()[name = string("obj_99_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> obj_99_strides_0 = const()[name = string("obj_99_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> obj_99_pad_0 = const()[name = string("obj_99_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> obj_99_dilations_0 = const()[name = string("obj_99_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 obj_99_groups_0 = const()[name = string("obj_99_groups_0"), val = int32(1)];
+            tensor<fp16, [2048, 2048, 1, 1]> layers_11_self_attn_o_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [2048, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(562264832))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(566459200))))[name = string("layers_11_self_attn_o_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1, 1]> obj_99_cast_fp16 = conv(bias = layers_0_self_attn_q_proj_bias_to_fp16, dilations = obj_99_dilations_0, groups = obj_99_groups_0, pad = obj_99_pad_0, pad_type = obj_99_pad_type_0, strides = obj_99_strides_0, weight = layers_11_self_attn_o_proj_weight_to_fp16_palettized, x = input_89_cast_fp16)[name = string("obj_99_cast_fp16")];
+            tensor<fp16, [1, 2048, 1, 1]> inputs_93_cast_fp16 = add(x = inputs_87_cast_fp16, y = obj_99_cast_fp16)[name = string("inputs_93_cast_fp16")];
+            tensor<fp16, [1, 2048, 1, 1]> inputs_sq_95_cast_fp16 = mul(x = inputs_93_cast_fp16, y = inputs_93_cast_fp16)[name = string("inputs_sq_95_cast_fp16")];
+            tensor<int32, [1]> variance_95_axes_0 = const()[name = string("variance_95_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_95_keep_dims_0 = const()[name = string("variance_95_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 1, 1, 1]> variance_95_cast_fp16 = reduce_mean(axes = variance_95_axes_0, keep_dims = variance_95_keep_dims_0, x = inputs_sq_95_cast_fp16)[name = string("variance_95_cast_fp16")];
+            fp16 var_4770_to_fp16 = const()[name = string("op_4770_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 1, 1]> var_4771_cast_fp16 = add(x = variance_95_cast_fp16, y = var_4770_to_fp16)[name = string("op_4771_cast_fp16")];
+            fp32 var_4772_epsilon_0 = const()[name = string("op_4772_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 1, 1, 1]> var_4772_cast_fp16 = rsqrt(epsilon = var_4772_epsilon_0, x = var_4771_cast_fp16)[name = string("op_4772_cast_fp16")];
+            tensor<fp16, [1, 2048, 1, 1]> hidden_states_117_cast_fp16 = mul(x = inputs_93_cast_fp16, y = var_4772_cast_fp16)[name = string("hidden_states_117_cast_fp16")];
+            tensor<fp16, [1, 2048, 1, 1]> w_95_to_fp16 = const()[name = string("w_95_to_fp16"), val = tensor<fp16, [1, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(566459776)))];
+            tensor<fp16, [1, 2048, 1, 1]> input_91_cast_fp16 = mul(x = w_95_to_fp16, y = hidden_states_117_cast_fp16)[name = string("input_91_cast_fp16")];
+            string input_93_pad_type_0 = const()[name = string("input_93_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> input_93_strides_0 = const()[name = string("input_93_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> input_93_pad_0 = const()[name = string("input_93_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> input_93_dilations_0 = const()[name = string("input_93_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 input_93_groups_0 = const()[name = string("input_93_groups_0"), val = int32(1)];
+            tensor<fp16, [6144, 2048, 1, 1]> layers_11_mlp_gate_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [6144, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(566463936))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(579046912))))[name = string("layers_11_mlp_gate_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 6144, 1, 1]> input_93_cast_fp16 = conv(dilations = input_93_dilations_0, groups = input_93_groups_0, pad = input_93_pad_0, pad_type = input_93_pad_type_0, strides = input_93_strides_0, weight = layers_11_mlp_gate_proj_weight_to_fp16_palettized, x = input_91_cast_fp16)[name = string("input_93_cast_fp16")];
+            tensor<fp16, [1, 6144, 1, 1]> var_4786_cast_fp16 = silu(x = input_93_cast_fp16)[name = string("op_4786_cast_fp16")];
+            string var_4792_pad_type_0 = const()[name = string("op_4792_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_4792_strides_0 = const()[name = string("op_4792_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_4792_pad_0 = const()[name = string("op_4792_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_4792_dilations_0 = const()[name = string("op_4792_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_4792_groups_0 = const()[name = string("op_4792_groups_0"), val = int32(1)];
+            tensor<fp16, [6144, 2048, 1, 1]> layers_11_mlp_up_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [6144, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(579047488))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(591630464))))[name = string("layers_11_mlp_up_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 6144, 1, 1]> var_4792_cast_fp16 = conv(dilations = var_4792_dilations_0, groups = var_4792_groups_0, pad = var_4792_pad_0, pad_type = var_4792_pad_type_0, strides = var_4792_strides_0, weight = layers_11_mlp_up_proj_weight_to_fp16_palettized, x = input_91_cast_fp16)[name = string("op_4792_cast_fp16")];
+            tensor<fp16, [1, 6144, 1, 1]> input_95_cast_fp16 = mul(x = var_4786_cast_fp16, y = var_4792_cast_fp16)[name = string("input_95_cast_fp16")];
+            string hidden_states_119_pad_type_0 = const()[name = string("hidden_states_119_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> hidden_states_119_strides_0 = const()[name = string("hidden_states_119_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> hidden_states_119_pad_0 = const()[name = string("hidden_states_119_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> hidden_states_119_dilations_0 = const()[name = string("hidden_states_119_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 hidden_states_119_groups_0 = const()[name = string("hidden_states_119_groups_0"), val = int32(1)];
+            tensor<fp16, [2048, 6144, 1, 1]> layers_11_mlp_down_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [2048, 6144, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(591631040))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(604214016))))[name = string("layers_11_mlp_down_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1, 1]> hidden_states_119_cast_fp16 = conv(dilations = hidden_states_119_dilations_0, groups = hidden_states_119_groups_0, pad = hidden_states_119_pad_0, pad_type = hidden_states_119_pad_type_0, strides = hidden_states_119_strides_0, weight = layers_11_mlp_down_proj_weight_to_fp16_palettized, x = input_95_cast_fp16)[name = string("hidden_states_119_cast_fp16")];
+            tensor<fp16, [1, 2048, 1, 1]> inputs_95_cast_fp16 = add(x = inputs_93_cast_fp16, y = hidden_states_119_cast_fp16)[name = string("inputs_95_cast_fp16")];
+            int32 var_4806 = const()[name = string("op_4806"), val = int32(3)];
+            int32 var_4816 = const()[name = string("op_4816"), val = int32(-2)];
+            int32 var_4824 = const()[name = string("op_4824"), val = int32(1)];
+            tensor<fp16, [1, 2048, 1, 1]> inputs_sq_97_cast_fp16 = mul(x = inputs_95_cast_fp16, y = inputs_95_cast_fp16)[name = string("inputs_sq_97_cast_fp16")];
+            tensor<int32, [1]> variance_97_axes_0 = const()[name = string("variance_97_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_97_keep_dims_0 = const()[name = string("variance_97_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 1, 1, 1]> variance_97_cast_fp16 = reduce_mean(axes = variance_97_axes_0, keep_dims = variance_97_keep_dims_0, x = inputs_sq_97_cast_fp16)[name = string("variance_97_cast_fp16")];
+            fp16 var_4836_to_fp16 = const()[name = string("op_4836_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 1, 1]> var_4837_cast_fp16 = add(x = variance_97_cast_fp16, y = var_4836_to_fp16)[name = string("op_4837_cast_fp16")];
+            fp32 var_4838_epsilon_0 = const()[name = string("op_4838_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 1, 1, 1]> var_4838_cast_fp16 = rsqrt(epsilon = var_4838_epsilon_0, x = var_4837_cast_fp16)[name = string("op_4838_cast_fp16")];
+            tensor<fp16, [1, 2048, 1, 1]> hidden_states_121_cast_fp16 = mul(x = inputs_95_cast_fp16, y = var_4838_cast_fp16)[name = string("hidden_states_121_cast_fp16")];
+            tensor<fp16, [1, 2048, 1, 1]> w_97_to_fp16 = const()[name = string("w_97_to_fp16"), val = tensor<fp16, [1, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(604214592)))];
+            tensor<fp16, [1, 2048, 1, 1]> obj_101_cast_fp16 = mul(x = w_97_to_fp16, y = hidden_states_121_cast_fp16)[name = string("obj_101_cast_fp16")];
+            string query_73_pad_type_0 = const()[name = string("query_73_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> query_73_strides_0 = const()[name = string("query_73_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> query_73_pad_0 = const()[name = string("query_73_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> query_73_dilations_0 = const()[name = string("query_73_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 query_73_groups_0 = const()[name = string("query_73_groups_0"), val = int32(1)];
+            tensor<fp16, [2048, 2048, 1, 1]> layers_12_self_attn_q_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [2048, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(604218752))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(608413120))))[name = string("layers_12_self_attn_q_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1, 1]> query_73_cast_fp16 = conv(bias = layers_0_self_attn_q_proj_bias_to_fp16, dilations = query_73_dilations_0, groups = query_73_groups_0, pad = query_73_pad_0, pad_type = query_73_pad_type_0, strides = query_73_strides_0, weight = layers_12_self_attn_q_proj_weight_to_fp16_palettized, x = obj_101_cast_fp16)[name = string("query_73_cast_fp16")];
+            string current_key_49_pad_type_0 = const()[name = string("current_key_49_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> current_key_49_strides_0 = const()[name = string("current_key_49_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> current_key_49_pad_0 = const()[name = string("current_key_49_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> current_key_49_dilations_0 = const()[name = string("current_key_49_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 current_key_49_groups_0 = const()[name = string("current_key_49_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 2048, 1, 1]> layers_12_self_attn_k_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(608413696))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(610510912))))[name = string("layers_12_self_attn_k_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> current_key_49_cast_fp16 = conv(dilations = current_key_49_dilations_0, groups = current_key_49_groups_0, pad = current_key_49_pad_0, pad_type = current_key_49_pad_type_0, strides = current_key_49_strides_0, weight = layers_12_self_attn_k_proj_weight_to_fp16_palettized, x = obj_101_cast_fp16)[name = string("current_key_49_cast_fp16")];
+            string current_value_25_pad_type_0 = const()[name = string("current_value_25_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> current_value_25_strides_0 = const()[name = string("current_value_25_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> current_value_25_pad_0 = const()[name = string("current_value_25_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> current_value_25_dilations_0 = const()[name = string("current_value_25_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 current_value_25_groups_0 = const()[name = string("current_value_25_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 2048, 1, 1]> layers_12_self_attn_v_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(610511488))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(612608704))))[name = string("layers_12_self_attn_v_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> current_value_25_cast_fp16 = conv(bias = layers_0_self_attn_v_proj_bias_to_fp16, dilations = current_value_25_dilations_0, groups = current_value_25_groups_0, pad = current_value_25_pad_0, pad_type = current_value_25_pad_type_0, strides = current_value_25_strides_0, weight = layers_12_self_attn_v_proj_weight_to_fp16_palettized, x = obj_101_cast_fp16)[name = string("current_value_25_cast_fp16")];
+            tensor<int32, [4]> var_4875 = const()[name = string("op_4875"), val = tensor<int32, [4]>([16, 128, 1, 1])];
+            tensor<fp16, [16, 128, 1, 1]> inputs_97_cast_fp16 = reshape(shape = var_4875, x = query_73_cast_fp16)[name = string("inputs_97_cast_fp16")];
+            tensor<fp16, [16, 128, 1, 1]> inputs_sq_99_cast_fp16 = mul(x = inputs_97_cast_fp16, y = inputs_97_cast_fp16)[name = string("inputs_sq_99_cast_fp16")];
+            tensor<int32, [1]> variance_99_axes_0 = const()[name = string("variance_99_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_99_keep_dims_0 = const()[name = string("variance_99_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [16, 1, 1, 1]> variance_99_cast_fp16 = reduce_mean(axes = variance_99_axes_0, keep_dims = variance_99_keep_dims_0, x = inputs_sq_99_cast_fp16)[name = string("variance_99_cast_fp16")];
+            fp16 var_4881_to_fp16 = const()[name = string("op_4881_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [16, 1, 1, 1]> var_4882_cast_fp16 = add(x = variance_99_cast_fp16, y = var_4881_to_fp16)[name = string("op_4882_cast_fp16")];
+            fp32 var_4883_epsilon_0 = const()[name = string("op_4883_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [16, 1, 1, 1]> var_4883_cast_fp16 = rsqrt(epsilon = var_4883_epsilon_0, x = var_4882_cast_fp16)[name = string("op_4883_cast_fp16")];
+            tensor<fp16, [16, 128, 1, 1]> hidden_states_123_cast_fp16 = mul(x = inputs_97_cast_fp16, y = var_4883_cast_fp16)[name = string("hidden_states_123_cast_fp16")];
+            tensor<fp16, [1, 128, 1, 1]> w_99_to_fp16 = const()[name = string("w_99_to_fp16"), val = tensor<fp16, [1, 128, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(612609280)))];
+            tensor<fp16, [16, 128, 1, 1]> query_normed_25_cast_fp16 = mul(x = w_99_to_fp16, y = hidden_states_123_cast_fp16)[name = string("query_normed_25_cast_fp16")];
+            tensor<int32, [4]> var_4891 = const()[name = string("op_4891"), val = tensor<int32, [4]>([8, 128, 1, 1])];
+            tensor<fp16, [8, 128, 1, 1]> inputs_99_cast_fp16 = reshape(shape = var_4891, x = current_key_49_cast_fp16)[name = string("inputs_99_cast_fp16")];
+            tensor<fp16, [8, 128, 1, 1]> inputs_sq_101_cast_fp16 = mul(x = inputs_99_cast_fp16, y = inputs_99_cast_fp16)[name = string("inputs_sq_101_cast_fp16")];
+            tensor<int32, [1]> variance_101_axes_0 = const()[name = string("variance_101_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_101_keep_dims_0 = const()[name = string("variance_101_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [8, 1, 1, 1]> variance_101_cast_fp16 = reduce_mean(axes = variance_101_axes_0, keep_dims = variance_101_keep_dims_0, x = inputs_sq_101_cast_fp16)[name = string("variance_101_cast_fp16")];
+            fp16 var_4897_to_fp16 = const()[name = string("op_4897_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [8, 1, 1, 1]> var_4898_cast_fp16 = add(x = variance_101_cast_fp16, y = var_4897_to_fp16)[name = string("op_4898_cast_fp16")];
+            fp32 var_4899_epsilon_0 = const()[name = string("op_4899_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [8, 1, 1, 1]> var_4899_cast_fp16 = rsqrt(epsilon = var_4899_epsilon_0, x = var_4898_cast_fp16)[name = string("op_4899_cast_fp16")];
+            tensor<fp16, [8, 128, 1, 1]> hidden_states_125_cast_fp16 = mul(x = inputs_99_cast_fp16, y = var_4899_cast_fp16)[name = string("hidden_states_125_cast_fp16")];
+            tensor<fp16, [1, 128, 1, 1]> w_101_to_fp16 = const()[name = string("w_101_to_fp16"), val = tensor<fp16, [1, 128, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(612609600)))];
+            tensor<fp16, [8, 128, 1, 1]> current_key_normed_25_cast_fp16 = mul(x = w_101_to_fp16, y = hidden_states_125_cast_fp16)[name = string("current_key_normed_25_cast_fp16")];
+            tensor<int32, [4]> var_4917 = const()[name = string("op_4917"), val = tensor<int32, [4]>([1, 16, 128, -1])];
+            tensor<fp16, [1, 16, 128, 1]> mh_q_73_cast_fp16 = reshape(shape = var_4917, x = query_normed_25_cast_fp16)[name = string("mh_q_73_cast_fp16")];
+            tensor<int32, [4]> var_4919 = const()[name = string("op_4919"), val = tensor<int32, [4]>([1, 8, 128, -1])];
+            tensor<fp16, [1, 8, 128, 1]> mh_k_49_cast_fp16 = reshape(shape = var_4919, x = current_key_normed_25_cast_fp16)[name = string("mh_k_49_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> var_4923_cast_fp16 = mul(x = mh_q_73_cast_fp16, y = cos_1_cast_fp16)[name = string("op_4923_cast_fp16")];
+            tensor<int32, [4]> var_4928_begin_0 = const()[name = string("op_4928_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_4928_end_0 = const()[name = string("op_4928_end_0"), val = tensor<int32, [4]>([1, 16, 64, 1])];
+            tensor<bool, [4]> var_4928_end_mask_0 = const()[name = string("op_4928_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 16, 64, 1]> var_4928_cast_fp16 = slice_by_index(begin = var_4928_begin_0, end = var_4928_end_0, end_mask = var_4928_end_mask_0, x = mh_q_73_cast_fp16)[name = string("op_4928_cast_fp16")];
+            tensor<int32, [4]> var_4934_begin_0 = const()[name = string("op_4934_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_4934_end_0 = const()[name = string("op_4934_end_0"), val = tensor<int32, [4]>([1, 16, 128, 1])];
+            tensor<bool, [4]> var_4934_end_mask_0 = const()[name = string("op_4934_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 16, 64, 1]> var_4934_cast_fp16 = slice_by_index(begin = var_4934_begin_0, end = var_4934_end_0, end_mask = var_4934_end_mask_0, x = mh_q_73_cast_fp16)[name = string("op_4934_cast_fp16")];
+            fp16 const_293_promoted_to_fp16 = const()[name = string("const_293_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 16, 64, 1]> var_4936_cast_fp16 = mul(x = var_4934_cast_fp16, y = const_293_promoted_to_fp16)[name = string("op_4936_cast_fp16")];
+            bool var_4938_interleave_0 = const()[name = string("op_4938_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 1]> var_4938_cast_fp16 = concat(axis = var_4816, interleave = var_4938_interleave_0, values = (var_4936_cast_fp16, var_4928_cast_fp16))[name = string("op_4938_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> var_4939_cast_fp16 = mul(x = var_4938_cast_fp16, y = sin_1_cast_fp16)[name = string("op_4939_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> mh_q_75_cast_fp16 = add(x = var_4923_cast_fp16, y = var_4939_cast_fp16)[name = string("mh_q_75_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> var_4941_cast_fp16 = mul(x = mh_k_49_cast_fp16, y = cos_1_cast_fp16)[name = string("op_4941_cast_fp16")];
+            tensor<int32, [4]> var_4946_begin_0 = const()[name = string("op_4946_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_4946_end_0 = const()[name = string("op_4946_end_0"), val = tensor<int32, [4]>([1, 8, 64, 1])];
+            tensor<bool, [4]> var_4946_end_mask_0 = const()[name = string("op_4946_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 8, 64, 1]> var_4946_cast_fp16 = slice_by_index(begin = var_4946_begin_0, end = var_4946_end_0, end_mask = var_4946_end_mask_0, x = mh_k_49_cast_fp16)[name = string("op_4946_cast_fp16")];
+            tensor<int32, [4]> var_4952_begin_0 = const()[name = string("op_4952_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_4952_end_0 = const()[name = string("op_4952_end_0"), val = tensor<int32, [4]>([1, 8, 128, 1])];
+            tensor<bool, [4]> var_4952_end_mask_0 = const()[name = string("op_4952_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 8, 64, 1]> var_4952_cast_fp16 = slice_by_index(begin = var_4952_begin_0, end = var_4952_end_0, end_mask = var_4952_end_mask_0, x = mh_k_49_cast_fp16)[name = string("op_4952_cast_fp16")];
+            fp16 const_296_promoted_to_fp16 = const()[name = string("const_296_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 64, 1]> var_4954_cast_fp16 = mul(x = var_4952_cast_fp16, y = const_296_promoted_to_fp16)[name = string("op_4954_cast_fp16")];
+            bool var_4956_interleave_0 = const()[name = string("op_4956_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 128, 1]> var_4956_cast_fp16 = concat(axis = var_4816, interleave = var_4956_interleave_0, values = (var_4954_cast_fp16, var_4946_cast_fp16))[name = string("op_4956_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> var_4957_cast_fp16 = mul(x = var_4956_cast_fp16, y = sin_1_cast_fp16)[name = string("op_4957_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> mh_k_51_cast_fp16 = add(x = var_4941_cast_fp16, y = var_4957_cast_fp16)[name = string("mh_k_51_cast_fp16")];
+            tensor<int32, [4]> var_4961 = const()[name = string("op_4961"), val = tensor<int32, [4]>([1, 1024, 1, 1])];
+            tensor<fp16, [1, 1024, 1, 1]> current_key_51_cast_fp16 = reshape(shape = var_4961, x = mh_k_51_cast_fp16)[name = string("current_key_51_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_4968_cast_fp16 = mul(x = var_101_cast_fp16_12, y = var_323_cast_fp16)[name = string("op_4968_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_4969_cast_fp16 = mul(x = current_key_51_cast_fp16, y = var_321_cast_fp16)[name = string("op_4969_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> key_75_cast_fp16 = add(x = var_4968_cast_fp16, y = var_4969_cast_fp16)[name = string("key_75_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_4972_cast_fp16 = mul(x = var_132_cast_fp16_12, y = var_323_cast_fp16)[name = string("op_4972_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_4973_cast_fp16 = mul(x = current_value_25_cast_fp16, y = var_321_cast_fp16)[name = string("op_4973_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> value_49_cast_fp16 = add(x = var_4972_cast_fp16, y = var_4973_cast_fp16)[name = string("value_49_cast_fp16")];
+            tensor<int32, [4]> var_4977 = const()[name = string("op_4977"), val = tensor<int32, [4]>([1, 8, 128, 256])];
+            tensor<fp16, [1, 8, 128, 256]> key_heads_49_cast_fp16 = reshape(shape = var_4977, x = key_75_cast_fp16)[name = string("key_heads_49_cast_fp16")];
+            tensor<int32, [4]> var_4979 = const()[name = string("op_4979"), val = tensor<int32, [4]>([1, 8, 128, 256])];
+            tensor<fp16, [1, 8, 128, 256]> value_heads_49_cast_fp16 = reshape(shape = var_4979, x = value_49_cast_fp16)[name = string("value_heads_49_cast_fp16")];
+            tensor<int32, [4]> var_4982_begin_0 = const()[name = string("op_4982_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_4982_end_0 = const()[name = string("op_4982_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_4982_end_mask_0 = const()[name = string("op_4982_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_4982_cast_fp16 = slice_by_index(begin = var_4982_begin_0, end = var_4982_end_0, end_mask = var_4982_end_mask_0, x = key_heads_49_cast_fp16)[name = string("op_4982_cast_fp16")];
+            tensor<int32, [4]> var_4986_begin_0 = const()[name = string("op_4986_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_4986_end_0 = const()[name = string("op_4986_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_4986_end_mask_0 = const()[name = string("op_4986_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_4986_cast_fp16 = slice_by_index(begin = var_4986_begin_0, end = var_4986_end_0, end_mask = var_4986_end_mask_0, x = value_heads_49_cast_fp16)[name = string("op_4986_cast_fp16")];
+            tensor<int32, [4]> var_4998_begin_0 = const()[name = string("op_4998_begin_0"), val = tensor<int32, [4]>([0, 1, 0, 0])];
+            tensor<int32, [4]> var_4998_end_0 = const()[name = string("op_4998_end_0"), val = tensor<int32, [4]>([1, 2, 128, 256])];
+            tensor<bool, [4]> var_4998_end_mask_0 = const()[name = string("op_4998_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_4998_cast_fp16 = slice_by_index(begin = var_4998_begin_0, end = var_4998_end_0, end_mask = var_4998_end_mask_0, x = key_heads_49_cast_fp16)[name = string("op_4998_cast_fp16")];
+            tensor<int32, [4]> var_5002_begin_0 = const()[name = string("op_5002_begin_0"), val = tensor<int32, [4]>([0, 1, 0, 0])];
+            tensor<int32, [4]> var_5002_end_0 = const()[name = string("op_5002_end_0"), val = tensor<int32, [4]>([1, 2, 128, 256])];
+            tensor<bool, [4]> var_5002_end_mask_0 = const()[name = string("op_5002_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_5002_cast_fp16 = slice_by_index(begin = var_5002_begin_0, end = var_5002_end_0, end_mask = var_5002_end_mask_0, x = value_heads_49_cast_fp16)[name = string("op_5002_cast_fp16")];
+            tensor<int32, [4]> var_5014_begin_0 = const()[name = string("op_5014_begin_0"), val = tensor<int32, [4]>([0, 2, 0, 0])];
+            tensor<int32, [4]> var_5014_end_0 = const()[name = string("op_5014_end_0"), val = tensor<int32, [4]>([1, 3, 128, 256])];
+            tensor<bool, [4]> var_5014_end_mask_0 = const()[name = string("op_5014_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_5014_cast_fp16 = slice_by_index(begin = var_5014_begin_0, end = var_5014_end_0, end_mask = var_5014_end_mask_0, x = key_heads_49_cast_fp16)[name = string("op_5014_cast_fp16")];
+            tensor<int32, [4]> var_5018_begin_0 = const()[name = string("op_5018_begin_0"), val = tensor<int32, [4]>([0, 2, 0, 0])];
+            tensor<int32, [4]> var_5018_end_0 = const()[name = string("op_5018_end_0"), val = tensor<int32, [4]>([1, 3, 128, 256])];
+            tensor<bool, [4]> var_5018_end_mask_0 = const()[name = string("op_5018_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_5018_cast_fp16 = slice_by_index(begin = var_5018_begin_0, end = var_5018_end_0, end_mask = var_5018_end_mask_0, x = value_heads_49_cast_fp16)[name = string("op_5018_cast_fp16")];
+            tensor<int32, [4]> var_5030_begin_0 = const()[name = string("op_5030_begin_0"), val = tensor<int32, [4]>([0, 3, 0, 0])];
+            tensor<int32, [4]> var_5030_end_0 = const()[name = string("op_5030_end_0"), val = tensor<int32, [4]>([1, 4, 128, 256])];
+            tensor<bool, [4]> var_5030_end_mask_0 = const()[name = string("op_5030_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_5030_cast_fp16 = slice_by_index(begin = var_5030_begin_0, end = var_5030_end_0, end_mask = var_5030_end_mask_0, x = key_heads_49_cast_fp16)[name = string("op_5030_cast_fp16")];
+            tensor<int32, [4]> var_5034_begin_0 = const()[name = string("op_5034_begin_0"), val = tensor<int32, [4]>([0, 3, 0, 0])];
+            tensor<int32, [4]> var_5034_end_0 = const()[name = string("op_5034_end_0"), val = tensor<int32, [4]>([1, 4, 128, 256])];
+            tensor<bool, [4]> var_5034_end_mask_0 = const()[name = string("op_5034_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_5034_cast_fp16 = slice_by_index(begin = var_5034_begin_0, end = var_5034_end_0, end_mask = var_5034_end_mask_0, x = value_heads_49_cast_fp16)[name = string("op_5034_cast_fp16")];
+            tensor<int32, [4]> var_5046_begin_0 = const()[name = string("op_5046_begin_0"), val = tensor<int32, [4]>([0, 4, 0, 0])];
+            tensor<int32, [4]> var_5046_end_0 = const()[name = string("op_5046_end_0"), val = tensor<int32, [4]>([1, 5, 128, 256])];
+            tensor<bool, [4]> var_5046_end_mask_0 = const()[name = string("op_5046_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_5046_cast_fp16 = slice_by_index(begin = var_5046_begin_0, end = var_5046_end_0, end_mask = var_5046_end_mask_0, x = key_heads_49_cast_fp16)[name = string("op_5046_cast_fp16")];
+            tensor<int32, [4]> var_5050_begin_0 = const()[name = string("op_5050_begin_0"), val = tensor<int32, [4]>([0, 4, 0, 0])];
+            tensor<int32, [4]> var_5050_end_0 = const()[name = string("op_5050_end_0"), val = tensor<int32, [4]>([1, 5, 128, 256])];
+            tensor<bool, [4]> var_5050_end_mask_0 = const()[name = string("op_5050_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_5050_cast_fp16 = slice_by_index(begin = var_5050_begin_0, end = var_5050_end_0, end_mask = var_5050_end_mask_0, x = value_heads_49_cast_fp16)[name = string("op_5050_cast_fp16")];
+            tensor<int32, [4]> var_5062_begin_0 = const()[name = string("op_5062_begin_0"), val = tensor<int32, [4]>([0, 5, 0, 0])];
+            tensor<int32, [4]> var_5062_end_0 = const()[name = string("op_5062_end_0"), val = tensor<int32, [4]>([1, 6, 128, 256])];
+            tensor<bool, [4]> var_5062_end_mask_0 = const()[name = string("op_5062_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_5062_cast_fp16 = slice_by_index(begin = var_5062_begin_0, end = var_5062_end_0, end_mask = var_5062_end_mask_0, x = key_heads_49_cast_fp16)[name = string("op_5062_cast_fp16")];
+            tensor<int32, [4]> var_5066_begin_0 = const()[name = string("op_5066_begin_0"), val = tensor<int32, [4]>([0, 5, 0, 0])];
+            tensor<int32, [4]> var_5066_end_0 = const()[name = string("op_5066_end_0"), val = tensor<int32, [4]>([1, 6, 128, 256])];
+            tensor<bool, [4]> var_5066_end_mask_0 = const()[name = string("op_5066_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_5066_cast_fp16 = slice_by_index(begin = var_5066_begin_0, end = var_5066_end_0, end_mask = var_5066_end_mask_0, x = value_heads_49_cast_fp16)[name = string("op_5066_cast_fp16")];
+            tensor<int32, [4]> var_5078_begin_0 = const()[name = string("op_5078_begin_0"), val = tensor<int32, [4]>([0, 6, 0, 0])];
+            tensor<int32, [4]> var_5078_end_0 = const()[name = string("op_5078_end_0"), val = tensor<int32, [4]>([1, 7, 128, 256])];
+            tensor<bool, [4]> var_5078_end_mask_0 = const()[name = string("op_5078_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_5078_cast_fp16 = slice_by_index(begin = var_5078_begin_0, end = var_5078_end_0, end_mask = var_5078_end_mask_0, x = key_heads_49_cast_fp16)[name = string("op_5078_cast_fp16")];
+            tensor<int32, [4]> var_5082_begin_0 = const()[name = string("op_5082_begin_0"), val = tensor<int32, [4]>([0, 6, 0, 0])];
+            tensor<int32, [4]> var_5082_end_0 = const()[name = string("op_5082_end_0"), val = tensor<int32, [4]>([1, 7, 128, 256])];
+            tensor<bool, [4]> var_5082_end_mask_0 = const()[name = string("op_5082_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_5082_cast_fp16 = slice_by_index(begin = var_5082_begin_0, end = var_5082_end_0, end_mask = var_5082_end_mask_0, x = value_heads_49_cast_fp16)[name = string("op_5082_cast_fp16")];
+            tensor<int32, [4]> var_5094_begin_0 = const()[name = string("op_5094_begin_0"), val = tensor<int32, [4]>([0, 7, 0, 0])];
+            tensor<int32, [4]> var_5094_end_0 = const()[name = string("op_5094_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_5094_end_mask_0 = const()[name = string("op_5094_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_5094_cast_fp16 = slice_by_index(begin = var_5094_begin_0, end = var_5094_end_0, end_mask = var_5094_end_mask_0, x = key_heads_49_cast_fp16)[name = string("op_5094_cast_fp16")];
+            tensor<int32, [4]> var_5098_begin_0 = const()[name = string("op_5098_begin_0"), val = tensor<int32, [4]>([0, 7, 0, 0])];
+            tensor<int32, [4]> var_5098_end_0 = const()[name = string("op_5098_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_5098_end_mask_0 = const()[name = string("op_5098_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_5098_cast_fp16 = slice_by_index(begin = var_5098_begin_0, end = var_5098_end_0, end_mask = var_5098_end_mask_0, x = value_heads_49_cast_fp16)[name = string("op_5098_cast_fp16")];
+            bool key_heads_51_interleave_0 = const()[name = string("key_heads_51_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 256]> key_heads_51_cast_fp16 = concat(axis = var_4824, interleave = key_heads_51_interleave_0, values = (var_4982_cast_fp16, var_4982_cast_fp16, var_4998_cast_fp16, var_4998_cast_fp16, var_5014_cast_fp16, var_5014_cast_fp16, var_5030_cast_fp16, var_5030_cast_fp16, var_5046_cast_fp16, var_5046_cast_fp16, var_5062_cast_fp16, var_5062_cast_fp16, var_5078_cast_fp16, var_5078_cast_fp16, var_5094_cast_fp16, var_5094_cast_fp16))[name = string("key_heads_51_cast_fp16")];
+            bool value_heads_51_interleave_0 = const()[name = string("value_heads_51_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 256]> value_heads_51_cast_fp16 = concat(axis = var_4824, interleave = value_heads_51_interleave_0, values = (var_4986_cast_fp16, var_4986_cast_fp16, var_5002_cast_fp16, var_5002_cast_fp16, var_5018_cast_fp16, var_5018_cast_fp16, var_5034_cast_fp16, var_5034_cast_fp16, var_5050_cast_fp16, var_5050_cast_fp16, var_5066_cast_fp16, var_5066_cast_fp16, var_5082_cast_fp16, var_5082_cast_fp16, var_5098_cast_fp16, var_5098_cast_fp16))[name = string("value_heads_51_cast_fp16")];
+            fp16 var_5121_to_fp16 = const()[name = string("op_5121_to_fp16"), val = fp16(0x1.6ap-4)];
+            tensor<fp16, [1, 16, 128, 1]> var_5122_cast_fp16 = mul(x = mh_q_75_cast_fp16, y = var_5121_to_fp16)[name = string("op_5122_cast_fp16")];
+            bool mh_w_49_transpose_x_0 = const()[name = string("mh_w_49_transpose_x_0"), val = bool(true)];
+            bool mh_w_49_transpose_y_0 = const()[name = string("mh_w_49_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 1, 256]> mh_w_49_cast_fp16 = matmul(transpose_x = mh_w_49_transpose_x_0, transpose_y = mh_w_49_transpose_y_0, x = var_5122_cast_fp16, y = key_heads_51_cast_fp16)[name = string("mh_w_49_cast_fp16")];
+            tensor<fp16, [1, 16, 1, 256]> mh_w_51_cast_fp16 = add(x = mh_w_49_cast_fp16, y = var_487_cast_fp16)[name = string("mh_w_51_cast_fp16")];
+            tensor<fp16, [1, 16, 1, 256]> var_5134_cast_fp16 = softmax(axis = var_4806, x = mh_w_51_cast_fp16)[name = string("op_5134_cast_fp16")];
+            bool attn_25_transpose_x_0 = const()[name = string("attn_25_transpose_x_0"), val = bool(false)];
+            bool attn_25_transpose_y_0 = const()[name = string("attn_25_transpose_y_0"), val = bool(true)];
+            tensor<fp16, [1, 16, 128, 1]> attn_25_cast_fp16 = matmul(transpose_x = attn_25_transpose_x_0, transpose_y = attn_25_transpose_y_0, x = value_heads_51_cast_fp16, y = var_5134_cast_fp16)[name = string("attn_25_cast_fp16")];
+            tensor<int32, [4]> var_5139 = const()[name = string("op_5139"), val = tensor<int32, [4]>([1, -1, 1, 1])];
+            tensor<fp16, [1, 2048, 1, 1]> input_97_cast_fp16 = reshape(shape = var_5139, x = attn_25_cast_fp16)[name = string("input_97_cast_fp16")];
+            string obj_107_pad_type_0 = const()[name = string("obj_107_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> obj_107_strides_0 = const()[name = string("obj_107_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> obj_107_pad_0 = const()[name = string("obj_107_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> obj_107_dilations_0 = const()[name = string("obj_107_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 obj_107_groups_0 = const()[name = string("obj_107_groups_0"), val = int32(1)];
+            tensor<fp16, [2048, 2048, 1, 1]> layers_12_self_attn_o_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [2048, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(612609920))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(616804288))))[name = string("layers_12_self_attn_o_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1, 1]> obj_107_cast_fp16 = conv(bias = layers_0_self_attn_q_proj_bias_to_fp16, dilations = obj_107_dilations_0, groups = obj_107_groups_0, pad = obj_107_pad_0, pad_type = obj_107_pad_type_0, strides = obj_107_strides_0, weight = layers_12_self_attn_o_proj_weight_to_fp16_palettized, x = input_97_cast_fp16)[name = string("obj_107_cast_fp16")];
+            tensor<fp16, [1, 2048, 1, 1]> inputs_101_cast_fp16 = add(x = inputs_95_cast_fp16, y = obj_107_cast_fp16)[name = string("inputs_101_cast_fp16")];
+            tensor<fp16, [1, 2048, 1, 1]> inputs_sq_103_cast_fp16 = mul(x = inputs_101_cast_fp16, y = inputs_101_cast_fp16)[name = string("inputs_sq_103_cast_fp16")];
+            tensor<int32, [1]> variance_103_axes_0 = const()[name = string("variance_103_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_103_keep_dims_0 = const()[name = string("variance_103_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 1, 1, 1]> variance_103_cast_fp16 = reduce_mean(axes = variance_103_axes_0, keep_dims = variance_103_keep_dims_0, x = inputs_sq_103_cast_fp16)[name = string("variance_103_cast_fp16")];
+            fp16 var_5157_to_fp16 = const()[name = string("op_5157_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 1, 1]> var_5158_cast_fp16 = add(x = variance_103_cast_fp16, y = var_5157_to_fp16)[name = string("op_5158_cast_fp16")];
+            fp32 var_5159_epsilon_0 = const()[name = string("op_5159_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 1, 1, 1]> var_5159_cast_fp16 = rsqrt(epsilon = var_5159_epsilon_0, x = var_5158_cast_fp16)[name = string("op_5159_cast_fp16")];
+            tensor<fp16, [1, 2048, 1, 1]> hidden_states_127_cast_fp16 = mul(x = inputs_101_cast_fp16, y = var_5159_cast_fp16)[name = string("hidden_states_127_cast_fp16")];
+            tensor<fp16, [1, 2048, 1, 1]> w_103_to_fp16 = const()[name = string("w_103_to_fp16"), val = tensor<fp16, [1, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(616804864)))];
+            tensor<fp16, [1, 2048, 1, 1]> input_99_cast_fp16 = mul(x = w_103_to_fp16, y = hidden_states_127_cast_fp16)[name = string("input_99_cast_fp16")];
+            string input_101_pad_type_0 = const()[name = string("input_101_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> input_101_strides_0 = const()[name = string("input_101_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> input_101_pad_0 = const()[name = string("input_101_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> input_101_dilations_0 = const()[name = string("input_101_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 input_101_groups_0 = const()[name = string("input_101_groups_0"), val = int32(1)];
+            tensor<fp16, [6144, 2048, 1, 1]> layers_12_mlp_gate_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [6144, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(616809024))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(629392000))))[name = string("layers_12_mlp_gate_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 6144, 1, 1]> input_101_cast_fp16 = conv(dilations = input_101_dilations_0, groups = input_101_groups_0, pad = input_101_pad_0, pad_type = input_101_pad_type_0, strides = input_101_strides_0, weight = layers_12_mlp_gate_proj_weight_to_fp16_palettized, x = input_99_cast_fp16)[name = string("input_101_cast_fp16")];
+            tensor<fp16, [1, 6144, 1, 1]> var_5173_cast_fp16 = silu(x = input_101_cast_fp16)[name = string("op_5173_cast_fp16")];
+            string var_5179_pad_type_0 = const()[name = string("op_5179_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_5179_strides_0 = const()[name = string("op_5179_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_5179_pad_0 = const()[name = string("op_5179_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_5179_dilations_0 = const()[name = string("op_5179_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_5179_groups_0 = const()[name = string("op_5179_groups_0"), val = int32(1)];
+            tensor<fp16, [6144, 2048, 1, 1]> layers_12_mlp_up_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [6144, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(629392576))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(641975552))))[name = string("layers_12_mlp_up_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 6144, 1, 1]> var_5179_cast_fp16 = conv(dilations = var_5179_dilations_0, groups = var_5179_groups_0, pad = var_5179_pad_0, pad_type = var_5179_pad_type_0, strides = var_5179_strides_0, weight = layers_12_mlp_up_proj_weight_to_fp16_palettized, x = input_99_cast_fp16)[name = string("op_5179_cast_fp16")];
+            tensor<fp16, [1, 6144, 1, 1]> input_103_cast_fp16 = mul(x = var_5173_cast_fp16, y = var_5179_cast_fp16)[name = string("input_103_cast_fp16")];
+            string hidden_states_129_pad_type_0 = const()[name = string("hidden_states_129_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> hidden_states_129_strides_0 = const()[name = string("hidden_states_129_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> hidden_states_129_pad_0 = const()[name = string("hidden_states_129_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> hidden_states_129_dilations_0 = const()[name = string("hidden_states_129_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 hidden_states_129_groups_0 = const()[name = string("hidden_states_129_groups_0"), val = int32(1)];
+            tensor<fp16, [2048, 6144, 1, 1]> layers_12_mlp_down_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [2048, 6144, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(641976128))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(654559104))))[name = string("layers_12_mlp_down_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1, 1]> hidden_states_129_cast_fp16 = conv(dilations = hidden_states_129_dilations_0, groups = hidden_states_129_groups_0, pad = hidden_states_129_pad_0, pad_type = hidden_states_129_pad_type_0, strides = hidden_states_129_strides_0, weight = layers_12_mlp_down_proj_weight_to_fp16_palettized, x = input_103_cast_fp16)[name = string("hidden_states_129_cast_fp16")];
+            tensor<fp16, [1, 2048, 1, 1]> inputs_103_cast_fp16 = add(x = inputs_101_cast_fp16, y = hidden_states_129_cast_fp16)[name = string("inputs_103_cast_fp16")];
+            int32 var_5193 = const()[name = string("op_5193"), val = int32(3)];
+            int32 var_5203 = const()[name = string("op_5203"), val = int32(-2)];
+            int32 var_5211 = const()[name = string("op_5211"), val = int32(1)];
+            tensor<fp16, [1, 2048, 1, 1]> inputs_sq_105_cast_fp16 = mul(x = inputs_103_cast_fp16, y = inputs_103_cast_fp16)[name = string("inputs_sq_105_cast_fp16")];
+            tensor<int32, [1]> variance_105_axes_0 = const()[name = string("variance_105_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_105_keep_dims_0 = const()[name = string("variance_105_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 1, 1, 1]> variance_105_cast_fp16 = reduce_mean(axes = variance_105_axes_0, keep_dims = variance_105_keep_dims_0, x = inputs_sq_105_cast_fp16)[name = string("variance_105_cast_fp16")];
+            fp16 var_5223_to_fp16 = const()[name = string("op_5223_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 1, 1]> var_5224_cast_fp16 = add(x = variance_105_cast_fp16, y = var_5223_to_fp16)[name = string("op_5224_cast_fp16")];
+            fp32 var_5225_epsilon_0 = const()[name = string("op_5225_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 1, 1, 1]> var_5225_cast_fp16 = rsqrt(epsilon = var_5225_epsilon_0, x = var_5224_cast_fp16)[name = string("op_5225_cast_fp16")];
+            tensor<fp16, [1, 2048, 1, 1]> hidden_states_131_cast_fp16 = mul(x = inputs_103_cast_fp16, y = var_5225_cast_fp16)[name = string("hidden_states_131_cast_fp16")];
+            tensor<fp16, [1, 2048, 1, 1]> w_105_to_fp16 = const()[name = string("w_105_to_fp16"), val = tensor<fp16, [1, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(654559680)))];
+            tensor<fp16, [1, 2048, 1, 1]> obj_109_cast_fp16 = mul(x = w_105_to_fp16, y = hidden_states_131_cast_fp16)[name = string("obj_109_cast_fp16")];
+            string query_79_pad_type_0 = const()[name = string("query_79_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> query_79_strides_0 = const()[name = string("query_79_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> query_79_pad_0 = const()[name = string("query_79_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> query_79_dilations_0 = const()[name = string("query_79_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 query_79_groups_0 = const()[name = string("query_79_groups_0"), val = int32(1)];
+            tensor<fp16, [2048, 2048, 1, 1]> layers_13_self_attn_q_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [2048, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(654563840))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(658758208))))[name = string("layers_13_self_attn_q_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1, 1]> query_79_cast_fp16 = conv(bias = layers_0_self_attn_q_proj_bias_to_fp16, dilations = query_79_dilations_0, groups = query_79_groups_0, pad = query_79_pad_0, pad_type = query_79_pad_type_0, strides = query_79_strides_0, weight = layers_13_self_attn_q_proj_weight_to_fp16_palettized, x = obj_109_cast_fp16)[name = string("query_79_cast_fp16")];
+            string current_key_53_pad_type_0 = const()[name = string("current_key_53_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> current_key_53_strides_0 = const()[name = string("current_key_53_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> current_key_53_pad_0 = const()[name = string("current_key_53_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> current_key_53_dilations_0 = const()[name = string("current_key_53_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 current_key_53_groups_0 = const()[name = string("current_key_53_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 2048, 1, 1]> layers_13_self_attn_k_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(658758784))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(660856000))))[name = string("layers_13_self_attn_k_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> current_key_53_cast_fp16 = conv(dilations = current_key_53_dilations_0, groups = current_key_53_groups_0, pad = current_key_53_pad_0, pad_type = current_key_53_pad_type_0, strides = current_key_53_strides_0, weight = layers_13_self_attn_k_proj_weight_to_fp16_palettized, x = obj_109_cast_fp16)[name = string("current_key_53_cast_fp16")];
+            string current_value_27_pad_type_0 = const()[name = string("current_value_27_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> current_value_27_strides_0 = const()[name = string("current_value_27_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> current_value_27_pad_0 = const()[name = string("current_value_27_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> current_value_27_dilations_0 = const()[name = string("current_value_27_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 current_value_27_groups_0 = const()[name = string("current_value_27_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 2048, 1, 1]> layers_13_self_attn_v_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(660856576))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(662953792))))[name = string("layers_13_self_attn_v_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> current_value_27_cast_fp16 = conv(bias = layers_0_self_attn_v_proj_bias_to_fp16, dilations = current_value_27_dilations_0, groups = current_value_27_groups_0, pad = current_value_27_pad_0, pad_type = current_value_27_pad_type_0, strides = current_value_27_strides_0, weight = layers_13_self_attn_v_proj_weight_to_fp16_palettized, x = obj_109_cast_fp16)[name = string("current_value_27_cast_fp16")];
+            tensor<int32, [4]> var_5262 = const()[name = string("op_5262"), val = tensor<int32, [4]>([16, 128, 1, 1])];
+            tensor<fp16, [16, 128, 1, 1]> inputs_105_cast_fp16 = reshape(shape = var_5262, x = query_79_cast_fp16)[name = string("inputs_105_cast_fp16")];
+            tensor<fp16, [16, 128, 1, 1]> inputs_sq_107_cast_fp16 = mul(x = inputs_105_cast_fp16, y = inputs_105_cast_fp16)[name = string("inputs_sq_107_cast_fp16")];
+            tensor<int32, [1]> variance_107_axes_0 = const()[name = string("variance_107_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_107_keep_dims_0 = const()[name = string("variance_107_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [16, 1, 1, 1]> variance_107_cast_fp16 = reduce_mean(axes = variance_107_axes_0, keep_dims = variance_107_keep_dims_0, x = inputs_sq_107_cast_fp16)[name = string("variance_107_cast_fp16")];
+            fp16 var_5268_to_fp16 = const()[name = string("op_5268_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [16, 1, 1, 1]> var_5269_cast_fp16 = add(x = variance_107_cast_fp16, y = var_5268_to_fp16)[name = string("op_5269_cast_fp16")];
+            fp32 var_5270_epsilon_0 = const()[name = string("op_5270_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [16, 1, 1, 1]> var_5270_cast_fp16 = rsqrt(epsilon = var_5270_epsilon_0, x = var_5269_cast_fp16)[name = string("op_5270_cast_fp16")];
+            tensor<fp16, [16, 128, 1, 1]> hidden_states_133_cast_fp16 = mul(x = inputs_105_cast_fp16, y = var_5270_cast_fp16)[name = string("hidden_states_133_cast_fp16")];
+            tensor<fp16, [1, 128, 1, 1]> w_107_to_fp16 = const()[name = string("w_107_to_fp16"), val = tensor<fp16, [1, 128, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(662954368)))];
+            tensor<fp16, [16, 128, 1, 1]> query_normed_27_cast_fp16 = mul(x = w_107_to_fp16, y = hidden_states_133_cast_fp16)[name = string("query_normed_27_cast_fp16")];
+            tensor<int32, [4]> var_5278 = const()[name = string("op_5278"), val = tensor<int32, [4]>([8, 128, 1, 1])];
+            tensor<fp16, [8, 128, 1, 1]> inputs_107_cast_fp16 = reshape(shape = var_5278, x = current_key_53_cast_fp16)[name = string("inputs_107_cast_fp16")];
+            tensor<fp16, [8, 128, 1, 1]> inputs_sq_109_cast_fp16 = mul(x = inputs_107_cast_fp16, y = inputs_107_cast_fp16)[name = string("inputs_sq_109_cast_fp16")];
+            tensor<int32, [1]> variance_109_axes_0 = const()[name = string("variance_109_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_109_keep_dims_0 = const()[name = string("variance_109_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [8, 1, 1, 1]> variance_109_cast_fp16 = reduce_mean(axes = variance_109_axes_0, keep_dims = variance_109_keep_dims_0, x = inputs_sq_109_cast_fp16)[name = string("variance_109_cast_fp16")];
+            fp16 var_5284_to_fp16 = const()[name = string("op_5284_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [8, 1, 1, 1]> var_5285_cast_fp16 = add(x = variance_109_cast_fp16, y = var_5284_to_fp16)[name = string("op_5285_cast_fp16")];
+            fp32 var_5286_epsilon_0 = const()[name = string("op_5286_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [8, 1, 1, 1]> var_5286_cast_fp16 = rsqrt(epsilon = var_5286_epsilon_0, x = var_5285_cast_fp16)[name = string("op_5286_cast_fp16")];
+            tensor<fp16, [8, 128, 1, 1]> hidden_states_135_cast_fp16 = mul(x = inputs_107_cast_fp16, y = var_5286_cast_fp16)[name = string("hidden_states_135_cast_fp16")];
+            tensor<fp16, [1, 128, 1, 1]> w_109_to_fp16 = const()[name = string("w_109_to_fp16"), val = tensor<fp16, [1, 128, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(662954688)))];
+            tensor<fp16, [8, 128, 1, 1]> current_key_normed_27_cast_fp16 = mul(x = w_109_to_fp16, y = hidden_states_135_cast_fp16)[name = string("current_key_normed_27_cast_fp16")];
+            tensor<int32, [4]> var_5304 = const()[name = string("op_5304"), val = tensor<int32, [4]>([1, 16, 128, -1])];
+            tensor<fp16, [1, 16, 128, 1]> mh_q_79_cast_fp16 = reshape(shape = var_5304, x = query_normed_27_cast_fp16)[name = string("mh_q_79_cast_fp16")];
+            tensor<int32, [4]> var_5306 = const()[name = string("op_5306"), val = tensor<int32, [4]>([1, 8, 128, -1])];
+            tensor<fp16, [1, 8, 128, 1]> mh_k_53_cast_fp16 = reshape(shape = var_5306, x = current_key_normed_27_cast_fp16)[name = string("mh_k_53_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> var_5310_cast_fp16 = mul(x = mh_q_79_cast_fp16, y = cos_1_cast_fp16)[name = string("op_5310_cast_fp16")];
+            tensor<int32, [4]> var_5315_begin_0 = const()[name = string("op_5315_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_5315_end_0 = const()[name = string("op_5315_end_0"), val = tensor<int32, [4]>([1, 16, 64, 1])];
+            tensor<bool, [4]> var_5315_end_mask_0 = const()[name = string("op_5315_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 16, 64, 1]> var_5315_cast_fp16 = slice_by_index(begin = var_5315_begin_0, end = var_5315_end_0, end_mask = var_5315_end_mask_0, x = mh_q_79_cast_fp16)[name = string("op_5315_cast_fp16")];
+            tensor<int32, [4]> var_5321_begin_0 = const()[name = string("op_5321_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_5321_end_0 = const()[name = string("op_5321_end_0"), val = tensor<int32, [4]>([1, 16, 128, 1])];
+            tensor<bool, [4]> var_5321_end_mask_0 = const()[name = string("op_5321_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 16, 64, 1]> var_5321_cast_fp16 = slice_by_index(begin = var_5321_begin_0, end = var_5321_end_0, end_mask = var_5321_end_mask_0, x = mh_q_79_cast_fp16)[name = string("op_5321_cast_fp16")];
+            fp16 const_316_promoted_to_fp16 = const()[name = string("const_316_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 16, 64, 1]> var_5323_cast_fp16 = mul(x = var_5321_cast_fp16, y = const_316_promoted_to_fp16)[name = string("op_5323_cast_fp16")];
+            bool var_5325_interleave_0 = const()[name = string("op_5325_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 1]> var_5325_cast_fp16 = concat(axis = var_5203, interleave = var_5325_interleave_0, values = (var_5323_cast_fp16, var_5315_cast_fp16))[name = string("op_5325_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> var_5326_cast_fp16 = mul(x = var_5325_cast_fp16, y = sin_1_cast_fp16)[name = string("op_5326_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> mh_q_81_cast_fp16 = add(x = var_5310_cast_fp16, y = var_5326_cast_fp16)[name = string("mh_q_81_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> var_5328_cast_fp16 = mul(x = mh_k_53_cast_fp16, y = cos_1_cast_fp16)[name = string("op_5328_cast_fp16")];
+            tensor<int32, [4]> var_5333_begin_0 = const()[name = string("op_5333_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_5333_end_0 = const()[name = string("op_5333_end_0"), val = tensor<int32, [4]>([1, 8, 64, 1])];
+            tensor<bool, [4]> var_5333_end_mask_0 = const()[name = string("op_5333_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 8, 64, 1]> var_5333_cast_fp16 = slice_by_index(begin = var_5333_begin_0, end = var_5333_end_0, end_mask = var_5333_end_mask_0, x = mh_k_53_cast_fp16)[name = string("op_5333_cast_fp16")];
+            tensor<int32, [4]> var_5339_begin_0 = const()[name = string("op_5339_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_5339_end_0 = const()[name = string("op_5339_end_0"), val = tensor<int32, [4]>([1, 8, 128, 1])];
+            tensor<bool, [4]> var_5339_end_mask_0 = const()[name = string("op_5339_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 8, 64, 1]> var_5339_cast_fp16 = slice_by_index(begin = var_5339_begin_0, end = var_5339_end_0, end_mask = var_5339_end_mask_0, x = mh_k_53_cast_fp16)[name = string("op_5339_cast_fp16")];
+            fp16 const_319_promoted_to_fp16 = const()[name = string("const_319_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 64, 1]> var_5341_cast_fp16 = mul(x = var_5339_cast_fp16, y = const_319_promoted_to_fp16)[name = string("op_5341_cast_fp16")];
+            bool var_5343_interleave_0 = const()[name = string("op_5343_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 128, 1]> var_5343_cast_fp16 = concat(axis = var_5203, interleave = var_5343_interleave_0, values = (var_5341_cast_fp16, var_5333_cast_fp16))[name = string("op_5343_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> var_5344_cast_fp16 = mul(x = var_5343_cast_fp16, y = sin_1_cast_fp16)[name = string("op_5344_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> mh_k_55_cast_fp16 = add(x = var_5328_cast_fp16, y = var_5344_cast_fp16)[name = string("mh_k_55_cast_fp16")];
+            tensor<int32, [4]> var_5348 = const()[name = string("op_5348"), val = tensor<int32, [4]>([1, 1024, 1, 1])];
+            tensor<fp16, [1, 1024, 1, 1]> current_key_55_cast_fp16 = reshape(shape = var_5348, x = mh_k_55_cast_fp16)[name = string("current_key_55_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_5355_cast_fp16 = mul(x = var_101_cast_fp16_13, y = var_323_cast_fp16)[name = string("op_5355_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_5356_cast_fp16 = mul(x = current_key_55_cast_fp16, y = var_321_cast_fp16)[name = string("op_5356_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> key_81_cast_fp16 = add(x = var_5355_cast_fp16, y = var_5356_cast_fp16)[name = string("key_81_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_5359_cast_fp16 = mul(x = var_132_cast_fp16_13, y = var_323_cast_fp16)[name = string("op_5359_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_5360_cast_fp16 = mul(x = current_value_27_cast_fp16, y = var_321_cast_fp16)[name = string("op_5360_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> value_53_cast_fp16 = add(x = var_5359_cast_fp16, y = var_5360_cast_fp16)[name = string("value_53_cast_fp16")];
+            tensor<int32, [4]> var_5364 = const()[name = string("op_5364"), val = tensor<int32, [4]>([1, 8, 128, 256])];
+            tensor<fp16, [1, 8, 128, 256]> key_heads_53_cast_fp16 = reshape(shape = var_5364, x = key_81_cast_fp16)[name = string("key_heads_53_cast_fp16")];
+            tensor<int32, [4]> var_5366 = const()[name = string("op_5366"), val = tensor<int32, [4]>([1, 8, 128, 256])];
+            tensor<fp16, [1, 8, 128, 256]> value_heads_53_cast_fp16 = reshape(shape = var_5366, x = value_53_cast_fp16)[name = string("value_heads_53_cast_fp16")];
+            tensor<int32, [4]> var_5369_begin_0 = const()[name = string("op_5369_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_5369_end_0 = const()[name = string("op_5369_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_5369_end_mask_0 = const()[name = string("op_5369_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_5369_cast_fp16 = slice_by_index(begin = var_5369_begin_0, end = var_5369_end_0, end_mask = var_5369_end_mask_0, x = key_heads_53_cast_fp16)[name = string("op_5369_cast_fp16")];
+            tensor<int32, [4]> var_5373_begin_0 = const()[name = string("op_5373_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_5373_end_0 = const()[name = string("op_5373_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_5373_end_mask_0 = const()[name = string("op_5373_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_5373_cast_fp16 = slice_by_index(begin = var_5373_begin_0, end = var_5373_end_0, end_mask = var_5373_end_mask_0, x = value_heads_53_cast_fp16)[name = string("op_5373_cast_fp16")];
+            tensor<int32, [4]> var_5385_begin_0 = const()[name = string("op_5385_begin_0"), val = tensor<int32, [4]>([0, 1, 0, 0])];
+            tensor<int32, [4]> var_5385_end_0 = const()[name = string("op_5385_end_0"), val = tensor<int32, [4]>([1, 2, 128, 256])];
+            tensor<bool, [4]> var_5385_end_mask_0 = const()[name = string("op_5385_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_5385_cast_fp16 = slice_by_index(begin = var_5385_begin_0, end = var_5385_end_0, end_mask = var_5385_end_mask_0, x = key_heads_53_cast_fp16)[name = string("op_5385_cast_fp16")];
+            tensor<int32, [4]> var_5389_begin_0 = const()[name = string("op_5389_begin_0"), val = tensor<int32, [4]>([0, 1, 0, 0])];
+            tensor<int32, [4]> var_5389_end_0 = const()[name = string("op_5389_end_0"), val = tensor<int32, [4]>([1, 2, 128, 256])];
+            tensor<bool, [4]> var_5389_end_mask_0 = const()[name = string("op_5389_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_5389_cast_fp16 = slice_by_index(begin = var_5389_begin_0, end = var_5389_end_0, end_mask = var_5389_end_mask_0, x = value_heads_53_cast_fp16)[name = string("op_5389_cast_fp16")];
+            tensor<int32, [4]> var_5401_begin_0 = const()[name = string("op_5401_begin_0"), val = tensor<int32, [4]>([0, 2, 0, 0])];
+            tensor<int32, [4]> var_5401_end_0 = const()[name = string("op_5401_end_0"), val = tensor<int32, [4]>([1, 3, 128, 256])];
+            tensor<bool, [4]> var_5401_end_mask_0 = const()[name = string("op_5401_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_5401_cast_fp16 = slice_by_index(begin = var_5401_begin_0, end = var_5401_end_0, end_mask = var_5401_end_mask_0, x = key_heads_53_cast_fp16)[name = string("op_5401_cast_fp16")];
+            tensor<int32, [4]> var_5405_begin_0 = const()[name = string("op_5405_begin_0"), val = tensor<int32, [4]>([0, 2, 0, 0])];
+            tensor<int32, [4]> var_5405_end_0 = const()[name = string("op_5405_end_0"), val = tensor<int32, [4]>([1, 3, 128, 256])];
+            tensor<bool, [4]> var_5405_end_mask_0 = const()[name = string("op_5405_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_5405_cast_fp16 = slice_by_index(begin = var_5405_begin_0, end = var_5405_end_0, end_mask = var_5405_end_mask_0, x = value_heads_53_cast_fp16)[name = string("op_5405_cast_fp16")];
+            tensor<int32, [4]> var_5417_begin_0 = const()[name = string("op_5417_begin_0"), val = tensor<int32, [4]>([0, 3, 0, 0])];
+            tensor<int32, [4]> var_5417_end_0 = const()[name = string("op_5417_end_0"), val = tensor<int32, [4]>([1, 4, 128, 256])];
+            tensor<bool, [4]> var_5417_end_mask_0 = const()[name = string("op_5417_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_5417_cast_fp16 = slice_by_index(begin = var_5417_begin_0, end = var_5417_end_0, end_mask = var_5417_end_mask_0, x = key_heads_53_cast_fp16)[name = string("op_5417_cast_fp16")];
+            tensor<int32, [4]> var_5421_begin_0 = const()[name = string("op_5421_begin_0"), val = tensor<int32, [4]>([0, 3, 0, 0])];
+            tensor<int32, [4]> var_5421_end_0 = const()[name = string("op_5421_end_0"), val = tensor<int32, [4]>([1, 4, 128, 256])];
+            tensor<bool, [4]> var_5421_end_mask_0 = const()[name = string("op_5421_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_5421_cast_fp16 = slice_by_index(begin = var_5421_begin_0, end = var_5421_end_0, end_mask = var_5421_end_mask_0, x = value_heads_53_cast_fp16)[name = string("op_5421_cast_fp16")];
+            tensor<int32, [4]> var_5433_begin_0 = const()[name = string("op_5433_begin_0"), val = tensor<int32, [4]>([0, 4, 0, 0])];
+            tensor<int32, [4]> var_5433_end_0 = const()[name = string("op_5433_end_0"), val = tensor<int32, [4]>([1, 5, 128, 256])];
+            tensor<bool, [4]> var_5433_end_mask_0 = const()[name = string("op_5433_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_5433_cast_fp16 = slice_by_index(begin = var_5433_begin_0, end = var_5433_end_0, end_mask = var_5433_end_mask_0, x = key_heads_53_cast_fp16)[name = string("op_5433_cast_fp16")];
+            tensor<int32, [4]> var_5437_begin_0 = const()[name = string("op_5437_begin_0"), val = tensor<int32, [4]>([0, 4, 0, 0])];
+            tensor<int32, [4]> var_5437_end_0 = const()[name = string("op_5437_end_0"), val = tensor<int32, [4]>([1, 5, 128, 256])];
+            tensor<bool, [4]> var_5437_end_mask_0 = const()[name = string("op_5437_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_5437_cast_fp16 = slice_by_index(begin = var_5437_begin_0, end = var_5437_end_0, end_mask = var_5437_end_mask_0, x = value_heads_53_cast_fp16)[name = string("op_5437_cast_fp16")];
+            tensor<int32, [4]> var_5449_begin_0 = const()[name = string("op_5449_begin_0"), val = tensor<int32, [4]>([0, 5, 0, 0])];
+            tensor<int32, [4]> var_5449_end_0 = const()[name = string("op_5449_end_0"), val = tensor<int32, [4]>([1, 6, 128, 256])];
+            tensor<bool, [4]> var_5449_end_mask_0 = const()[name = string("op_5449_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_5449_cast_fp16 = slice_by_index(begin = var_5449_begin_0, end = var_5449_end_0, end_mask = var_5449_end_mask_0, x = key_heads_53_cast_fp16)[name = string("op_5449_cast_fp16")];
+            tensor<int32, [4]> var_5453_begin_0 = const()[name = string("op_5453_begin_0"), val = tensor<int32, [4]>([0, 5, 0, 0])];
+            tensor<int32, [4]> var_5453_end_0 = const()[name = string("op_5453_end_0"), val = tensor<int32, [4]>([1, 6, 128, 256])];
+            tensor<bool, [4]> var_5453_end_mask_0 = const()[name = string("op_5453_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_5453_cast_fp16 = slice_by_index(begin = var_5453_begin_0, end = var_5453_end_0, end_mask = var_5453_end_mask_0, x = value_heads_53_cast_fp16)[name = string("op_5453_cast_fp16")];
+            tensor<int32, [4]> var_5465_begin_0 = const()[name = string("op_5465_begin_0"), val = tensor<int32, [4]>([0, 6, 0, 0])];
+            tensor<int32, [4]> var_5465_end_0 = const()[name = string("op_5465_end_0"), val = tensor<int32, [4]>([1, 7, 128, 256])];
+            tensor<bool, [4]> var_5465_end_mask_0 = const()[name = string("op_5465_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_5465_cast_fp16 = slice_by_index(begin = var_5465_begin_0, end = var_5465_end_0, end_mask = var_5465_end_mask_0, x = key_heads_53_cast_fp16)[name = string("op_5465_cast_fp16")];
+            tensor<int32, [4]> var_5469_begin_0 = const()[name = string("op_5469_begin_0"), val = tensor<int32, [4]>([0, 6, 0, 0])];
+            tensor<int32, [4]> var_5469_end_0 = const()[name = string("op_5469_end_0"), val = tensor<int32, [4]>([1, 7, 128, 256])];
+            tensor<bool, [4]> var_5469_end_mask_0 = const()[name = string("op_5469_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_5469_cast_fp16 = slice_by_index(begin = var_5469_begin_0, end = var_5469_end_0, end_mask = var_5469_end_mask_0, x = value_heads_53_cast_fp16)[name = string("op_5469_cast_fp16")];
+            tensor<int32, [4]> var_5481_begin_0 = const()[name = string("op_5481_begin_0"), val = tensor<int32, [4]>([0, 7, 0, 0])];
+            tensor<int32, [4]> var_5481_end_0 = const()[name = string("op_5481_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_5481_end_mask_0 = const()[name = string("op_5481_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_5481_cast_fp16 = slice_by_index(begin = var_5481_begin_0, end = var_5481_end_0, end_mask = var_5481_end_mask_0, x = key_heads_53_cast_fp16)[name = string("op_5481_cast_fp16")];
+            tensor<int32, [4]> var_5485_begin_0 = const()[name = string("op_5485_begin_0"), val = tensor<int32, [4]>([0, 7, 0, 0])];
+            tensor<int32, [4]> var_5485_end_0 = const()[name = string("op_5485_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_5485_end_mask_0 = const()[name = string("op_5485_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_5485_cast_fp16 = slice_by_index(begin = var_5485_begin_0, end = var_5485_end_0, end_mask = var_5485_end_mask_0, x = value_heads_53_cast_fp16)[name = string("op_5485_cast_fp16")];
+            bool key_heads_55_interleave_0 = const()[name = string("key_heads_55_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 256]> key_heads_55_cast_fp16 = concat(axis = var_5211, interleave = key_heads_55_interleave_0, values = (var_5369_cast_fp16, var_5369_cast_fp16, var_5385_cast_fp16, var_5385_cast_fp16, var_5401_cast_fp16, var_5401_cast_fp16, var_5417_cast_fp16, var_5417_cast_fp16, var_5433_cast_fp16, var_5433_cast_fp16, var_5449_cast_fp16, var_5449_cast_fp16, var_5465_cast_fp16, var_5465_cast_fp16, var_5481_cast_fp16, var_5481_cast_fp16))[name = string("key_heads_55_cast_fp16")];
+            bool value_heads_55_interleave_0 = const()[name = string("value_heads_55_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 256]> value_heads_55_cast_fp16 = concat(axis = var_5211, interleave = value_heads_55_interleave_0, values = (var_5373_cast_fp16, var_5373_cast_fp16, var_5389_cast_fp16, var_5389_cast_fp16, var_5405_cast_fp16, var_5405_cast_fp16, var_5421_cast_fp16, var_5421_cast_fp16, var_5437_cast_fp16, var_5437_cast_fp16, var_5453_cast_fp16, var_5453_cast_fp16, var_5469_cast_fp16, var_5469_cast_fp16, var_5485_cast_fp16, var_5485_cast_fp16))[name = string("value_heads_55_cast_fp16")];
+            fp16 var_5508_to_fp16 = const()[name = string("op_5508_to_fp16"), val = fp16(0x1.6ap-4)];
+            tensor<fp16, [1, 16, 128, 1]> var_5509_cast_fp16 = mul(x = mh_q_81_cast_fp16, y = var_5508_to_fp16)[name = string("op_5509_cast_fp16")];
+            bool mh_w_53_transpose_x_0 = const()[name = string("mh_w_53_transpose_x_0"), val = bool(true)];
+            bool mh_w_53_transpose_y_0 = const()[name = string("mh_w_53_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 1, 256]> mh_w_53_cast_fp16 = matmul(transpose_x = mh_w_53_transpose_x_0, transpose_y = mh_w_53_transpose_y_0, x = var_5509_cast_fp16, y = key_heads_55_cast_fp16)[name = string("mh_w_53_cast_fp16")];
+            tensor<fp16, [1, 16, 1, 256]> mh_w_55_cast_fp16 = add(x = mh_w_53_cast_fp16, y = var_487_cast_fp16)[name = string("mh_w_55_cast_fp16")];
+            tensor<fp16, [1, 16, 1, 256]> var_5521_cast_fp16 = softmax(axis = var_5193, x = mh_w_55_cast_fp16)[name = string("op_5521_cast_fp16")];
+            bool attn_27_transpose_x_0 = const()[name = string("attn_27_transpose_x_0"), val = bool(false)];
+            bool attn_27_transpose_y_0 = const()[name = string("attn_27_transpose_y_0"), val = bool(true)];
+            tensor<fp16, [1, 16, 128, 1]> attn_27_cast_fp16 = matmul(transpose_x = attn_27_transpose_x_0, transpose_y = attn_27_transpose_y_0, x = value_heads_55_cast_fp16, y = var_5521_cast_fp16)[name = string("attn_27_cast_fp16")];
+            tensor<int32, [4]> var_5526 = const()[name = string("op_5526"), val = tensor<int32, [4]>([1, -1, 1, 1])];
+            tensor<fp16, [1, 2048, 1, 1]> input_105_cast_fp16 = reshape(shape = var_5526, x = attn_27_cast_fp16)[name = string("input_105_cast_fp16")];
+            string obj_115_pad_type_0 = const()[name = string("obj_115_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> obj_115_strides_0 = const()[name = string("obj_115_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> obj_115_pad_0 = const()[name = string("obj_115_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> obj_115_dilations_0 = const()[name = string("obj_115_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 obj_115_groups_0 = const()[name = string("obj_115_groups_0"), val = int32(1)];
+            tensor<fp16, [2048, 2048, 1, 1]> layers_13_self_attn_o_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [2048, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(662955008))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(667149376))))[name = string("layers_13_self_attn_o_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1, 1]> obj_115_cast_fp16 = conv(bias = layers_0_self_attn_q_proj_bias_to_fp16, dilations = obj_115_dilations_0, groups = obj_115_groups_0, pad = obj_115_pad_0, pad_type = obj_115_pad_type_0, strides = obj_115_strides_0, weight = layers_13_self_attn_o_proj_weight_to_fp16_palettized, x = input_105_cast_fp16)[name = string("obj_115_cast_fp16")];
+            tensor<fp16, [1, 2048, 1, 1]> inputs_109_cast_fp16 = add(x = inputs_103_cast_fp16, y = obj_115_cast_fp16)[name = string("inputs_109_cast_fp16")];
+            tensor<fp16, [1, 2048, 1, 1]> inputs_sq_111_cast_fp16 = mul(x = inputs_109_cast_fp16, y = inputs_109_cast_fp16)[name = string("inputs_sq_111_cast_fp16")];
+            tensor<int32, [1]> variance_111_axes_0 = const()[name = string("variance_111_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_111_keep_dims_0 = const()[name = string("variance_111_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 1, 1, 1]> variance_111_cast_fp16 = reduce_mean(axes = variance_111_axes_0, keep_dims = variance_111_keep_dims_0, x = inputs_sq_111_cast_fp16)[name = string("variance_111_cast_fp16")];
+            fp16 var_5544_to_fp16 = const()[name = string("op_5544_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 1, 1]> var_5545_cast_fp16 = add(x = variance_111_cast_fp16, y = var_5544_to_fp16)[name = string("op_5545_cast_fp16")];
+            fp32 var_5546_epsilon_0 = const()[name = string("op_5546_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 1, 1, 1]> var_5546_cast_fp16 = rsqrt(epsilon = var_5546_epsilon_0, x = var_5545_cast_fp16)[name = string("op_5546_cast_fp16")];
+            tensor<fp16, [1, 2048, 1, 1]> hidden_states_137_cast_fp16 = mul(x = inputs_109_cast_fp16, y = var_5546_cast_fp16)[name = string("hidden_states_137_cast_fp16")];
+            tensor<fp16, [1, 2048, 1, 1]> w_111_to_fp16 = const()[name = string("w_111_to_fp16"), val = tensor<fp16, [1, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(667149952)))];
+            tensor<fp16, [1, 2048, 1, 1]> input_107_cast_fp16 = mul(x = w_111_to_fp16, y = hidden_states_137_cast_fp16)[name = string("input_107_cast_fp16")];
+            string input_109_pad_type_0 = const()[name = string("input_109_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> input_109_strides_0 = const()[name = string("input_109_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> input_109_pad_0 = const()[name = string("input_109_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> input_109_dilations_0 = const()[name = string("input_109_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 input_109_groups_0 = const()[name = string("input_109_groups_0"), val = int32(1)];
+            tensor<fp16, [6144, 2048, 1, 1]> layers_13_mlp_gate_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [6144, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(667154112))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(679737088))))[name = string("layers_13_mlp_gate_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 6144, 1, 1]> input_109_cast_fp16 = conv(dilations = input_109_dilations_0, groups = input_109_groups_0, pad = input_109_pad_0, pad_type = input_109_pad_type_0, strides = input_109_strides_0, weight = layers_13_mlp_gate_proj_weight_to_fp16_palettized, x = input_107_cast_fp16)[name = string("input_109_cast_fp16")];
+            tensor<fp16, [1, 6144, 1, 1]> var_5560_cast_fp16 = silu(x = input_109_cast_fp16)[name = string("op_5560_cast_fp16")];
+            string var_5566_pad_type_0 = const()[name = string("op_5566_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_5566_strides_0 = const()[name = string("op_5566_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_5566_pad_0 = const()[name = string("op_5566_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_5566_dilations_0 = const()[name = string("op_5566_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_5566_groups_0 = const()[name = string("op_5566_groups_0"), val = int32(1)];
+            tensor<fp16, [6144, 2048, 1, 1]> layers_13_mlp_up_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [6144, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(679737664))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(692320640))))[name = string("layers_13_mlp_up_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 6144, 1, 1]> var_5566_cast_fp16 = conv(dilations = var_5566_dilations_0, groups = var_5566_groups_0, pad = var_5566_pad_0, pad_type = var_5566_pad_type_0, strides = var_5566_strides_0, weight = layers_13_mlp_up_proj_weight_to_fp16_palettized, x = input_107_cast_fp16)[name = string("op_5566_cast_fp16")];
+            tensor<fp16, [1, 6144, 1, 1]> input_111_cast_fp16 = mul(x = var_5560_cast_fp16, y = var_5566_cast_fp16)[name = string("input_111_cast_fp16")];
+            string hidden_states_139_pad_type_0 = const()[name = string("hidden_states_139_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> hidden_states_139_strides_0 = const()[name = string("hidden_states_139_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> hidden_states_139_pad_0 = const()[name = string("hidden_states_139_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> hidden_states_139_dilations_0 = const()[name = string("hidden_states_139_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 hidden_states_139_groups_0 = const()[name = string("hidden_states_139_groups_0"), val = int32(1)];
+            tensor<fp16, [2048, 6144, 1, 1]> layers_13_mlp_down_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [2048, 6144, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(692321216))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(704904192))))[name = string("layers_13_mlp_down_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1, 1]> hidden_states_139_cast_fp16 = conv(dilations = hidden_states_139_dilations_0, groups = hidden_states_139_groups_0, pad = hidden_states_139_pad_0, pad_type = hidden_states_139_pad_type_0, strides = hidden_states_139_strides_0, weight = layers_13_mlp_down_proj_weight_to_fp16_palettized, x = input_111_cast_fp16)[name = string("hidden_states_139_cast_fp16")];
+            tensor<fp16, [1, 2048, 1, 1]> inputs_111_cast_fp16 = add(x = inputs_109_cast_fp16, y = hidden_states_139_cast_fp16)[name = string("inputs_111_cast_fp16")];
+            int32 var_5580 = const()[name = string("op_5580"), val = int32(3)];
+            int32 var_5590 = const()[name = string("op_5590"), val = int32(-2)];
+            int32 var_5598 = const()[name = string("op_5598"), val = int32(1)];
+            tensor<fp16, [1, 2048, 1, 1]> inputs_sq_113_cast_fp16 = mul(x = inputs_111_cast_fp16, y = inputs_111_cast_fp16)[name = string("inputs_sq_113_cast_fp16")];
+            tensor<int32, [1]> variance_113_axes_0 = const()[name = string("variance_113_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_113_keep_dims_0 = const()[name = string("variance_113_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 1, 1, 1]> variance_113_cast_fp16 = reduce_mean(axes = variance_113_axes_0, keep_dims = variance_113_keep_dims_0, x = inputs_sq_113_cast_fp16)[name = string("variance_113_cast_fp16")];
+            fp16 var_5610_to_fp16 = const()[name = string("op_5610_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 1, 1]> var_5611_cast_fp16 = add(x = variance_113_cast_fp16, y = var_5610_to_fp16)[name = string("op_5611_cast_fp16")];
+            fp32 var_5612_epsilon_0 = const()[name = string("op_5612_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 1, 1, 1]> var_5612_cast_fp16 = rsqrt(epsilon = var_5612_epsilon_0, x = var_5611_cast_fp16)[name = string("op_5612_cast_fp16")];
+            tensor<fp16, [1, 2048, 1, 1]> hidden_states_141_cast_fp16 = mul(x = inputs_111_cast_fp16, y = var_5612_cast_fp16)[name = string("hidden_states_141_cast_fp16")];
+            tensor<fp16, [1, 2048, 1, 1]> w_113_to_fp16 = const()[name = string("w_113_to_fp16"), val = tensor<fp16, [1, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(704904768)))];
+            tensor<fp16, [1, 2048, 1, 1]> obj_117_cast_fp16 = mul(x = w_113_to_fp16, y = hidden_states_141_cast_fp16)[name = string("obj_117_cast_fp16")];
+            string query_85_pad_type_0 = const()[name = string("query_85_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> query_85_strides_0 = const()[name = string("query_85_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> query_85_pad_0 = const()[name = string("query_85_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> query_85_dilations_0 = const()[name = string("query_85_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 query_85_groups_0 = const()[name = string("query_85_groups_0"), val = int32(1)];
+            tensor<fp16, [2048, 2048, 1, 1]> layers_14_self_attn_q_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [2048, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(704908928))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(709103296))))[name = string("layers_14_self_attn_q_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1, 1]> query_85_cast_fp16 = conv(bias = layers_0_self_attn_q_proj_bias_to_fp16, dilations = query_85_dilations_0, groups = query_85_groups_0, pad = query_85_pad_0, pad_type = query_85_pad_type_0, strides = query_85_strides_0, weight = layers_14_self_attn_q_proj_weight_to_fp16_palettized, x = obj_117_cast_fp16)[name = string("query_85_cast_fp16")];
+            string current_key_57_pad_type_0 = const()[name = string("current_key_57_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> current_key_57_strides_0 = const()[name = string("current_key_57_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> current_key_57_pad_0 = const()[name = string("current_key_57_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> current_key_57_dilations_0 = const()[name = string("current_key_57_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 current_key_57_groups_0 = const()[name = string("current_key_57_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 2048, 1, 1]> layers_14_self_attn_k_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(709103872))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(711201088))))[name = string("layers_14_self_attn_k_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> current_key_57_cast_fp16 = conv(dilations = current_key_57_dilations_0, groups = current_key_57_groups_0, pad = current_key_57_pad_0, pad_type = current_key_57_pad_type_0, strides = current_key_57_strides_0, weight = layers_14_self_attn_k_proj_weight_to_fp16_palettized, x = obj_117_cast_fp16)[name = string("current_key_57_cast_fp16")];
+            string current_value_29_pad_type_0 = const()[name = string("current_value_29_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> current_value_29_strides_0 = const()[name = string("current_value_29_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> current_value_29_pad_0 = const()[name = string("current_value_29_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> current_value_29_dilations_0 = const()[name = string("current_value_29_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 current_value_29_groups_0 = const()[name = string("current_value_29_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 2048, 1, 1]> layers_14_self_attn_v_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(711201664))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(713298880))))[name = string("layers_14_self_attn_v_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> current_value_29_cast_fp16 = conv(bias = layers_0_self_attn_v_proj_bias_to_fp16, dilations = current_value_29_dilations_0, groups = current_value_29_groups_0, pad = current_value_29_pad_0, pad_type = current_value_29_pad_type_0, strides = current_value_29_strides_0, weight = layers_14_self_attn_v_proj_weight_to_fp16_palettized, x = obj_117_cast_fp16)[name = string("current_value_29_cast_fp16")];
+            tensor<int32, [4]> var_5649 = const()[name = string("op_5649"), val = tensor<int32, [4]>([16, 128, 1, 1])];
+            tensor<fp16, [16, 128, 1, 1]> inputs_113_cast_fp16 = reshape(shape = var_5649, x = query_85_cast_fp16)[name = string("inputs_113_cast_fp16")];
+            tensor<fp16, [16, 128, 1, 1]> inputs_sq_115_cast_fp16 = mul(x = inputs_113_cast_fp16, y = inputs_113_cast_fp16)[name = string("inputs_sq_115_cast_fp16")];
+            tensor<int32, [1]> variance_115_axes_0 = const()[name = string("variance_115_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_115_keep_dims_0 = const()[name = string("variance_115_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [16, 1, 1, 1]> variance_115_cast_fp16 = reduce_mean(axes = variance_115_axes_0, keep_dims = variance_115_keep_dims_0, x = inputs_sq_115_cast_fp16)[name = string("variance_115_cast_fp16")];
+            fp16 var_5655_to_fp16 = const()[name = string("op_5655_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [16, 1, 1, 1]> var_5656_cast_fp16 = add(x = variance_115_cast_fp16, y = var_5655_to_fp16)[name = string("op_5656_cast_fp16")];
+            fp32 var_5657_epsilon_0 = const()[name = string("op_5657_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [16, 1, 1, 1]> var_5657_cast_fp16 = rsqrt(epsilon = var_5657_epsilon_0, x = var_5656_cast_fp16)[name = string("op_5657_cast_fp16")];
+            tensor<fp16, [16, 128, 1, 1]> hidden_states_143_cast_fp16 = mul(x = inputs_113_cast_fp16, y = var_5657_cast_fp16)[name = string("hidden_states_143_cast_fp16")];
+            tensor<fp16, [1, 128, 1, 1]> w_115_to_fp16 = const()[name = string("w_115_to_fp16"), val = tensor<fp16, [1, 128, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(713299456)))];
+            tensor<fp16, [16, 128, 1, 1]> query_normed_29_cast_fp16 = mul(x = w_115_to_fp16, y = hidden_states_143_cast_fp16)[name = string("query_normed_29_cast_fp16")];
+            tensor<int32, [4]> var_5665 = const()[name = string("op_5665"), val = tensor<int32, [4]>([8, 128, 1, 1])];
+            tensor<fp16, [8, 128, 1, 1]> inputs_115_cast_fp16 = reshape(shape = var_5665, x = current_key_57_cast_fp16)[name = string("inputs_115_cast_fp16")];
+            tensor<fp16, [8, 128, 1, 1]> inputs_sq_117_cast_fp16 = mul(x = inputs_115_cast_fp16, y = inputs_115_cast_fp16)[name = string("inputs_sq_117_cast_fp16")];
+            tensor<int32, [1]> variance_117_axes_0 = const()[name = string("variance_117_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_117_keep_dims_0 = const()[name = string("variance_117_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [8, 1, 1, 1]> variance_117_cast_fp16 = reduce_mean(axes = variance_117_axes_0, keep_dims = variance_117_keep_dims_0, x = inputs_sq_117_cast_fp16)[name = string("variance_117_cast_fp16")];
+            fp16 var_5671_to_fp16 = const()[name = string("op_5671_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [8, 1, 1, 1]> var_5672_cast_fp16 = add(x = variance_117_cast_fp16, y = var_5671_to_fp16)[name = string("op_5672_cast_fp16")];
+            fp32 var_5673_epsilon_0 = const()[name = string("op_5673_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [8, 1, 1, 1]> var_5673_cast_fp16 = rsqrt(epsilon = var_5673_epsilon_0, x = var_5672_cast_fp16)[name = string("op_5673_cast_fp16")];
+            tensor<fp16, [8, 128, 1, 1]> hidden_states_145_cast_fp16 = mul(x = inputs_115_cast_fp16, y = var_5673_cast_fp16)[name = string("hidden_states_145_cast_fp16")];
+            tensor<fp16, [1, 128, 1, 1]> w_117_to_fp16 = const()[name = string("w_117_to_fp16"), val = tensor<fp16, [1, 128, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(713299776)))];
+            tensor<fp16, [8, 128, 1, 1]> current_key_normed_29_cast_fp16 = mul(x = w_117_to_fp16, y = hidden_states_145_cast_fp16)[name = string("current_key_normed_29_cast_fp16")];
+            tensor<int32, [4]> var_5691 = const()[name = string("op_5691"), val = tensor<int32, [4]>([1, 16, 128, -1])];
+            tensor<fp16, [1, 16, 128, 1]> mh_q_85_cast_fp16 = reshape(shape = var_5691, x = query_normed_29_cast_fp16)[name = string("mh_q_85_cast_fp16")];
+            tensor<int32, [4]> var_5693 = const()[name = string("op_5693"), val = tensor<int32, [4]>([1, 8, 128, -1])];
+            tensor<fp16, [1, 8, 128, 1]> mh_k_57_cast_fp16 = reshape(shape = var_5693, x = current_key_normed_29_cast_fp16)[name = string("mh_k_57_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> var_5697_cast_fp16 = mul(x = mh_q_85_cast_fp16, y = cos_1_cast_fp16)[name = string("op_5697_cast_fp16")];
+            tensor<int32, [4]> var_5702_begin_0 = const()[name = string("op_5702_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_5702_end_0 = const()[name = string("op_5702_end_0"), val = tensor<int32, [4]>([1, 16, 64, 1])];
+            tensor<bool, [4]> var_5702_end_mask_0 = const()[name = string("op_5702_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 16, 64, 1]> var_5702_cast_fp16 = slice_by_index(begin = var_5702_begin_0, end = var_5702_end_0, end_mask = var_5702_end_mask_0, x = mh_q_85_cast_fp16)[name = string("op_5702_cast_fp16")];
+            tensor<int32, [4]> var_5708_begin_0 = const()[name = string("op_5708_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_5708_end_0 = const()[name = string("op_5708_end_0"), val = tensor<int32, [4]>([1, 16, 128, 1])];
+            tensor<bool, [4]> var_5708_end_mask_0 = const()[name = string("op_5708_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 16, 64, 1]> var_5708_cast_fp16 = slice_by_index(begin = var_5708_begin_0, end = var_5708_end_0, end_mask = var_5708_end_mask_0, x = mh_q_85_cast_fp16)[name = string("op_5708_cast_fp16")];
+            fp16 const_339_promoted_to_fp16 = const()[name = string("const_339_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 16, 64, 1]> var_5710_cast_fp16 = mul(x = var_5708_cast_fp16, y = const_339_promoted_to_fp16)[name = string("op_5710_cast_fp16")];
+            bool var_5712_interleave_0 = const()[name = string("op_5712_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 1]> var_5712_cast_fp16 = concat(axis = var_5590, interleave = var_5712_interleave_0, values = (var_5710_cast_fp16, var_5702_cast_fp16))[name = string("op_5712_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> var_5713_cast_fp16 = mul(x = var_5712_cast_fp16, y = sin_1_cast_fp16)[name = string("op_5713_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> mh_q_87_cast_fp16 = add(x = var_5697_cast_fp16, y = var_5713_cast_fp16)[name = string("mh_q_87_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> var_5715_cast_fp16 = mul(x = mh_k_57_cast_fp16, y = cos_1_cast_fp16)[name = string("op_5715_cast_fp16")];
+            tensor<int32, [4]> var_5720_begin_0 = const()[name = string("op_5720_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_5720_end_0 = const()[name = string("op_5720_end_0"), val = tensor<int32, [4]>([1, 8, 64, 1])];
+            tensor<bool, [4]> var_5720_end_mask_0 = const()[name = string("op_5720_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 8, 64, 1]> var_5720_cast_fp16 = slice_by_index(begin = var_5720_begin_0, end = var_5720_end_0, end_mask = var_5720_end_mask_0, x = mh_k_57_cast_fp16)[name = string("op_5720_cast_fp16")];
+            tensor<int32, [4]> var_5726_begin_0 = const()[name = string("op_5726_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_5726_end_0 = const()[name = string("op_5726_end_0"), val = tensor<int32, [4]>([1, 8, 128, 1])];
+            tensor<bool, [4]> var_5726_end_mask_0 = const()[name = string("op_5726_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 8, 64, 1]> var_5726_cast_fp16 = slice_by_index(begin = var_5726_begin_0, end = var_5726_end_0, end_mask = var_5726_end_mask_0, x = mh_k_57_cast_fp16)[name = string("op_5726_cast_fp16")];
+            fp16 const_342_promoted_to_fp16 = const()[name = string("const_342_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 64, 1]> var_5728_cast_fp16 = mul(x = var_5726_cast_fp16, y = const_342_promoted_to_fp16)[name = string("op_5728_cast_fp16")];
+            bool var_5730_interleave_0 = const()[name = string("op_5730_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 128, 1]> var_5730_cast_fp16 = concat(axis = var_5590, interleave = var_5730_interleave_0, values = (var_5728_cast_fp16, var_5720_cast_fp16))[name = string("op_5730_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> var_5731_cast_fp16 = mul(x = var_5730_cast_fp16, y = sin_1_cast_fp16)[name = string("op_5731_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> mh_k_59_cast_fp16 = add(x = var_5715_cast_fp16, y = var_5731_cast_fp16)[name = string("mh_k_59_cast_fp16")];
+            tensor<int32, [4]> var_5735 = const()[name = string("op_5735"), val = tensor<int32, [4]>([1, 1024, 1, 1])];
+            tensor<fp16, [1, 1024, 1, 1]> current_key_59_cast_fp16 = reshape(shape = var_5735, x = mh_k_59_cast_fp16)[name = string("current_key_59_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_5742_cast_fp16 = mul(x = var_101_cast_fp16_14, y = var_323_cast_fp16)[name = string("op_5742_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_5743_cast_fp16 = mul(x = current_key_59_cast_fp16, y = var_321_cast_fp16)[name = string("op_5743_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> key_87_cast_fp16 = add(x = var_5742_cast_fp16, y = var_5743_cast_fp16)[name = string("key_87_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_5746_cast_fp16 = mul(x = var_132_cast_fp16_14, y = var_323_cast_fp16)[name = string("op_5746_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_5747_cast_fp16 = mul(x = current_value_29_cast_fp16, y = var_321_cast_fp16)[name = string("op_5747_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> value_57_cast_fp16 = add(x = var_5746_cast_fp16, y = var_5747_cast_fp16)[name = string("value_57_cast_fp16")];
+            tensor<int32, [4]> var_5751 = const()[name = string("op_5751"), val = tensor<int32, [4]>([1, 8, 128, 256])];
+            tensor<fp16, [1, 8, 128, 256]> key_heads_57_cast_fp16 = reshape(shape = var_5751, x = key_87_cast_fp16)[name = string("key_heads_57_cast_fp16")];
+            tensor<int32, [4]> var_5753 = const()[name = string("op_5753"), val = tensor<int32, [4]>([1, 8, 128, 256])];
+            tensor<fp16, [1, 8, 128, 256]> value_heads_57_cast_fp16 = reshape(shape = var_5753, x = value_57_cast_fp16)[name = string("value_heads_57_cast_fp16")];
+            tensor<int32, [4]> var_5756_begin_0 = const()[name = string("op_5756_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_5756_end_0 = const()[name = string("op_5756_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_5756_end_mask_0 = const()[name = string("op_5756_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_5756_cast_fp16 = slice_by_index(begin = var_5756_begin_0, end = var_5756_end_0, end_mask = var_5756_end_mask_0, x = key_heads_57_cast_fp16)[name = string("op_5756_cast_fp16")];
+            tensor<int32, [4]> var_5760_begin_0 = const()[name = string("op_5760_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_5760_end_0 = const()[name = string("op_5760_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_5760_end_mask_0 = const()[name = string("op_5760_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_5760_cast_fp16 = slice_by_index(begin = var_5760_begin_0, end = var_5760_end_0, end_mask = var_5760_end_mask_0, x = value_heads_57_cast_fp16)[name = string("op_5760_cast_fp16")];
+            tensor<int32, [4]> var_5772_begin_0 = const()[name = string("op_5772_begin_0"), val = tensor<int32, [4]>([0, 1, 0, 0])];
+            tensor<int32, [4]> var_5772_end_0 = const()[name = string("op_5772_end_0"), val = tensor<int32, [4]>([1, 2, 128, 256])];
+            tensor<bool, [4]> var_5772_end_mask_0 = const()[name = string("op_5772_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_5772_cast_fp16 = slice_by_index(begin = var_5772_begin_0, end = var_5772_end_0, end_mask = var_5772_end_mask_0, x = key_heads_57_cast_fp16)[name = string("op_5772_cast_fp16")];
+            tensor<int32, [4]> var_5776_begin_0 = const()[name = string("op_5776_begin_0"), val = tensor<int32, [4]>([0, 1, 0, 0])];
+            tensor<int32, [4]> var_5776_end_0 = const()[name = string("op_5776_end_0"), val = tensor<int32, [4]>([1, 2, 128, 256])];
+            tensor<bool, [4]> var_5776_end_mask_0 = const()[name = string("op_5776_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_5776_cast_fp16 = slice_by_index(begin = var_5776_begin_0, end = var_5776_end_0, end_mask = var_5776_end_mask_0, x = value_heads_57_cast_fp16)[name = string("op_5776_cast_fp16")];
+            tensor<int32, [4]> var_5788_begin_0 = const()[name = string("op_5788_begin_0"), val = tensor<int32, [4]>([0, 2, 0, 0])];
+            tensor<int32, [4]> var_5788_end_0 = const()[name = string("op_5788_end_0"), val = tensor<int32, [4]>([1, 3, 128, 256])];
+            tensor<bool, [4]> var_5788_end_mask_0 = const()[name = string("op_5788_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_5788_cast_fp16 = slice_by_index(begin = var_5788_begin_0, end = var_5788_end_0, end_mask = var_5788_end_mask_0, x = key_heads_57_cast_fp16)[name = string("op_5788_cast_fp16")];
+            tensor<int32, [4]> var_5792_begin_0 = const()[name = string("op_5792_begin_0"), val = tensor<int32, [4]>([0, 2, 0, 0])];
+            tensor<int32, [4]> var_5792_end_0 = const()[name = string("op_5792_end_0"), val = tensor<int32, [4]>([1, 3, 128, 256])];
+            tensor<bool, [4]> var_5792_end_mask_0 = const()[name = string("op_5792_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_5792_cast_fp16 = slice_by_index(begin = var_5792_begin_0, end = var_5792_end_0, end_mask = var_5792_end_mask_0, x = value_heads_57_cast_fp16)[name = string("op_5792_cast_fp16")];
+            tensor<int32, [4]> var_5804_begin_0 = const()[name = string("op_5804_begin_0"), val = tensor<int32, [4]>([0, 3, 0, 0])];
+            tensor<int32, [4]> var_5804_end_0 = const()[name = string("op_5804_end_0"), val = tensor<int32, [4]>([1, 4, 128, 256])];
+            tensor<bool, [4]> var_5804_end_mask_0 = const()[name = string("op_5804_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_5804_cast_fp16 = slice_by_index(begin = var_5804_begin_0, end = var_5804_end_0, end_mask = var_5804_end_mask_0, x = key_heads_57_cast_fp16)[name = string("op_5804_cast_fp16")];
+            tensor<int32, [4]> var_5808_begin_0 = const()[name = string("op_5808_begin_0"), val = tensor<int32, [4]>([0, 3, 0, 0])];
+            tensor<int32, [4]> var_5808_end_0 = const()[name = string("op_5808_end_0"), val = tensor<int32, [4]>([1, 4, 128, 256])];
+            tensor<bool, [4]> var_5808_end_mask_0 = const()[name = string("op_5808_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_5808_cast_fp16 = slice_by_index(begin = var_5808_begin_0, end = var_5808_end_0, end_mask = var_5808_end_mask_0, x = value_heads_57_cast_fp16)[name = string("op_5808_cast_fp16")];
+            tensor<int32, [4]> var_5820_begin_0 = const()[name = string("op_5820_begin_0"), val = tensor<int32, [4]>([0, 4, 0, 0])];
+            tensor<int32, [4]> var_5820_end_0 = const()[name = string("op_5820_end_0"), val = tensor<int32, [4]>([1, 5, 128, 256])];
+            tensor<bool, [4]> var_5820_end_mask_0 = const()[name = string("op_5820_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_5820_cast_fp16 = slice_by_index(begin = var_5820_begin_0, end = var_5820_end_0, end_mask = var_5820_end_mask_0, x = key_heads_57_cast_fp16)[name = string("op_5820_cast_fp16")];
+            tensor<int32, [4]> var_5824_begin_0 = const()[name = string("op_5824_begin_0"), val = tensor<int32, [4]>([0, 4, 0, 0])];
+            tensor<int32, [4]> var_5824_end_0 = const()[name = string("op_5824_end_0"), val = tensor<int32, [4]>([1, 5, 128, 256])];
+            tensor<bool, [4]> var_5824_end_mask_0 = const()[name = string("op_5824_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_5824_cast_fp16 = slice_by_index(begin = var_5824_begin_0, end = var_5824_end_0, end_mask = var_5824_end_mask_0, x = value_heads_57_cast_fp16)[name = string("op_5824_cast_fp16")];
+            tensor<int32, [4]> var_5836_begin_0 = const()[name = string("op_5836_begin_0"), val = tensor<int32, [4]>([0, 5, 0, 0])];
+            tensor<int32, [4]> var_5836_end_0 = const()[name = string("op_5836_end_0"), val = tensor<int32, [4]>([1, 6, 128, 256])];
+            tensor<bool, [4]> var_5836_end_mask_0 = const()[name = string("op_5836_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_5836_cast_fp16 = slice_by_index(begin = var_5836_begin_0, end = var_5836_end_0, end_mask = var_5836_end_mask_0, x = key_heads_57_cast_fp16)[name = string("op_5836_cast_fp16")];
+            tensor<int32, [4]> var_5840_begin_0 = const()[name = string("op_5840_begin_0"), val = tensor<int32, [4]>([0, 5, 0, 0])];
+            tensor<int32, [4]> var_5840_end_0 = const()[name = string("op_5840_end_0"), val = tensor<int32, [4]>([1, 6, 128, 256])];
+            tensor<bool, [4]> var_5840_end_mask_0 = const()[name = string("op_5840_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_5840_cast_fp16 = slice_by_index(begin = var_5840_begin_0, end = var_5840_end_0, end_mask = var_5840_end_mask_0, x = value_heads_57_cast_fp16)[name = string("op_5840_cast_fp16")];
+            tensor<int32, [4]> var_5852_begin_0 = const()[name = string("op_5852_begin_0"), val = tensor<int32, [4]>([0, 6, 0, 0])];
+            tensor<int32, [4]> var_5852_end_0 = const()[name = string("op_5852_end_0"), val = tensor<int32, [4]>([1, 7, 128, 256])];
+            tensor<bool, [4]> var_5852_end_mask_0 = const()[name = string("op_5852_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_5852_cast_fp16 = slice_by_index(begin = var_5852_begin_0, end = var_5852_end_0, end_mask = var_5852_end_mask_0, x = key_heads_57_cast_fp16)[name = string("op_5852_cast_fp16")];
+            tensor<int32, [4]> var_5856_begin_0 = const()[name = string("op_5856_begin_0"), val = tensor<int32, [4]>([0, 6, 0, 0])];
+            tensor<int32, [4]> var_5856_end_0 = const()[name = string("op_5856_end_0"), val = tensor<int32, [4]>([1, 7, 128, 256])];
+            tensor<bool, [4]> var_5856_end_mask_0 = const()[name = string("op_5856_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_5856_cast_fp16 = slice_by_index(begin = var_5856_begin_0, end = var_5856_end_0, end_mask = var_5856_end_mask_0, x = value_heads_57_cast_fp16)[name = string("op_5856_cast_fp16")];
+            tensor<int32, [4]> var_5868_begin_0 = const()[name = string("op_5868_begin_0"), val = tensor<int32, [4]>([0, 7, 0, 0])];
+            tensor<int32, [4]> var_5868_end_0 = const()[name = string("op_5868_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_5868_end_mask_0 = const()[name = string("op_5868_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_5868_cast_fp16 = slice_by_index(begin = var_5868_begin_0, end = var_5868_end_0, end_mask = var_5868_end_mask_0, x = key_heads_57_cast_fp16)[name = string("op_5868_cast_fp16")];
+            tensor<int32, [4]> var_5872_begin_0 = const()[name = string("op_5872_begin_0"), val = tensor<int32, [4]>([0, 7, 0, 0])];
+            tensor<int32, [4]> var_5872_end_0 = const()[name = string("op_5872_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_5872_end_mask_0 = const()[name = string("op_5872_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_5872_cast_fp16 = slice_by_index(begin = var_5872_begin_0, end = var_5872_end_0, end_mask = var_5872_end_mask_0, x = value_heads_57_cast_fp16)[name = string("op_5872_cast_fp16")];
+            bool key_heads_59_interleave_0 = const()[name = string("key_heads_59_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 256]> key_heads_59_cast_fp16 = concat(axis = var_5598, interleave = key_heads_59_interleave_0, values = (var_5756_cast_fp16, var_5756_cast_fp16, var_5772_cast_fp16, var_5772_cast_fp16, var_5788_cast_fp16, var_5788_cast_fp16, var_5804_cast_fp16, var_5804_cast_fp16, var_5820_cast_fp16, var_5820_cast_fp16, var_5836_cast_fp16, var_5836_cast_fp16, var_5852_cast_fp16, var_5852_cast_fp16, var_5868_cast_fp16, var_5868_cast_fp16))[name = string("key_heads_59_cast_fp16")];
+            bool value_heads_59_interleave_0 = const()[name = string("value_heads_59_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 256]> value_heads_59_cast_fp16 = concat(axis = var_5598, interleave = value_heads_59_interleave_0, values = (var_5760_cast_fp16, var_5760_cast_fp16, var_5776_cast_fp16, var_5776_cast_fp16, var_5792_cast_fp16, var_5792_cast_fp16, var_5808_cast_fp16, var_5808_cast_fp16, var_5824_cast_fp16, var_5824_cast_fp16, var_5840_cast_fp16, var_5840_cast_fp16, var_5856_cast_fp16, var_5856_cast_fp16, var_5872_cast_fp16, var_5872_cast_fp16))[name = string("value_heads_59_cast_fp16")];
+            fp16 var_5895_to_fp16 = const()[name = string("op_5895_to_fp16"), val = fp16(0x1.6ap-4)];
+            tensor<fp16, [1, 16, 128, 1]> var_5896_cast_fp16 = mul(x = mh_q_87_cast_fp16, y = var_5895_to_fp16)[name = string("op_5896_cast_fp16")];
+            bool mh_w_57_transpose_x_0 = const()[name = string("mh_w_57_transpose_x_0"), val = bool(true)];
+            bool mh_w_57_transpose_y_0 = const()[name = string("mh_w_57_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 1, 256]> mh_w_57_cast_fp16 = matmul(transpose_x = mh_w_57_transpose_x_0, transpose_y = mh_w_57_transpose_y_0, x = var_5896_cast_fp16, y = key_heads_59_cast_fp16)[name = string("mh_w_57_cast_fp16")];
+            tensor<fp16, [1, 16, 1, 256]> mh_w_59_cast_fp16 = add(x = mh_w_57_cast_fp16, y = var_487_cast_fp16)[name = string("mh_w_59_cast_fp16")];
+            tensor<fp16, [1, 16, 1, 256]> var_5908_cast_fp16 = softmax(axis = var_5580, x = mh_w_59_cast_fp16)[name = string("op_5908_cast_fp16")];
+            bool attn_29_transpose_x_0 = const()[name = string("attn_29_transpose_x_0"), val = bool(false)];
+            bool attn_29_transpose_y_0 = const()[name = string("attn_29_transpose_y_0"), val = bool(true)];
+            tensor<fp16, [1, 16, 128, 1]> attn_29_cast_fp16 = matmul(transpose_x = attn_29_transpose_x_0, transpose_y = attn_29_transpose_y_0, x = value_heads_59_cast_fp16, y = var_5908_cast_fp16)[name = string("attn_29_cast_fp16")];
+            tensor<int32, [4]> var_5913 = const()[name = string("op_5913"), val = tensor<int32, [4]>([1, -1, 1, 1])];
+            tensor<fp16, [1, 2048, 1, 1]> input_113_cast_fp16 = reshape(shape = var_5913, x = attn_29_cast_fp16)[name = string("input_113_cast_fp16")];
+            string obj_123_pad_type_0 = const()[name = string("obj_123_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> obj_123_strides_0 = const()[name = string("obj_123_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> obj_123_pad_0 = const()[name = string("obj_123_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> obj_123_dilations_0 = const()[name = string("obj_123_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 obj_123_groups_0 = const()[name = string("obj_123_groups_0"), val = int32(1)];
+            tensor<fp16, [2048, 2048, 1, 1]> layers_14_self_attn_o_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [2048, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(713300096))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(717494464))))[name = string("layers_14_self_attn_o_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1, 1]> obj_123_cast_fp16 = conv(bias = layers_0_self_attn_q_proj_bias_to_fp16, dilations = obj_123_dilations_0, groups = obj_123_groups_0, pad = obj_123_pad_0, pad_type = obj_123_pad_type_0, strides = obj_123_strides_0, weight = layers_14_self_attn_o_proj_weight_to_fp16_palettized, x = input_113_cast_fp16)[name = string("obj_123_cast_fp16")];
+            tensor<fp16, [1, 2048, 1, 1]> inputs_117_cast_fp16 = add(x = inputs_111_cast_fp16, y = obj_123_cast_fp16)[name = string("inputs_117_cast_fp16")];
+            tensor<fp16, [1, 2048, 1, 1]> inputs_sq_119_cast_fp16 = mul(x = inputs_117_cast_fp16, y = inputs_117_cast_fp16)[name = string("inputs_sq_119_cast_fp16")];
+            tensor<int32, [1]> variance_119_axes_0 = const()[name = string("variance_119_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_119_keep_dims_0 = const()[name = string("variance_119_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 1, 1, 1]> variance_119_cast_fp16 = reduce_mean(axes = variance_119_axes_0, keep_dims = variance_119_keep_dims_0, x = inputs_sq_119_cast_fp16)[name = string("variance_119_cast_fp16")];
+            fp16 var_5931_to_fp16 = const()[name = string("op_5931_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 1, 1]> var_5932_cast_fp16 = add(x = variance_119_cast_fp16, y = var_5931_to_fp16)[name = string("op_5932_cast_fp16")];
+            fp32 var_5933_epsilon_0 = const()[name = string("op_5933_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 1, 1, 1]> var_5933_cast_fp16 = rsqrt(epsilon = var_5933_epsilon_0, x = var_5932_cast_fp16)[name = string("op_5933_cast_fp16")];
+            tensor<fp16, [1, 2048, 1, 1]> hidden_states_147_cast_fp16 = mul(x = inputs_117_cast_fp16, y = var_5933_cast_fp16)[name = string("hidden_states_147_cast_fp16")];
+            tensor<fp16, [1, 2048, 1, 1]> w_119_to_fp16 = const()[name = string("w_119_to_fp16"), val = tensor<fp16, [1, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(717495040)))];
+            tensor<fp16, [1, 2048, 1, 1]> input_115_cast_fp16 = mul(x = w_119_to_fp16, y = hidden_states_147_cast_fp16)[name = string("input_115_cast_fp16")];
+            string input_117_pad_type_0 = const()[name = string("input_117_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> input_117_strides_0 = const()[name = string("input_117_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> input_117_pad_0 = const()[name = string("input_117_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> input_117_dilations_0 = const()[name = string("input_117_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 input_117_groups_0 = const()[name = string("input_117_groups_0"), val = int32(1)];
+            tensor<fp16, [6144, 2048, 1, 1]> layers_14_mlp_gate_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [6144, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(717499200))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(730082176))))[name = string("layers_14_mlp_gate_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 6144, 1, 1]> input_117_cast_fp16 = conv(dilations = input_117_dilations_0, groups = input_117_groups_0, pad = input_117_pad_0, pad_type = input_117_pad_type_0, strides = input_117_strides_0, weight = layers_14_mlp_gate_proj_weight_to_fp16_palettized, x = input_115_cast_fp16)[name = string("input_117_cast_fp16")];
+            tensor<fp16, [1, 6144, 1, 1]> var_5947_cast_fp16 = silu(x = input_117_cast_fp16)[name = string("op_5947_cast_fp16")];
+            string var_5953_pad_type_0 = const()[name = string("op_5953_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_5953_strides_0 = const()[name = string("op_5953_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_5953_pad_0 = const()[name = string("op_5953_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_5953_dilations_0 = const()[name = string("op_5953_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_5953_groups_0 = const()[name = string("op_5953_groups_0"), val = int32(1)];
+            tensor<fp16, [6144, 2048, 1, 1]> layers_14_mlp_up_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [6144, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(730082752))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(742665728))))[name = string("layers_14_mlp_up_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 6144, 1, 1]> var_5953_cast_fp16 = conv(dilations = var_5953_dilations_0, groups = var_5953_groups_0, pad = var_5953_pad_0, pad_type = var_5953_pad_type_0, strides = var_5953_strides_0, weight = layers_14_mlp_up_proj_weight_to_fp16_palettized, x = input_115_cast_fp16)[name = string("op_5953_cast_fp16")];
+            tensor<fp16, [1, 6144, 1, 1]> input_119_cast_fp16 = mul(x = var_5947_cast_fp16, y = var_5953_cast_fp16)[name = string("input_119_cast_fp16")];
+            string hidden_states_149_pad_type_0 = const()[name = string("hidden_states_149_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> hidden_states_149_strides_0 = const()[name = string("hidden_states_149_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> hidden_states_149_pad_0 = const()[name = string("hidden_states_149_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> hidden_states_149_dilations_0 = const()[name = string("hidden_states_149_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 hidden_states_149_groups_0 = const()[name = string("hidden_states_149_groups_0"), val = int32(1)];
+            tensor<fp16, [2048, 6144, 1, 1]> layers_14_mlp_down_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [2048, 6144, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(742666304))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(755249280))))[name = string("layers_14_mlp_down_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1, 1]> hidden_states_149_cast_fp16 = conv(dilations = hidden_states_149_dilations_0, groups = hidden_states_149_groups_0, pad = hidden_states_149_pad_0, pad_type = hidden_states_149_pad_type_0, strides = hidden_states_149_strides_0, weight = layers_14_mlp_down_proj_weight_to_fp16_palettized, x = input_119_cast_fp16)[name = string("hidden_states_149_cast_fp16")];
+            tensor<fp16, [1, 2048, 1, 1]> inputs_119_cast_fp16 = add(x = inputs_117_cast_fp16, y = hidden_states_149_cast_fp16)[name = string("inputs_119_cast_fp16")];
+            int32 var_5967 = const()[name = string("op_5967"), val = int32(3)];
+            int32 var_5977 = const()[name = string("op_5977"), val = int32(-2)];
+            int32 var_5985 = const()[name = string("op_5985"), val = int32(1)];
+            tensor<fp16, [1, 2048, 1, 1]> inputs_sq_121_cast_fp16 = mul(x = inputs_119_cast_fp16, y = inputs_119_cast_fp16)[name = string("inputs_sq_121_cast_fp16")];
+            tensor<int32, [1]> variance_121_axes_0 = const()[name = string("variance_121_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_121_keep_dims_0 = const()[name = string("variance_121_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 1, 1, 1]> variance_121_cast_fp16 = reduce_mean(axes = variance_121_axes_0, keep_dims = variance_121_keep_dims_0, x = inputs_sq_121_cast_fp16)[name = string("variance_121_cast_fp16")];
+            fp16 var_5997_to_fp16 = const()[name = string("op_5997_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 1, 1]> var_5998_cast_fp16 = add(x = variance_121_cast_fp16, y = var_5997_to_fp16)[name = string("op_5998_cast_fp16")];
+            fp32 var_5999_epsilon_0 = const()[name = string("op_5999_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 1, 1, 1]> var_5999_cast_fp16 = rsqrt(epsilon = var_5999_epsilon_0, x = var_5998_cast_fp16)[name = string("op_5999_cast_fp16")];
+            tensor<fp16, [1, 2048, 1, 1]> hidden_states_151_cast_fp16 = mul(x = inputs_119_cast_fp16, y = var_5999_cast_fp16)[name = string("hidden_states_151_cast_fp16")];
+            tensor<fp16, [1, 2048, 1, 1]> w_121_to_fp16 = const()[name = string("w_121_to_fp16"), val = tensor<fp16, [1, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(755249856)))];
+            tensor<fp16, [1, 2048, 1, 1]> obj_125_cast_fp16 = mul(x = w_121_to_fp16, y = hidden_states_151_cast_fp16)[name = string("obj_125_cast_fp16")];
+            string query_91_pad_type_0 = const()[name = string("query_91_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> query_91_strides_0 = const()[name = string("query_91_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> query_91_pad_0 = const()[name = string("query_91_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> query_91_dilations_0 = const()[name = string("query_91_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 query_91_groups_0 = const()[name = string("query_91_groups_0"), val = int32(1)];
+            tensor<fp16, [2048, 2048, 1, 1]> layers_15_self_attn_q_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [2048, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(755254016))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(759448384))))[name = string("layers_15_self_attn_q_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1, 1]> query_91_cast_fp16 = conv(bias = layers_0_self_attn_q_proj_bias_to_fp16, dilations = query_91_dilations_0, groups = query_91_groups_0, pad = query_91_pad_0, pad_type = query_91_pad_type_0, strides = query_91_strides_0, weight = layers_15_self_attn_q_proj_weight_to_fp16_palettized, x = obj_125_cast_fp16)[name = string("query_91_cast_fp16")];
+            string current_key_61_pad_type_0 = const()[name = string("current_key_61_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> current_key_61_strides_0 = const()[name = string("current_key_61_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> current_key_61_pad_0 = const()[name = string("current_key_61_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> current_key_61_dilations_0 = const()[name = string("current_key_61_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 current_key_61_groups_0 = const()[name = string("current_key_61_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 2048, 1, 1]> layers_15_self_attn_k_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(759448960))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(761546176))))[name = string("layers_15_self_attn_k_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> current_key_61_cast_fp16 = conv(dilations = current_key_61_dilations_0, groups = current_key_61_groups_0, pad = current_key_61_pad_0, pad_type = current_key_61_pad_type_0, strides = current_key_61_strides_0, weight = layers_15_self_attn_k_proj_weight_to_fp16_palettized, x = obj_125_cast_fp16)[name = string("current_key_61_cast_fp16")];
+            string current_value_31_pad_type_0 = const()[name = string("current_value_31_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> current_value_31_strides_0 = const()[name = string("current_value_31_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> current_value_31_pad_0 = const()[name = string("current_value_31_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> current_value_31_dilations_0 = const()[name = string("current_value_31_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 current_value_31_groups_0 = const()[name = string("current_value_31_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 2048, 1, 1]> layers_15_self_attn_v_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(761546752))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(763643968))))[name = string("layers_15_self_attn_v_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> current_value_31_cast_fp16 = conv(bias = layers_0_self_attn_v_proj_bias_to_fp16, dilations = current_value_31_dilations_0, groups = current_value_31_groups_0, pad = current_value_31_pad_0, pad_type = current_value_31_pad_type_0, strides = current_value_31_strides_0, weight = layers_15_self_attn_v_proj_weight_to_fp16_palettized, x = obj_125_cast_fp16)[name = string("current_value_31_cast_fp16")];
+            tensor<int32, [4]> var_6036 = const()[name = string("op_6036"), val = tensor<int32, [4]>([16, 128, 1, 1])];
+            tensor<fp16, [16, 128, 1, 1]> inputs_121_cast_fp16 = reshape(shape = var_6036, x = query_91_cast_fp16)[name = string("inputs_121_cast_fp16")];
+            tensor<fp16, [16, 128, 1, 1]> inputs_sq_123_cast_fp16 = mul(x = inputs_121_cast_fp16, y = inputs_121_cast_fp16)[name = string("inputs_sq_123_cast_fp16")];
+            tensor<int32, [1]> variance_123_axes_0 = const()[name = string("variance_123_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_123_keep_dims_0 = const()[name = string("variance_123_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [16, 1, 1, 1]> variance_123_cast_fp16 = reduce_mean(axes = variance_123_axes_0, keep_dims = variance_123_keep_dims_0, x = inputs_sq_123_cast_fp16)[name = string("variance_123_cast_fp16")];
+            fp16 var_6042_to_fp16 = const()[name = string("op_6042_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [16, 1, 1, 1]> var_6043_cast_fp16 = add(x = variance_123_cast_fp16, y = var_6042_to_fp16)[name = string("op_6043_cast_fp16")];
+            fp32 var_6044_epsilon_0 = const()[name = string("op_6044_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [16, 1, 1, 1]> var_6044_cast_fp16 = rsqrt(epsilon = var_6044_epsilon_0, x = var_6043_cast_fp16)[name = string("op_6044_cast_fp16")];
+            tensor<fp16, [16, 128, 1, 1]> hidden_states_153_cast_fp16 = mul(x = inputs_121_cast_fp16, y = var_6044_cast_fp16)[name = string("hidden_states_153_cast_fp16")];
+            tensor<fp16, [1, 128, 1, 1]> w_123_to_fp16 = const()[name = string("w_123_to_fp16"), val = tensor<fp16, [1, 128, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(763644544)))];
+            tensor<fp16, [16, 128, 1, 1]> query_normed_31_cast_fp16 = mul(x = w_123_to_fp16, y = hidden_states_153_cast_fp16)[name = string("query_normed_31_cast_fp16")];
+            tensor<int32, [4]> var_6052 = const()[name = string("op_6052"), val = tensor<int32, [4]>([8, 128, 1, 1])];
+            tensor<fp16, [8, 128, 1, 1]> inputs_123_cast_fp16 = reshape(shape = var_6052, x = current_key_61_cast_fp16)[name = string("inputs_123_cast_fp16")];
+            tensor<fp16, [8, 128, 1, 1]> inputs_sq_125_cast_fp16 = mul(x = inputs_123_cast_fp16, y = inputs_123_cast_fp16)[name = string("inputs_sq_125_cast_fp16")];
+            tensor<int32, [1]> variance_125_axes_0 = const()[name = string("variance_125_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_125_keep_dims_0 = const()[name = string("variance_125_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [8, 1, 1, 1]> variance_125_cast_fp16 = reduce_mean(axes = variance_125_axes_0, keep_dims = variance_125_keep_dims_0, x = inputs_sq_125_cast_fp16)[name = string("variance_125_cast_fp16")];
+            fp16 var_6058_to_fp16 = const()[name = string("op_6058_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [8, 1, 1, 1]> var_6059_cast_fp16 = add(x = variance_125_cast_fp16, y = var_6058_to_fp16)[name = string("op_6059_cast_fp16")];
+            fp32 var_6060_epsilon_0 = const()[name = string("op_6060_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [8, 1, 1, 1]> var_6060_cast_fp16 = rsqrt(epsilon = var_6060_epsilon_0, x = var_6059_cast_fp16)[name = string("op_6060_cast_fp16")];
+            tensor<fp16, [8, 128, 1, 1]> hidden_states_155_cast_fp16 = mul(x = inputs_123_cast_fp16, y = var_6060_cast_fp16)[name = string("hidden_states_155_cast_fp16")];
+            tensor<fp16, [1, 128, 1, 1]> w_125_to_fp16 = const()[name = string("w_125_to_fp16"), val = tensor<fp16, [1, 128, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(763644864)))];
+            tensor<fp16, [8, 128, 1, 1]> current_key_normed_31_cast_fp16 = mul(x = w_125_to_fp16, y = hidden_states_155_cast_fp16)[name = string("current_key_normed_31_cast_fp16")];
+            tensor<int32, [4]> var_6078 = const()[name = string("op_6078"), val = tensor<int32, [4]>([1, 16, 128, -1])];
+            tensor<fp16, [1, 16, 128, 1]> mh_q_91_cast_fp16 = reshape(shape = var_6078, x = query_normed_31_cast_fp16)[name = string("mh_q_91_cast_fp16")];
+            tensor<int32, [4]> var_6080 = const()[name = string("op_6080"), val = tensor<int32, [4]>([1, 8, 128, -1])];
+            tensor<fp16, [1, 8, 128, 1]> mh_k_61_cast_fp16 = reshape(shape = var_6080, x = current_key_normed_31_cast_fp16)[name = string("mh_k_61_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> var_6084_cast_fp16 = mul(x = mh_q_91_cast_fp16, y = cos_1_cast_fp16)[name = string("op_6084_cast_fp16")];
+            tensor<int32, [4]> var_6089_begin_0 = const()[name = string("op_6089_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_6089_end_0 = const()[name = string("op_6089_end_0"), val = tensor<int32, [4]>([1, 16, 64, 1])];
+            tensor<bool, [4]> var_6089_end_mask_0 = const()[name = string("op_6089_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 16, 64, 1]> var_6089_cast_fp16 = slice_by_index(begin = var_6089_begin_0, end = var_6089_end_0, end_mask = var_6089_end_mask_0, x = mh_q_91_cast_fp16)[name = string("op_6089_cast_fp16")];
+            tensor<int32, [4]> var_6095_begin_0 = const()[name = string("op_6095_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_6095_end_0 = const()[name = string("op_6095_end_0"), val = tensor<int32, [4]>([1, 16, 128, 1])];
+            tensor<bool, [4]> var_6095_end_mask_0 = const()[name = string("op_6095_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 16, 64, 1]> var_6095_cast_fp16 = slice_by_index(begin = var_6095_begin_0, end = var_6095_end_0, end_mask = var_6095_end_mask_0, x = mh_q_91_cast_fp16)[name = string("op_6095_cast_fp16")];
+            fp16 const_362_promoted_to_fp16 = const()[name = string("const_362_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 16, 64, 1]> var_6097_cast_fp16 = mul(x = var_6095_cast_fp16, y = const_362_promoted_to_fp16)[name = string("op_6097_cast_fp16")];
+            bool var_6099_interleave_0 = const()[name = string("op_6099_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 1]> var_6099_cast_fp16 = concat(axis = var_5977, interleave = var_6099_interleave_0, values = (var_6097_cast_fp16, var_6089_cast_fp16))[name = string("op_6099_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> var_6100_cast_fp16 = mul(x = var_6099_cast_fp16, y = sin_1_cast_fp16)[name = string("op_6100_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> mh_q_93_cast_fp16 = add(x = var_6084_cast_fp16, y = var_6100_cast_fp16)[name = string("mh_q_93_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> var_6102_cast_fp16 = mul(x = mh_k_61_cast_fp16, y = cos_1_cast_fp16)[name = string("op_6102_cast_fp16")];
+            tensor<int32, [4]> var_6107_begin_0 = const()[name = string("op_6107_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_6107_end_0 = const()[name = string("op_6107_end_0"), val = tensor<int32, [4]>([1, 8, 64, 1])];
+            tensor<bool, [4]> var_6107_end_mask_0 = const()[name = string("op_6107_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 8, 64, 1]> var_6107_cast_fp16 = slice_by_index(begin = var_6107_begin_0, end = var_6107_end_0, end_mask = var_6107_end_mask_0, x = mh_k_61_cast_fp16)[name = string("op_6107_cast_fp16")];
+            tensor<int32, [4]> var_6113_begin_0 = const()[name = string("op_6113_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_6113_end_0 = const()[name = string("op_6113_end_0"), val = tensor<int32, [4]>([1, 8, 128, 1])];
+            tensor<bool, [4]> var_6113_end_mask_0 = const()[name = string("op_6113_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 8, 64, 1]> var_6113_cast_fp16 = slice_by_index(begin = var_6113_begin_0, end = var_6113_end_0, end_mask = var_6113_end_mask_0, x = mh_k_61_cast_fp16)[name = string("op_6113_cast_fp16")];
+            fp16 const_365_promoted_to_fp16 = const()[name = string("const_365_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 64, 1]> var_6115_cast_fp16 = mul(x = var_6113_cast_fp16, y = const_365_promoted_to_fp16)[name = string("op_6115_cast_fp16")];
+            bool var_6117_interleave_0 = const()[name = string("op_6117_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 128, 1]> var_6117_cast_fp16 = concat(axis = var_5977, interleave = var_6117_interleave_0, values = (var_6115_cast_fp16, var_6107_cast_fp16))[name = string("op_6117_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> var_6118_cast_fp16 = mul(x = var_6117_cast_fp16, y = sin_1_cast_fp16)[name = string("op_6118_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> mh_k_63_cast_fp16 = add(x = var_6102_cast_fp16, y = var_6118_cast_fp16)[name = string("mh_k_63_cast_fp16")];
+            tensor<int32, [4]> var_6122 = const()[name = string("op_6122"), val = tensor<int32, [4]>([1, 1024, 1, 1])];
+            tensor<fp16, [1, 1024, 1, 1]> current_key_63_cast_fp16 = reshape(shape = var_6122, x = mh_k_63_cast_fp16)[name = string("current_key_63_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_6129_cast_fp16 = mul(x = var_101_cast_fp16_15, y = var_323_cast_fp16)[name = string("op_6129_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_6130_cast_fp16 = mul(x = current_key_63_cast_fp16, y = var_321_cast_fp16)[name = string("op_6130_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> key_93_cast_fp16 = add(x = var_6129_cast_fp16, y = var_6130_cast_fp16)[name = string("key_93_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_6133_cast_fp16 = mul(x = var_132_cast_fp16_15, y = var_323_cast_fp16)[name = string("op_6133_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_6134_cast_fp16 = mul(x = current_value_31_cast_fp16, y = var_321_cast_fp16)[name = string("op_6134_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> value_61_cast_fp16 = add(x = var_6133_cast_fp16, y = var_6134_cast_fp16)[name = string("value_61_cast_fp16")];
+            tensor<int32, [4]> var_6138 = const()[name = string("op_6138"), val = tensor<int32, [4]>([1, 8, 128, 256])];
+            tensor<fp16, [1, 8, 128, 256]> key_heads_61_cast_fp16 = reshape(shape = var_6138, x = key_93_cast_fp16)[name = string("key_heads_61_cast_fp16")];
+            tensor<int32, [4]> var_6140 = const()[name = string("op_6140"), val = tensor<int32, [4]>([1, 8, 128, 256])];
+            tensor<fp16, [1, 8, 128, 256]> value_heads_61_cast_fp16 = reshape(shape = var_6140, x = value_61_cast_fp16)[name = string("value_heads_61_cast_fp16")];
+            tensor<int32, [4]> var_6143_begin_0 = const()[name = string("op_6143_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_6143_end_0 = const()[name = string("op_6143_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_6143_end_mask_0 = const()[name = string("op_6143_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_6143_cast_fp16 = slice_by_index(begin = var_6143_begin_0, end = var_6143_end_0, end_mask = var_6143_end_mask_0, x = key_heads_61_cast_fp16)[name = string("op_6143_cast_fp16")];
+            tensor<int32, [4]> var_6147_begin_0 = const()[name = string("op_6147_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_6147_end_0 = const()[name = string("op_6147_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_6147_end_mask_0 = const()[name = string("op_6147_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_6147_cast_fp16 = slice_by_index(begin = var_6147_begin_0, end = var_6147_end_0, end_mask = var_6147_end_mask_0, x = value_heads_61_cast_fp16)[name = string("op_6147_cast_fp16")];
+            tensor<int32, [4]> var_6159_begin_0 = const()[name = string("op_6159_begin_0"), val = tensor<int32, [4]>([0, 1, 0, 0])];
+            tensor<int32, [4]> var_6159_end_0 = const()[name = string("op_6159_end_0"), val = tensor<int32, [4]>([1, 2, 128, 256])];
+            tensor<bool, [4]> var_6159_end_mask_0 = const()[name = string("op_6159_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_6159_cast_fp16 = slice_by_index(begin = var_6159_begin_0, end = var_6159_end_0, end_mask = var_6159_end_mask_0, x = key_heads_61_cast_fp16)[name = string("op_6159_cast_fp16")];
+            tensor<int32, [4]> var_6163_begin_0 = const()[name = string("op_6163_begin_0"), val = tensor<int32, [4]>([0, 1, 0, 0])];
+            tensor<int32, [4]> var_6163_end_0 = const()[name = string("op_6163_end_0"), val = tensor<int32, [4]>([1, 2, 128, 256])];
+            tensor<bool, [4]> var_6163_end_mask_0 = const()[name = string("op_6163_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_6163_cast_fp16 = slice_by_index(begin = var_6163_begin_0, end = var_6163_end_0, end_mask = var_6163_end_mask_0, x = value_heads_61_cast_fp16)[name = string("op_6163_cast_fp16")];
+            tensor<int32, [4]> var_6175_begin_0 = const()[name = string("op_6175_begin_0"), val = tensor<int32, [4]>([0, 2, 0, 0])];
+            tensor<int32, [4]> var_6175_end_0 = const()[name = string("op_6175_end_0"), val = tensor<int32, [4]>([1, 3, 128, 256])];
+            tensor<bool, [4]> var_6175_end_mask_0 = const()[name = string("op_6175_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_6175_cast_fp16 = slice_by_index(begin = var_6175_begin_0, end = var_6175_end_0, end_mask = var_6175_end_mask_0, x = key_heads_61_cast_fp16)[name = string("op_6175_cast_fp16")];
+            tensor<int32, [4]> var_6179_begin_0 = const()[name = string("op_6179_begin_0"), val = tensor<int32, [4]>([0, 2, 0, 0])];
+            tensor<int32, [4]> var_6179_end_0 = const()[name = string("op_6179_end_0"), val = tensor<int32, [4]>([1, 3, 128, 256])];
+            tensor<bool, [4]> var_6179_end_mask_0 = const()[name = string("op_6179_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_6179_cast_fp16 = slice_by_index(begin = var_6179_begin_0, end = var_6179_end_0, end_mask = var_6179_end_mask_0, x = value_heads_61_cast_fp16)[name = string("op_6179_cast_fp16")];
+            tensor<int32, [4]> var_6191_begin_0 = const()[name = string("op_6191_begin_0"), val = tensor<int32, [4]>([0, 3, 0, 0])];
+            tensor<int32, [4]> var_6191_end_0 = const()[name = string("op_6191_end_0"), val = tensor<int32, [4]>([1, 4, 128, 256])];
+            tensor<bool, [4]> var_6191_end_mask_0 = const()[name = string("op_6191_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_6191_cast_fp16 = slice_by_index(begin = var_6191_begin_0, end = var_6191_end_0, end_mask = var_6191_end_mask_0, x = key_heads_61_cast_fp16)[name = string("op_6191_cast_fp16")];
+            tensor<int32, [4]> var_6195_begin_0 = const()[name = string("op_6195_begin_0"), val = tensor<int32, [4]>([0, 3, 0, 0])];
+            tensor<int32, [4]> var_6195_end_0 = const()[name = string("op_6195_end_0"), val = tensor<int32, [4]>([1, 4, 128, 256])];
+            tensor<bool, [4]> var_6195_end_mask_0 = const()[name = string("op_6195_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_6195_cast_fp16 = slice_by_index(begin = var_6195_begin_0, end = var_6195_end_0, end_mask = var_6195_end_mask_0, x = value_heads_61_cast_fp16)[name = string("op_6195_cast_fp16")];
+            tensor<int32, [4]> var_6207_begin_0 = const()[name = string("op_6207_begin_0"), val = tensor<int32, [4]>([0, 4, 0, 0])];
+            tensor<int32, [4]> var_6207_end_0 = const()[name = string("op_6207_end_0"), val = tensor<int32, [4]>([1, 5, 128, 256])];
+            tensor<bool, [4]> var_6207_end_mask_0 = const()[name = string("op_6207_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_6207_cast_fp16 = slice_by_index(begin = var_6207_begin_0, end = var_6207_end_0, end_mask = var_6207_end_mask_0, x = key_heads_61_cast_fp16)[name = string("op_6207_cast_fp16")];
+            tensor<int32, [4]> var_6211_begin_0 = const()[name = string("op_6211_begin_0"), val = tensor<int32, [4]>([0, 4, 0, 0])];
+            tensor<int32, [4]> var_6211_end_0 = const()[name = string("op_6211_end_0"), val = tensor<int32, [4]>([1, 5, 128, 256])];
+            tensor<bool, [4]> var_6211_end_mask_0 = const()[name = string("op_6211_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_6211_cast_fp16 = slice_by_index(begin = var_6211_begin_0, end = var_6211_end_0, end_mask = var_6211_end_mask_0, x = value_heads_61_cast_fp16)[name = string("op_6211_cast_fp16")];
+            tensor<int32, [4]> var_6223_begin_0 = const()[name = string("op_6223_begin_0"), val = tensor<int32, [4]>([0, 5, 0, 0])];
+            tensor<int32, [4]> var_6223_end_0 = const()[name = string("op_6223_end_0"), val = tensor<int32, [4]>([1, 6, 128, 256])];
+            tensor<bool, [4]> var_6223_end_mask_0 = const()[name = string("op_6223_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_6223_cast_fp16 = slice_by_index(begin = var_6223_begin_0, end = var_6223_end_0, end_mask = var_6223_end_mask_0, x = key_heads_61_cast_fp16)[name = string("op_6223_cast_fp16")];
+            tensor<int32, [4]> var_6227_begin_0 = const()[name = string("op_6227_begin_0"), val = tensor<int32, [4]>([0, 5, 0, 0])];
+            tensor<int32, [4]> var_6227_end_0 = const()[name = string("op_6227_end_0"), val = tensor<int32, [4]>([1, 6, 128, 256])];
+            tensor<bool, [4]> var_6227_end_mask_0 = const()[name = string("op_6227_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_6227_cast_fp16 = slice_by_index(begin = var_6227_begin_0, end = var_6227_end_0, end_mask = var_6227_end_mask_0, x = value_heads_61_cast_fp16)[name = string("op_6227_cast_fp16")];
+            tensor<int32, [4]> var_6239_begin_0 = const()[name = string("op_6239_begin_0"), val = tensor<int32, [4]>([0, 6, 0, 0])];
+            tensor<int32, [4]> var_6239_end_0 = const()[name = string("op_6239_end_0"), val = tensor<int32, [4]>([1, 7, 128, 256])];
+            tensor<bool, [4]> var_6239_end_mask_0 = const()[name = string("op_6239_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_6239_cast_fp16 = slice_by_index(begin = var_6239_begin_0, end = var_6239_end_0, end_mask = var_6239_end_mask_0, x = key_heads_61_cast_fp16)[name = string("op_6239_cast_fp16")];
+            tensor<int32, [4]> var_6243_begin_0 = const()[name = string("op_6243_begin_0"), val = tensor<int32, [4]>([0, 6, 0, 0])];
+            tensor<int32, [4]> var_6243_end_0 = const()[name = string("op_6243_end_0"), val = tensor<int32, [4]>([1, 7, 128, 256])];
+            tensor<bool, [4]> var_6243_end_mask_0 = const()[name = string("op_6243_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_6243_cast_fp16 = slice_by_index(begin = var_6243_begin_0, end = var_6243_end_0, end_mask = var_6243_end_mask_0, x = value_heads_61_cast_fp16)[name = string("op_6243_cast_fp16")];
+            tensor<int32, [4]> var_6255_begin_0 = const()[name = string("op_6255_begin_0"), val = tensor<int32, [4]>([0, 7, 0, 0])];
+            tensor<int32, [4]> var_6255_end_0 = const()[name = string("op_6255_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_6255_end_mask_0 = const()[name = string("op_6255_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_6255_cast_fp16 = slice_by_index(begin = var_6255_begin_0, end = var_6255_end_0, end_mask = var_6255_end_mask_0, x = key_heads_61_cast_fp16)[name = string("op_6255_cast_fp16")];
+            tensor<int32, [4]> var_6259_begin_0 = const()[name = string("op_6259_begin_0"), val = tensor<int32, [4]>([0, 7, 0, 0])];
+            tensor<int32, [4]> var_6259_end_0 = const()[name = string("op_6259_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_6259_end_mask_0 = const()[name = string("op_6259_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_6259_cast_fp16 = slice_by_index(begin = var_6259_begin_0, end = var_6259_end_0, end_mask = var_6259_end_mask_0, x = value_heads_61_cast_fp16)[name = string("op_6259_cast_fp16")];
+            bool key_heads_63_interleave_0 = const()[name = string("key_heads_63_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 256]> key_heads_63_cast_fp16 = concat(axis = var_5985, interleave = key_heads_63_interleave_0, values = (var_6143_cast_fp16, var_6143_cast_fp16, var_6159_cast_fp16, var_6159_cast_fp16, var_6175_cast_fp16, var_6175_cast_fp16, var_6191_cast_fp16, var_6191_cast_fp16, var_6207_cast_fp16, var_6207_cast_fp16, var_6223_cast_fp16, var_6223_cast_fp16, var_6239_cast_fp16, var_6239_cast_fp16, var_6255_cast_fp16, var_6255_cast_fp16))[name = string("key_heads_63_cast_fp16")];
+            bool value_heads_63_interleave_0 = const()[name = string("value_heads_63_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 256]> value_heads_63_cast_fp16 = concat(axis = var_5985, interleave = value_heads_63_interleave_0, values = (var_6147_cast_fp16, var_6147_cast_fp16, var_6163_cast_fp16, var_6163_cast_fp16, var_6179_cast_fp16, var_6179_cast_fp16, var_6195_cast_fp16, var_6195_cast_fp16, var_6211_cast_fp16, var_6211_cast_fp16, var_6227_cast_fp16, var_6227_cast_fp16, var_6243_cast_fp16, var_6243_cast_fp16, var_6259_cast_fp16, var_6259_cast_fp16))[name = string("value_heads_63_cast_fp16")];
+            fp16 var_6282_to_fp16 = const()[name = string("op_6282_to_fp16"), val = fp16(0x1.6ap-4)];
+            tensor<fp16, [1, 16, 128, 1]> var_6283_cast_fp16 = mul(x = mh_q_93_cast_fp16, y = var_6282_to_fp16)[name = string("op_6283_cast_fp16")];
+            bool mh_w_61_transpose_x_0 = const()[name = string("mh_w_61_transpose_x_0"), val = bool(true)];
+            bool mh_w_61_transpose_y_0 = const()[name = string("mh_w_61_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 1, 256]> mh_w_61_cast_fp16 = matmul(transpose_x = mh_w_61_transpose_x_0, transpose_y = mh_w_61_transpose_y_0, x = var_6283_cast_fp16, y = key_heads_63_cast_fp16)[name = string("mh_w_61_cast_fp16")];
+            tensor<fp16, [1, 16, 1, 256]> mh_w_63_cast_fp16 = add(x = mh_w_61_cast_fp16, y = var_487_cast_fp16)[name = string("mh_w_63_cast_fp16")];
+            tensor<fp16, [1, 16, 1, 256]> var_6295_cast_fp16 = softmax(axis = var_5967, x = mh_w_63_cast_fp16)[name = string("op_6295_cast_fp16")];
+            bool attn_31_transpose_x_0 = const()[name = string("attn_31_transpose_x_0"), val = bool(false)];
+            bool attn_31_transpose_y_0 = const()[name = string("attn_31_transpose_y_0"), val = bool(true)];
+            tensor<fp16, [1, 16, 128, 1]> attn_31_cast_fp16 = matmul(transpose_x = attn_31_transpose_x_0, transpose_y = attn_31_transpose_y_0, x = value_heads_63_cast_fp16, y = var_6295_cast_fp16)[name = string("attn_31_cast_fp16")];
+            tensor<int32, [4]> var_6300 = const()[name = string("op_6300"), val = tensor<int32, [4]>([1, -1, 1, 1])];
+            tensor<fp16, [1, 2048, 1, 1]> input_121_cast_fp16 = reshape(shape = var_6300, x = attn_31_cast_fp16)[name = string("input_121_cast_fp16")];
+            string obj_131_pad_type_0 = const()[name = string("obj_131_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> obj_131_strides_0 = const()[name = string("obj_131_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> obj_131_pad_0 = const()[name = string("obj_131_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> obj_131_dilations_0 = const()[name = string("obj_131_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 obj_131_groups_0 = const()[name = string("obj_131_groups_0"), val = int32(1)];
+            tensor<fp16, [2048, 2048, 1, 1]> layers_15_self_attn_o_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [2048, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(763645184))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(767839552))))[name = string("layers_15_self_attn_o_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1, 1]> obj_131_cast_fp16 = conv(bias = layers_0_self_attn_q_proj_bias_to_fp16, dilations = obj_131_dilations_0, groups = obj_131_groups_0, pad = obj_131_pad_0, pad_type = obj_131_pad_type_0, strides = obj_131_strides_0, weight = layers_15_self_attn_o_proj_weight_to_fp16_palettized, x = input_121_cast_fp16)[name = string("obj_131_cast_fp16")];
+            tensor<fp16, [1, 2048, 1, 1]> inputs_125_cast_fp16 = add(x = inputs_119_cast_fp16, y = obj_131_cast_fp16)[name = string("inputs_125_cast_fp16")];
+            tensor<fp16, [1, 2048, 1, 1]> inputs_sq_127_cast_fp16 = mul(x = inputs_125_cast_fp16, y = inputs_125_cast_fp16)[name = string("inputs_sq_127_cast_fp16")];
+            tensor<int32, [1]> variance_127_axes_0 = const()[name = string("variance_127_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_127_keep_dims_0 = const()[name = string("variance_127_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 1, 1, 1]> variance_127_cast_fp16 = reduce_mean(axes = variance_127_axes_0, keep_dims = variance_127_keep_dims_0, x = inputs_sq_127_cast_fp16)[name = string("variance_127_cast_fp16")];
+            fp16 var_6318_to_fp16 = const()[name = string("op_6318_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 1, 1]> var_6319_cast_fp16 = add(x = variance_127_cast_fp16, y = var_6318_to_fp16)[name = string("op_6319_cast_fp16")];
+            fp32 var_6320_epsilon_0 = const()[name = string("op_6320_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 1, 1, 1]> var_6320_cast_fp16 = rsqrt(epsilon = var_6320_epsilon_0, x = var_6319_cast_fp16)[name = string("op_6320_cast_fp16")];
+            tensor<fp16, [1, 2048, 1, 1]> hidden_states_157_cast_fp16 = mul(x = inputs_125_cast_fp16, y = var_6320_cast_fp16)[name = string("hidden_states_157_cast_fp16")];
+            tensor<fp16, [1, 2048, 1, 1]> w_127_to_fp16 = const()[name = string("w_127_to_fp16"), val = tensor<fp16, [1, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(767840128)))];
+            tensor<fp16, [1, 2048, 1, 1]> input_123_cast_fp16 = mul(x = w_127_to_fp16, y = hidden_states_157_cast_fp16)[name = string("input_123_cast_fp16")];
+            string input_125_pad_type_0 = const()[name = string("input_125_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> input_125_strides_0 = const()[name = string("input_125_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> input_125_pad_0 = const()[name = string("input_125_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> input_125_dilations_0 = const()[name = string("input_125_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 input_125_groups_0 = const()[name = string("input_125_groups_0"), val = int32(1)];
+            tensor<fp16, [6144, 2048, 1, 1]> layers_15_mlp_gate_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [6144, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(767844288))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(780427264))))[name = string("layers_15_mlp_gate_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 6144, 1, 1]> input_125_cast_fp16 = conv(dilations = input_125_dilations_0, groups = input_125_groups_0, pad = input_125_pad_0, pad_type = input_125_pad_type_0, strides = input_125_strides_0, weight = layers_15_mlp_gate_proj_weight_to_fp16_palettized, x = input_123_cast_fp16)[name = string("input_125_cast_fp16")];
+            tensor<fp16, [1, 6144, 1, 1]> var_6334_cast_fp16 = silu(x = input_125_cast_fp16)[name = string("op_6334_cast_fp16")];
+            string var_6340_pad_type_0 = const()[name = string("op_6340_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_6340_strides_0 = const()[name = string("op_6340_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_6340_pad_0 = const()[name = string("op_6340_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_6340_dilations_0 = const()[name = string("op_6340_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_6340_groups_0 = const()[name = string("op_6340_groups_0"), val = int32(1)];
+            tensor<fp16, [6144, 2048, 1, 1]> layers_15_mlp_up_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [6144, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(780427840))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(793010816))))[name = string("layers_15_mlp_up_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 6144, 1, 1]> var_6340_cast_fp16 = conv(dilations = var_6340_dilations_0, groups = var_6340_groups_0, pad = var_6340_pad_0, pad_type = var_6340_pad_type_0, strides = var_6340_strides_0, weight = layers_15_mlp_up_proj_weight_to_fp16_palettized, x = input_123_cast_fp16)[name = string("op_6340_cast_fp16")];
+            tensor<fp16, [1, 6144, 1, 1]> input_127_cast_fp16 = mul(x = var_6334_cast_fp16, y = var_6340_cast_fp16)[name = string("input_127_cast_fp16")];
+            string hidden_states_159_pad_type_0 = const()[name = string("hidden_states_159_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> hidden_states_159_strides_0 = const()[name = string("hidden_states_159_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> hidden_states_159_pad_0 = const()[name = string("hidden_states_159_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> hidden_states_159_dilations_0 = const()[name = string("hidden_states_159_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 hidden_states_159_groups_0 = const()[name = string("hidden_states_159_groups_0"), val = int32(1)];
+            tensor<fp16, [2048, 6144, 1, 1]> layers_15_mlp_down_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [2048, 6144, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(793011392))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(805594368))))[name = string("layers_15_mlp_down_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1, 1]> hidden_states_159_cast_fp16 = conv(dilations = hidden_states_159_dilations_0, groups = hidden_states_159_groups_0, pad = hidden_states_159_pad_0, pad_type = hidden_states_159_pad_type_0, strides = hidden_states_159_strides_0, weight = layers_15_mlp_down_proj_weight_to_fp16_palettized, x = input_127_cast_fp16)[name = string("hidden_states_159_cast_fp16")];
+            tensor<fp16, [1, 2048, 1, 1]> inputs_127_cast_fp16 = add(x = inputs_125_cast_fp16, y = hidden_states_159_cast_fp16)[name = string("inputs_127_cast_fp16")];
+            int32 var_6354 = const()[name = string("op_6354"), val = int32(3)];
+            int32 var_6364 = const()[name = string("op_6364"), val = int32(-2)];
+            int32 var_6372 = const()[name = string("op_6372"), val = int32(1)];
+            tensor<fp16, [1, 2048, 1, 1]> inputs_sq_129_cast_fp16 = mul(x = inputs_127_cast_fp16, y = inputs_127_cast_fp16)[name = string("inputs_sq_129_cast_fp16")];
+            tensor<int32, [1]> variance_129_axes_0 = const()[name = string("variance_129_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_129_keep_dims_0 = const()[name = string("variance_129_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 1, 1, 1]> variance_129_cast_fp16 = reduce_mean(axes = variance_129_axes_0, keep_dims = variance_129_keep_dims_0, x = inputs_sq_129_cast_fp16)[name = string("variance_129_cast_fp16")];
+            fp16 var_6384_to_fp16 = const()[name = string("op_6384_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 1, 1]> var_6385_cast_fp16 = add(x = variance_129_cast_fp16, y = var_6384_to_fp16)[name = string("op_6385_cast_fp16")];
+            fp32 var_6386_epsilon_0 = const()[name = string("op_6386_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 1, 1, 1]> var_6386_cast_fp16 = rsqrt(epsilon = var_6386_epsilon_0, x = var_6385_cast_fp16)[name = string("op_6386_cast_fp16")];
+            tensor<fp16, [1, 2048, 1, 1]> hidden_states_161_cast_fp16 = mul(x = inputs_127_cast_fp16, y = var_6386_cast_fp16)[name = string("hidden_states_161_cast_fp16")];
+            tensor<fp16, [1, 2048, 1, 1]> w_129_to_fp16 = const()[name = string("w_129_to_fp16"), val = tensor<fp16, [1, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(805594944)))];
+            tensor<fp16, [1, 2048, 1, 1]> obj_133_cast_fp16 = mul(x = w_129_to_fp16, y = hidden_states_161_cast_fp16)[name = string("obj_133_cast_fp16")];
+            string query_97_pad_type_0 = const()[name = string("query_97_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> query_97_strides_0 = const()[name = string("query_97_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> query_97_pad_0 = const()[name = string("query_97_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> query_97_dilations_0 = const()[name = string("query_97_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 query_97_groups_0 = const()[name = string("query_97_groups_0"), val = int32(1)];
+            tensor<fp16, [2048, 2048, 1, 1]> layers_16_self_attn_q_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [2048, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(805599104))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(809793472))))[name = string("layers_16_self_attn_q_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1, 1]> query_97_cast_fp16 = conv(bias = layers_0_self_attn_q_proj_bias_to_fp16, dilations = query_97_dilations_0, groups = query_97_groups_0, pad = query_97_pad_0, pad_type = query_97_pad_type_0, strides = query_97_strides_0, weight = layers_16_self_attn_q_proj_weight_to_fp16_palettized, x = obj_133_cast_fp16)[name = string("query_97_cast_fp16")];
+            string current_key_65_pad_type_0 = const()[name = string("current_key_65_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> current_key_65_strides_0 = const()[name = string("current_key_65_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> current_key_65_pad_0 = const()[name = string("current_key_65_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> current_key_65_dilations_0 = const()[name = string("current_key_65_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 current_key_65_groups_0 = const()[name = string("current_key_65_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 2048, 1, 1]> layers_16_self_attn_k_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(809794048))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(811891264))))[name = string("layers_16_self_attn_k_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> current_key_65_cast_fp16 = conv(dilations = current_key_65_dilations_0, groups = current_key_65_groups_0, pad = current_key_65_pad_0, pad_type = current_key_65_pad_type_0, strides = current_key_65_strides_0, weight = layers_16_self_attn_k_proj_weight_to_fp16_palettized, x = obj_133_cast_fp16)[name = string("current_key_65_cast_fp16")];
+            string current_value_33_pad_type_0 = const()[name = string("current_value_33_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> current_value_33_strides_0 = const()[name = string("current_value_33_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> current_value_33_pad_0 = const()[name = string("current_value_33_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> current_value_33_dilations_0 = const()[name = string("current_value_33_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 current_value_33_groups_0 = const()[name = string("current_value_33_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 2048, 1, 1]> layers_16_self_attn_v_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(811891840))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(813989056))))[name = string("layers_16_self_attn_v_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> current_value_33_cast_fp16 = conv(bias = layers_0_self_attn_v_proj_bias_to_fp16, dilations = current_value_33_dilations_0, groups = current_value_33_groups_0, pad = current_value_33_pad_0, pad_type = current_value_33_pad_type_0, strides = current_value_33_strides_0, weight = layers_16_self_attn_v_proj_weight_to_fp16_palettized, x = obj_133_cast_fp16)[name = string("current_value_33_cast_fp16")];
+            tensor<int32, [4]> var_6423 = const()[name = string("op_6423"), val = tensor<int32, [4]>([16, 128, 1, 1])];
+            tensor<fp16, [16, 128, 1, 1]> inputs_129_cast_fp16 = reshape(shape = var_6423, x = query_97_cast_fp16)[name = string("inputs_129_cast_fp16")];
+            tensor<fp16, [16, 128, 1, 1]> inputs_sq_131_cast_fp16 = mul(x = inputs_129_cast_fp16, y = inputs_129_cast_fp16)[name = string("inputs_sq_131_cast_fp16")];
+            tensor<int32, [1]> variance_131_axes_0 = const()[name = string("variance_131_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_131_keep_dims_0 = const()[name = string("variance_131_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [16, 1, 1, 1]> variance_131_cast_fp16 = reduce_mean(axes = variance_131_axes_0, keep_dims = variance_131_keep_dims_0, x = inputs_sq_131_cast_fp16)[name = string("variance_131_cast_fp16")];
+            fp16 var_6429_to_fp16 = const()[name = string("op_6429_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [16, 1, 1, 1]> var_6430_cast_fp16 = add(x = variance_131_cast_fp16, y = var_6429_to_fp16)[name = string("op_6430_cast_fp16")];
+            fp32 var_6431_epsilon_0 = const()[name = string("op_6431_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [16, 1, 1, 1]> var_6431_cast_fp16 = rsqrt(epsilon = var_6431_epsilon_0, x = var_6430_cast_fp16)[name = string("op_6431_cast_fp16")];
+            tensor<fp16, [16, 128, 1, 1]> hidden_states_163_cast_fp16 = mul(x = inputs_129_cast_fp16, y = var_6431_cast_fp16)[name = string("hidden_states_163_cast_fp16")];
+            tensor<fp16, [1, 128, 1, 1]> w_131_to_fp16 = const()[name = string("w_131_to_fp16"), val = tensor<fp16, [1, 128, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(813989632)))];
+            tensor<fp16, [16, 128, 1, 1]> query_normed_33_cast_fp16 = mul(x = w_131_to_fp16, y = hidden_states_163_cast_fp16)[name = string("query_normed_33_cast_fp16")];
+            tensor<int32, [4]> var_6439 = const()[name = string("op_6439"), val = tensor<int32, [4]>([8, 128, 1, 1])];
+            tensor<fp16, [8, 128, 1, 1]> inputs_131_cast_fp16 = reshape(shape = var_6439, x = current_key_65_cast_fp16)[name = string("inputs_131_cast_fp16")];
+            tensor<fp16, [8, 128, 1, 1]> inputs_sq_133_cast_fp16 = mul(x = inputs_131_cast_fp16, y = inputs_131_cast_fp16)[name = string("inputs_sq_133_cast_fp16")];
+            tensor<int32, [1]> variance_133_axes_0 = const()[name = string("variance_133_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_133_keep_dims_0 = const()[name = string("variance_133_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [8, 1, 1, 1]> variance_133_cast_fp16 = reduce_mean(axes = variance_133_axes_0, keep_dims = variance_133_keep_dims_0, x = inputs_sq_133_cast_fp16)[name = string("variance_133_cast_fp16")];
+            fp16 var_6445_to_fp16 = const()[name = string("op_6445_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [8, 1, 1, 1]> var_6446_cast_fp16 = add(x = variance_133_cast_fp16, y = var_6445_to_fp16)[name = string("op_6446_cast_fp16")];
+            fp32 var_6447_epsilon_0 = const()[name = string("op_6447_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [8, 1, 1, 1]> var_6447_cast_fp16 = rsqrt(epsilon = var_6447_epsilon_0, x = var_6446_cast_fp16)[name = string("op_6447_cast_fp16")];
+            tensor<fp16, [8, 128, 1, 1]> hidden_states_165_cast_fp16 = mul(x = inputs_131_cast_fp16, y = var_6447_cast_fp16)[name = string("hidden_states_165_cast_fp16")];
+            tensor<fp16, [1, 128, 1, 1]> w_133_to_fp16 = const()[name = string("w_133_to_fp16"), val = tensor<fp16, [1, 128, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(813989952)))];
+            tensor<fp16, [8, 128, 1, 1]> current_key_normed_33_cast_fp16 = mul(x = w_133_to_fp16, y = hidden_states_165_cast_fp16)[name = string("current_key_normed_33_cast_fp16")];
+            tensor<int32, [4]> var_6465 = const()[name = string("op_6465"), val = tensor<int32, [4]>([1, 16, 128, -1])];
+            tensor<fp16, [1, 16, 128, 1]> mh_q_97_cast_fp16 = reshape(shape = var_6465, x = query_normed_33_cast_fp16)[name = string("mh_q_97_cast_fp16")];
+            tensor<int32, [4]> var_6467 = const()[name = string("op_6467"), val = tensor<int32, [4]>([1, 8, 128, -1])];
+            tensor<fp16, [1, 8, 128, 1]> mh_k_65_cast_fp16 = reshape(shape = var_6467, x = current_key_normed_33_cast_fp16)[name = string("mh_k_65_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> var_6471_cast_fp16 = mul(x = mh_q_97_cast_fp16, y = cos_1_cast_fp16)[name = string("op_6471_cast_fp16")];
+            tensor<int32, [4]> var_6476_begin_0 = const()[name = string("op_6476_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_6476_end_0 = const()[name = string("op_6476_end_0"), val = tensor<int32, [4]>([1, 16, 64, 1])];
+            tensor<bool, [4]> var_6476_end_mask_0 = const()[name = string("op_6476_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 16, 64, 1]> var_6476_cast_fp16 = slice_by_index(begin = var_6476_begin_0, end = var_6476_end_0, end_mask = var_6476_end_mask_0, x = mh_q_97_cast_fp16)[name = string("op_6476_cast_fp16")];
+            tensor<int32, [4]> var_6482_begin_0 = const()[name = string("op_6482_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_6482_end_0 = const()[name = string("op_6482_end_0"), val = tensor<int32, [4]>([1, 16, 128, 1])];
+            tensor<bool, [4]> var_6482_end_mask_0 = const()[name = string("op_6482_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 16, 64, 1]> var_6482_cast_fp16 = slice_by_index(begin = var_6482_begin_0, end = var_6482_end_0, end_mask = var_6482_end_mask_0, x = mh_q_97_cast_fp16)[name = string("op_6482_cast_fp16")];
+            fp16 const_385_promoted_to_fp16 = const()[name = string("const_385_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 16, 64, 1]> var_6484_cast_fp16 = mul(x = var_6482_cast_fp16, y = const_385_promoted_to_fp16)[name = string("op_6484_cast_fp16")];
+            bool var_6486_interleave_0 = const()[name = string("op_6486_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 1]> var_6486_cast_fp16 = concat(axis = var_6364, interleave = var_6486_interleave_0, values = (var_6484_cast_fp16, var_6476_cast_fp16))[name = string("op_6486_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> var_6487_cast_fp16 = mul(x = var_6486_cast_fp16, y = sin_1_cast_fp16)[name = string("op_6487_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> mh_q_99_cast_fp16 = add(x = var_6471_cast_fp16, y = var_6487_cast_fp16)[name = string("mh_q_99_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> var_6489_cast_fp16 = mul(x = mh_k_65_cast_fp16, y = cos_1_cast_fp16)[name = string("op_6489_cast_fp16")];
+            tensor<int32, [4]> var_6494_begin_0 = const()[name = string("op_6494_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_6494_end_0 = const()[name = string("op_6494_end_0"), val = tensor<int32, [4]>([1, 8, 64, 1])];
+            tensor<bool, [4]> var_6494_end_mask_0 = const()[name = string("op_6494_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 8, 64, 1]> var_6494_cast_fp16 = slice_by_index(begin = var_6494_begin_0, end = var_6494_end_0, end_mask = var_6494_end_mask_0, x = mh_k_65_cast_fp16)[name = string("op_6494_cast_fp16")];
+            tensor<int32, [4]> var_6500_begin_0 = const()[name = string("op_6500_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_6500_end_0 = const()[name = string("op_6500_end_0"), val = tensor<int32, [4]>([1, 8, 128, 1])];
+            tensor<bool, [4]> var_6500_end_mask_0 = const()[name = string("op_6500_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 8, 64, 1]> var_6500_cast_fp16 = slice_by_index(begin = var_6500_begin_0, end = var_6500_end_0, end_mask = var_6500_end_mask_0, x = mh_k_65_cast_fp16)[name = string("op_6500_cast_fp16")];
+            fp16 const_388_promoted_to_fp16 = const()[name = string("const_388_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 64, 1]> var_6502_cast_fp16 = mul(x = var_6500_cast_fp16, y = const_388_promoted_to_fp16)[name = string("op_6502_cast_fp16")];
+            bool var_6504_interleave_0 = const()[name = string("op_6504_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 128, 1]> var_6504_cast_fp16 = concat(axis = var_6364, interleave = var_6504_interleave_0, values = (var_6502_cast_fp16, var_6494_cast_fp16))[name = string("op_6504_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> var_6505_cast_fp16 = mul(x = var_6504_cast_fp16, y = sin_1_cast_fp16)[name = string("op_6505_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> mh_k_67_cast_fp16 = add(x = var_6489_cast_fp16, y = var_6505_cast_fp16)[name = string("mh_k_67_cast_fp16")];
+            tensor<int32, [4]> var_6509 = const()[name = string("op_6509"), val = tensor<int32, [4]>([1, 1024, 1, 1])];
+            tensor<fp16, [1, 1024, 1, 1]> current_key_67_cast_fp16 = reshape(shape = var_6509, x = mh_k_67_cast_fp16)[name = string("current_key_67_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_6516_cast_fp16 = mul(x = var_101_cast_fp16_16, y = var_323_cast_fp16)[name = string("op_6516_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_6517_cast_fp16 = mul(x = current_key_67_cast_fp16, y = var_321_cast_fp16)[name = string("op_6517_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> key_99_cast_fp16 = add(x = var_6516_cast_fp16, y = var_6517_cast_fp16)[name = string("key_99_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_6520_cast_fp16 = mul(x = var_132_cast_fp16_16, y = var_323_cast_fp16)[name = string("op_6520_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_6521_cast_fp16 = mul(x = current_value_33_cast_fp16, y = var_321_cast_fp16)[name = string("op_6521_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> value_65_cast_fp16 = add(x = var_6520_cast_fp16, y = var_6521_cast_fp16)[name = string("value_65_cast_fp16")];
+            tensor<int32, [4]> var_6525 = const()[name = string("op_6525"), val = tensor<int32, [4]>([1, 8, 128, 256])];
+            tensor<fp16, [1, 8, 128, 256]> key_heads_65_cast_fp16 = reshape(shape = var_6525, x = key_99_cast_fp16)[name = string("key_heads_65_cast_fp16")];
+            tensor<int32, [4]> var_6527 = const()[name = string("op_6527"), val = tensor<int32, [4]>([1, 8, 128, 256])];
+            tensor<fp16, [1, 8, 128, 256]> value_heads_65_cast_fp16 = reshape(shape = var_6527, x = value_65_cast_fp16)[name = string("value_heads_65_cast_fp16")];
+            tensor<int32, [4]> var_6530_begin_0 = const()[name = string("op_6530_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_6530_end_0 = const()[name = string("op_6530_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_6530_end_mask_0 = const()[name = string("op_6530_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_6530_cast_fp16 = slice_by_index(begin = var_6530_begin_0, end = var_6530_end_0, end_mask = var_6530_end_mask_0, x = key_heads_65_cast_fp16)[name = string("op_6530_cast_fp16")];
+            tensor<int32, [4]> var_6534_begin_0 = const()[name = string("op_6534_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_6534_end_0 = const()[name = string("op_6534_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_6534_end_mask_0 = const()[name = string("op_6534_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_6534_cast_fp16 = slice_by_index(begin = var_6534_begin_0, end = var_6534_end_0, end_mask = var_6534_end_mask_0, x = value_heads_65_cast_fp16)[name = string("op_6534_cast_fp16")];
+            tensor<int32, [4]> var_6546_begin_0 = const()[name = string("op_6546_begin_0"), val = tensor<int32, [4]>([0, 1, 0, 0])];
+            tensor<int32, [4]> var_6546_end_0 = const()[name = string("op_6546_end_0"), val = tensor<int32, [4]>([1, 2, 128, 256])];
+            tensor<bool, [4]> var_6546_end_mask_0 = const()[name = string("op_6546_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_6546_cast_fp16 = slice_by_index(begin = var_6546_begin_0, end = var_6546_end_0, end_mask = var_6546_end_mask_0, x = key_heads_65_cast_fp16)[name = string("op_6546_cast_fp16")];
+            tensor<int32, [4]> var_6550_begin_0 = const()[name = string("op_6550_begin_0"), val = tensor<int32, [4]>([0, 1, 0, 0])];
+            tensor<int32, [4]> var_6550_end_0 = const()[name = string("op_6550_end_0"), val = tensor<int32, [4]>([1, 2, 128, 256])];
+            tensor<bool, [4]> var_6550_end_mask_0 = const()[name = string("op_6550_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_6550_cast_fp16 = slice_by_index(begin = var_6550_begin_0, end = var_6550_end_0, end_mask = var_6550_end_mask_0, x = value_heads_65_cast_fp16)[name = string("op_6550_cast_fp16")];
+            tensor<int32, [4]> var_6562_begin_0 = const()[name = string("op_6562_begin_0"), val = tensor<int32, [4]>([0, 2, 0, 0])];
+            tensor<int32, [4]> var_6562_end_0 = const()[name = string("op_6562_end_0"), val = tensor<int32, [4]>([1, 3, 128, 256])];
+            tensor<bool, [4]> var_6562_end_mask_0 = const()[name = string("op_6562_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_6562_cast_fp16 = slice_by_index(begin = var_6562_begin_0, end = var_6562_end_0, end_mask = var_6562_end_mask_0, x = key_heads_65_cast_fp16)[name = string("op_6562_cast_fp16")];
+            tensor<int32, [4]> var_6566_begin_0 = const()[name = string("op_6566_begin_0"), val = tensor<int32, [4]>([0, 2, 0, 0])];
+            tensor<int32, [4]> var_6566_end_0 = const()[name = string("op_6566_end_0"), val = tensor<int32, [4]>([1, 3, 128, 256])];
+            tensor<bool, [4]> var_6566_end_mask_0 = const()[name = string("op_6566_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_6566_cast_fp16 = slice_by_index(begin = var_6566_begin_0, end = var_6566_end_0, end_mask = var_6566_end_mask_0, x = value_heads_65_cast_fp16)[name = string("op_6566_cast_fp16")];
+            tensor<int32, [4]> var_6578_begin_0 = const()[name = string("op_6578_begin_0"), val = tensor<int32, [4]>([0, 3, 0, 0])];
+            tensor<int32, [4]> var_6578_end_0 = const()[name = string("op_6578_end_0"), val = tensor<int32, [4]>([1, 4, 128, 256])];
+            tensor<bool, [4]> var_6578_end_mask_0 = const()[name = string("op_6578_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_6578_cast_fp16 = slice_by_index(begin = var_6578_begin_0, end = var_6578_end_0, end_mask = var_6578_end_mask_0, x = key_heads_65_cast_fp16)[name = string("op_6578_cast_fp16")];
+            tensor<int32, [4]> var_6582_begin_0 = const()[name = string("op_6582_begin_0"), val = tensor<int32, [4]>([0, 3, 0, 0])];
+            tensor<int32, [4]> var_6582_end_0 = const()[name = string("op_6582_end_0"), val = tensor<int32, [4]>([1, 4, 128, 256])];
+            tensor<bool, [4]> var_6582_end_mask_0 = const()[name = string("op_6582_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_6582_cast_fp16 = slice_by_index(begin = var_6582_begin_0, end = var_6582_end_0, end_mask = var_6582_end_mask_0, x = value_heads_65_cast_fp16)[name = string("op_6582_cast_fp16")];
+            tensor<int32, [4]> var_6594_begin_0 = const()[name = string("op_6594_begin_0"), val = tensor<int32, [4]>([0, 4, 0, 0])];
+            tensor<int32, [4]> var_6594_end_0 = const()[name = string("op_6594_end_0"), val = tensor<int32, [4]>([1, 5, 128, 256])];
+            tensor<bool, [4]> var_6594_end_mask_0 = const()[name = string("op_6594_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_6594_cast_fp16 = slice_by_index(begin = var_6594_begin_0, end = var_6594_end_0, end_mask = var_6594_end_mask_0, x = key_heads_65_cast_fp16)[name = string("op_6594_cast_fp16")];
+            tensor<int32, [4]> var_6598_begin_0 = const()[name = string("op_6598_begin_0"), val = tensor<int32, [4]>([0, 4, 0, 0])];
+            tensor<int32, [4]> var_6598_end_0 = const()[name = string("op_6598_end_0"), val = tensor<int32, [4]>([1, 5, 128, 256])];
+            tensor<bool, [4]> var_6598_end_mask_0 = const()[name = string("op_6598_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_6598_cast_fp16 = slice_by_index(begin = var_6598_begin_0, end = var_6598_end_0, end_mask = var_6598_end_mask_0, x = value_heads_65_cast_fp16)[name = string("op_6598_cast_fp16")];
+            tensor<int32, [4]> var_6610_begin_0 = const()[name = string("op_6610_begin_0"), val = tensor<int32, [4]>([0, 5, 0, 0])];
+            tensor<int32, [4]> var_6610_end_0 = const()[name = string("op_6610_end_0"), val = tensor<int32, [4]>([1, 6, 128, 256])];
+            tensor<bool, [4]> var_6610_end_mask_0 = const()[name = string("op_6610_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_6610_cast_fp16 = slice_by_index(begin = var_6610_begin_0, end = var_6610_end_0, end_mask = var_6610_end_mask_0, x = key_heads_65_cast_fp16)[name = string("op_6610_cast_fp16")];
+            tensor<int32, [4]> var_6614_begin_0 = const()[name = string("op_6614_begin_0"), val = tensor<int32, [4]>([0, 5, 0, 0])];
+            tensor<int32, [4]> var_6614_end_0 = const()[name = string("op_6614_end_0"), val = tensor<int32, [4]>([1, 6, 128, 256])];
+            tensor<bool, [4]> var_6614_end_mask_0 = const()[name = string("op_6614_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_6614_cast_fp16 = slice_by_index(begin = var_6614_begin_0, end = var_6614_end_0, end_mask = var_6614_end_mask_0, x = value_heads_65_cast_fp16)[name = string("op_6614_cast_fp16")];
+            tensor<int32, [4]> var_6626_begin_0 = const()[name = string("op_6626_begin_0"), val = tensor<int32, [4]>([0, 6, 0, 0])];
+            tensor<int32, [4]> var_6626_end_0 = const()[name = string("op_6626_end_0"), val = tensor<int32, [4]>([1, 7, 128, 256])];
+            tensor<bool, [4]> var_6626_end_mask_0 = const()[name = string("op_6626_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_6626_cast_fp16 = slice_by_index(begin = var_6626_begin_0, end = var_6626_end_0, end_mask = var_6626_end_mask_0, x = key_heads_65_cast_fp16)[name = string("op_6626_cast_fp16")];
+            tensor<int32, [4]> var_6630_begin_0 = const()[name = string("op_6630_begin_0"), val = tensor<int32, [4]>([0, 6, 0, 0])];
+            tensor<int32, [4]> var_6630_end_0 = const()[name = string("op_6630_end_0"), val = tensor<int32, [4]>([1, 7, 128, 256])];
+            tensor<bool, [4]> var_6630_end_mask_0 = const()[name = string("op_6630_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_6630_cast_fp16 = slice_by_index(begin = var_6630_begin_0, end = var_6630_end_0, end_mask = var_6630_end_mask_0, x = value_heads_65_cast_fp16)[name = string("op_6630_cast_fp16")];
+            tensor<int32, [4]> var_6642_begin_0 = const()[name = string("op_6642_begin_0"), val = tensor<int32, [4]>([0, 7, 0, 0])];
+            tensor<int32, [4]> var_6642_end_0 = const()[name = string("op_6642_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_6642_end_mask_0 = const()[name = string("op_6642_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_6642_cast_fp16 = slice_by_index(begin = var_6642_begin_0, end = var_6642_end_0, end_mask = var_6642_end_mask_0, x = key_heads_65_cast_fp16)[name = string("op_6642_cast_fp16")];
+            tensor<int32, [4]> var_6646_begin_0 = const()[name = string("op_6646_begin_0"), val = tensor<int32, [4]>([0, 7, 0, 0])];
+            tensor<int32, [4]> var_6646_end_0 = const()[name = string("op_6646_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_6646_end_mask_0 = const()[name = string("op_6646_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_6646_cast_fp16 = slice_by_index(begin = var_6646_begin_0, end = var_6646_end_0, end_mask = var_6646_end_mask_0, x = value_heads_65_cast_fp16)[name = string("op_6646_cast_fp16")];
+            bool key_heads_67_interleave_0 = const()[name = string("key_heads_67_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 256]> key_heads_67_cast_fp16 = concat(axis = var_6372, interleave = key_heads_67_interleave_0, values = (var_6530_cast_fp16, var_6530_cast_fp16, var_6546_cast_fp16, var_6546_cast_fp16, var_6562_cast_fp16, var_6562_cast_fp16, var_6578_cast_fp16, var_6578_cast_fp16, var_6594_cast_fp16, var_6594_cast_fp16, var_6610_cast_fp16, var_6610_cast_fp16, var_6626_cast_fp16, var_6626_cast_fp16, var_6642_cast_fp16, var_6642_cast_fp16))[name = string("key_heads_67_cast_fp16")];
+            bool value_heads_67_interleave_0 = const()[name = string("value_heads_67_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 256]> value_heads_67_cast_fp16 = concat(axis = var_6372, interleave = value_heads_67_interleave_0, values = (var_6534_cast_fp16, var_6534_cast_fp16, var_6550_cast_fp16, var_6550_cast_fp16, var_6566_cast_fp16, var_6566_cast_fp16, var_6582_cast_fp16, var_6582_cast_fp16, var_6598_cast_fp16, var_6598_cast_fp16, var_6614_cast_fp16, var_6614_cast_fp16, var_6630_cast_fp16, var_6630_cast_fp16, var_6646_cast_fp16, var_6646_cast_fp16))[name = string("value_heads_67_cast_fp16")];
+            fp16 var_6669_to_fp16 = const()[name = string("op_6669_to_fp16"), val = fp16(0x1.6ap-4)];
+            tensor<fp16, [1, 16, 128, 1]> var_6670_cast_fp16 = mul(x = mh_q_99_cast_fp16, y = var_6669_to_fp16)[name = string("op_6670_cast_fp16")];
+            bool mh_w_65_transpose_x_0 = const()[name = string("mh_w_65_transpose_x_0"), val = bool(true)];
+            bool mh_w_65_transpose_y_0 = const()[name = string("mh_w_65_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 1, 256]> mh_w_65_cast_fp16 = matmul(transpose_x = mh_w_65_transpose_x_0, transpose_y = mh_w_65_transpose_y_0, x = var_6670_cast_fp16, y = key_heads_67_cast_fp16)[name = string("mh_w_65_cast_fp16")];
+            tensor<fp16, [1, 16, 1, 256]> mh_w_67_cast_fp16 = add(x = mh_w_65_cast_fp16, y = var_487_cast_fp16)[name = string("mh_w_67_cast_fp16")];
+            tensor<fp16, [1, 16, 1, 256]> var_6682_cast_fp16 = softmax(axis = var_6354, x = mh_w_67_cast_fp16)[name = string("op_6682_cast_fp16")];
+            bool attn_33_transpose_x_0 = const()[name = string("attn_33_transpose_x_0"), val = bool(false)];
+            bool attn_33_transpose_y_0 = const()[name = string("attn_33_transpose_y_0"), val = bool(true)];
+            tensor<fp16, [1, 16, 128, 1]> attn_33_cast_fp16 = matmul(transpose_x = attn_33_transpose_x_0, transpose_y = attn_33_transpose_y_0, x = value_heads_67_cast_fp16, y = var_6682_cast_fp16)[name = string("attn_33_cast_fp16")];
+            tensor<int32, [4]> var_6687 = const()[name = string("op_6687"), val = tensor<int32, [4]>([1, -1, 1, 1])];
+            tensor<fp16, [1, 2048, 1, 1]> input_129_cast_fp16 = reshape(shape = var_6687, x = attn_33_cast_fp16)[name = string("input_129_cast_fp16")];
+            string obj_139_pad_type_0 = const()[name = string("obj_139_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> obj_139_strides_0 = const()[name = string("obj_139_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> obj_139_pad_0 = const()[name = string("obj_139_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> obj_139_dilations_0 = const()[name = string("obj_139_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 obj_139_groups_0 = const()[name = string("obj_139_groups_0"), val = int32(1)];
+            tensor<fp16, [2048, 2048, 1, 1]> layers_16_self_attn_o_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [2048, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(813990272))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(818184640))))[name = string("layers_16_self_attn_o_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1, 1]> obj_139_cast_fp16 = conv(bias = layers_0_self_attn_q_proj_bias_to_fp16, dilations = obj_139_dilations_0, groups = obj_139_groups_0, pad = obj_139_pad_0, pad_type = obj_139_pad_type_0, strides = obj_139_strides_0, weight = layers_16_self_attn_o_proj_weight_to_fp16_palettized, x = input_129_cast_fp16)[name = string("obj_139_cast_fp16")];
+            tensor<fp16, [1, 2048, 1, 1]> inputs_133_cast_fp16 = add(x = inputs_127_cast_fp16, y = obj_139_cast_fp16)[name = string("inputs_133_cast_fp16")];
+            tensor<fp16, [1, 2048, 1, 1]> inputs_sq_135_cast_fp16 = mul(x = inputs_133_cast_fp16, y = inputs_133_cast_fp16)[name = string("inputs_sq_135_cast_fp16")];
+            tensor<int32, [1]> variance_135_axes_0 = const()[name = string("variance_135_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_135_keep_dims_0 = const()[name = string("variance_135_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 1, 1, 1]> variance_135_cast_fp16 = reduce_mean(axes = variance_135_axes_0, keep_dims = variance_135_keep_dims_0, x = inputs_sq_135_cast_fp16)[name = string("variance_135_cast_fp16")];
+            fp16 var_6705_to_fp16 = const()[name = string("op_6705_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 1, 1]> var_6706_cast_fp16 = add(x = variance_135_cast_fp16, y = var_6705_to_fp16)[name = string("op_6706_cast_fp16")];
+            fp32 var_6707_epsilon_0 = const()[name = string("op_6707_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 1, 1, 1]> var_6707_cast_fp16 = rsqrt(epsilon = var_6707_epsilon_0, x = var_6706_cast_fp16)[name = string("op_6707_cast_fp16")];
+            tensor<fp16, [1, 2048, 1, 1]> hidden_states_167_cast_fp16 = mul(x = inputs_133_cast_fp16, y = var_6707_cast_fp16)[name = string("hidden_states_167_cast_fp16")];
+            tensor<fp16, [1, 2048, 1, 1]> w_135_to_fp16 = const()[name = string("w_135_to_fp16"), val = tensor<fp16, [1, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(818185216)))];
+            tensor<fp16, [1, 2048, 1, 1]> input_131_cast_fp16 = mul(x = w_135_to_fp16, y = hidden_states_167_cast_fp16)[name = string("input_131_cast_fp16")];
+            string input_133_pad_type_0 = const()[name = string("input_133_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> input_133_strides_0 = const()[name = string("input_133_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> input_133_pad_0 = const()[name = string("input_133_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> input_133_dilations_0 = const()[name = string("input_133_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 input_133_groups_0 = const()[name = string("input_133_groups_0"), val = int32(1)];
+            tensor<fp16, [6144, 2048, 1, 1]> layers_16_mlp_gate_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [6144, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(818189376))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(830772352))))[name = string("layers_16_mlp_gate_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 6144, 1, 1]> input_133_cast_fp16 = conv(dilations = input_133_dilations_0, groups = input_133_groups_0, pad = input_133_pad_0, pad_type = input_133_pad_type_0, strides = input_133_strides_0, weight = layers_16_mlp_gate_proj_weight_to_fp16_palettized, x = input_131_cast_fp16)[name = string("input_133_cast_fp16")];
+            tensor<fp16, [1, 6144, 1, 1]> var_6721_cast_fp16 = silu(x = input_133_cast_fp16)[name = string("op_6721_cast_fp16")];
+            string var_6727_pad_type_0 = const()[name = string("op_6727_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_6727_strides_0 = const()[name = string("op_6727_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_6727_pad_0 = const()[name = string("op_6727_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_6727_dilations_0 = const()[name = string("op_6727_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_6727_groups_0 = const()[name = string("op_6727_groups_0"), val = int32(1)];
+            tensor<fp16, [6144, 2048, 1, 1]> layers_16_mlp_up_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [6144, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(830772928))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(843355904))))[name = string("layers_16_mlp_up_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 6144, 1, 1]> var_6727_cast_fp16 = conv(dilations = var_6727_dilations_0, groups = var_6727_groups_0, pad = var_6727_pad_0, pad_type = var_6727_pad_type_0, strides = var_6727_strides_0, weight = layers_16_mlp_up_proj_weight_to_fp16_palettized, x = input_131_cast_fp16)[name = string("op_6727_cast_fp16")];
+            tensor<fp16, [1, 6144, 1, 1]> input_135_cast_fp16 = mul(x = var_6721_cast_fp16, y = var_6727_cast_fp16)[name = string("input_135_cast_fp16")];
+            string hidden_states_169_pad_type_0 = const()[name = string("hidden_states_169_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> hidden_states_169_strides_0 = const()[name = string("hidden_states_169_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> hidden_states_169_pad_0 = const()[name = string("hidden_states_169_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> hidden_states_169_dilations_0 = const()[name = string("hidden_states_169_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 hidden_states_169_groups_0 = const()[name = string("hidden_states_169_groups_0"), val = int32(1)];
+            tensor<fp16, [2048, 6144, 1, 1]> layers_16_mlp_down_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [2048, 6144, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(843356480))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(855939456))))[name = string("layers_16_mlp_down_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1, 1]> hidden_states_169_cast_fp16 = conv(dilations = hidden_states_169_dilations_0, groups = hidden_states_169_groups_0, pad = hidden_states_169_pad_0, pad_type = hidden_states_169_pad_type_0, strides = hidden_states_169_strides_0, weight = layers_16_mlp_down_proj_weight_to_fp16_palettized, x = input_135_cast_fp16)[name = string("hidden_states_169_cast_fp16")];
+            tensor<fp16, [1, 2048, 1, 1]> inputs_135_cast_fp16 = add(x = inputs_133_cast_fp16, y = hidden_states_169_cast_fp16)[name = string("inputs_135_cast_fp16")];
+            int32 var_6741 = const()[name = string("op_6741"), val = int32(3)];
+            int32 var_6751 = const()[name = string("op_6751"), val = int32(-2)];
+            int32 var_6759 = const()[name = string("op_6759"), val = int32(1)];
+            tensor<fp16, [1, 2048, 1, 1]> inputs_sq_137_cast_fp16 = mul(x = inputs_135_cast_fp16, y = inputs_135_cast_fp16)[name = string("inputs_sq_137_cast_fp16")];
+            tensor<int32, [1]> variance_137_axes_0 = const()[name = string("variance_137_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_137_keep_dims_0 = const()[name = string("variance_137_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 1, 1, 1]> variance_137_cast_fp16 = reduce_mean(axes = variance_137_axes_0, keep_dims = variance_137_keep_dims_0, x = inputs_sq_137_cast_fp16)[name = string("variance_137_cast_fp16")];
+            fp16 var_6771_to_fp16 = const()[name = string("op_6771_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 1, 1]> var_6772_cast_fp16 = add(x = variance_137_cast_fp16, y = var_6771_to_fp16)[name = string("op_6772_cast_fp16")];
+            fp32 var_6773_epsilon_0 = const()[name = string("op_6773_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 1, 1, 1]> var_6773_cast_fp16 = rsqrt(epsilon = var_6773_epsilon_0, x = var_6772_cast_fp16)[name = string("op_6773_cast_fp16")];
+            tensor<fp16, [1, 2048, 1, 1]> hidden_states_171_cast_fp16 = mul(x = inputs_135_cast_fp16, y = var_6773_cast_fp16)[name = string("hidden_states_171_cast_fp16")];
+            tensor<fp16, [1, 2048, 1, 1]> w_137_to_fp16 = const()[name = string("w_137_to_fp16"), val = tensor<fp16, [1, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(855940032)))];
+            tensor<fp16, [1, 2048, 1, 1]> obj_141_cast_fp16 = mul(x = w_137_to_fp16, y = hidden_states_171_cast_fp16)[name = string("obj_141_cast_fp16")];
+            string query_103_pad_type_0 = const()[name = string("query_103_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> query_103_strides_0 = const()[name = string("query_103_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> query_103_pad_0 = const()[name = string("query_103_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> query_103_dilations_0 = const()[name = string("query_103_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 query_103_groups_0 = const()[name = string("query_103_groups_0"), val = int32(1)];
+            tensor<fp16, [2048, 2048, 1, 1]> layers_17_self_attn_q_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [2048, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(855944192))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(860138560))))[name = string("layers_17_self_attn_q_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1, 1]> query_103_cast_fp16 = conv(bias = layers_0_self_attn_q_proj_bias_to_fp16, dilations = query_103_dilations_0, groups = query_103_groups_0, pad = query_103_pad_0, pad_type = query_103_pad_type_0, strides = query_103_strides_0, weight = layers_17_self_attn_q_proj_weight_to_fp16_palettized, x = obj_141_cast_fp16)[name = string("query_103_cast_fp16")];
+            string current_key_69_pad_type_0 = const()[name = string("current_key_69_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> current_key_69_strides_0 = const()[name = string("current_key_69_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> current_key_69_pad_0 = const()[name = string("current_key_69_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> current_key_69_dilations_0 = const()[name = string("current_key_69_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 current_key_69_groups_0 = const()[name = string("current_key_69_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 2048, 1, 1]> layers_17_self_attn_k_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(860139136))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(862236352))))[name = string("layers_17_self_attn_k_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> current_key_69_cast_fp16 = conv(dilations = current_key_69_dilations_0, groups = current_key_69_groups_0, pad = current_key_69_pad_0, pad_type = current_key_69_pad_type_0, strides = current_key_69_strides_0, weight = layers_17_self_attn_k_proj_weight_to_fp16_palettized, x = obj_141_cast_fp16)[name = string("current_key_69_cast_fp16")];
+            string current_value_35_pad_type_0 = const()[name = string("current_value_35_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> current_value_35_strides_0 = const()[name = string("current_value_35_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> current_value_35_pad_0 = const()[name = string("current_value_35_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> current_value_35_dilations_0 = const()[name = string("current_value_35_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 current_value_35_groups_0 = const()[name = string("current_value_35_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 2048, 1, 1]> layers_17_self_attn_v_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(862236928))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(864334144))))[name = string("layers_17_self_attn_v_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> current_value_35_cast_fp16 = conv(bias = layers_0_self_attn_v_proj_bias_to_fp16, dilations = current_value_35_dilations_0, groups = current_value_35_groups_0, pad = current_value_35_pad_0, pad_type = current_value_35_pad_type_0, strides = current_value_35_strides_0, weight = layers_17_self_attn_v_proj_weight_to_fp16_palettized, x = obj_141_cast_fp16)[name = string("current_value_35_cast_fp16")];
+            tensor<int32, [4]> var_6810 = const()[name = string("op_6810"), val = tensor<int32, [4]>([16, 128, 1, 1])];
+            tensor<fp16, [16, 128, 1, 1]> inputs_137_cast_fp16 = reshape(shape = var_6810, x = query_103_cast_fp16)[name = string("inputs_137_cast_fp16")];
+            tensor<fp16, [16, 128, 1, 1]> inputs_sq_139_cast_fp16 = mul(x = inputs_137_cast_fp16, y = inputs_137_cast_fp16)[name = string("inputs_sq_139_cast_fp16")];
+            tensor<int32, [1]> variance_139_axes_0 = const()[name = string("variance_139_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_139_keep_dims_0 = const()[name = string("variance_139_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [16, 1, 1, 1]> variance_139_cast_fp16 = reduce_mean(axes = variance_139_axes_0, keep_dims = variance_139_keep_dims_0, x = inputs_sq_139_cast_fp16)[name = string("variance_139_cast_fp16")];
+            fp16 var_6816_to_fp16 = const()[name = string("op_6816_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [16, 1, 1, 1]> var_6817_cast_fp16 = add(x = variance_139_cast_fp16, y = var_6816_to_fp16)[name = string("op_6817_cast_fp16")];
+            fp32 var_6818_epsilon_0 = const()[name = string("op_6818_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [16, 1, 1, 1]> var_6818_cast_fp16 = rsqrt(epsilon = var_6818_epsilon_0, x = var_6817_cast_fp16)[name = string("op_6818_cast_fp16")];
+            tensor<fp16, [16, 128, 1, 1]> hidden_states_173_cast_fp16 = mul(x = inputs_137_cast_fp16, y = var_6818_cast_fp16)[name = string("hidden_states_173_cast_fp16")];
+            tensor<fp16, [1, 128, 1, 1]> w_139_to_fp16 = const()[name = string("w_139_to_fp16"), val = tensor<fp16, [1, 128, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(864334720)))];
+            tensor<fp16, [16, 128, 1, 1]> query_normed_35_cast_fp16 = mul(x = w_139_to_fp16, y = hidden_states_173_cast_fp16)[name = string("query_normed_35_cast_fp16")];
+            tensor<int32, [4]> var_6826 = const()[name = string("op_6826"), val = tensor<int32, [4]>([8, 128, 1, 1])];
+            tensor<fp16, [8, 128, 1, 1]> inputs_139_cast_fp16 = reshape(shape = var_6826, x = current_key_69_cast_fp16)[name = string("inputs_139_cast_fp16")];
+            tensor<fp16, [8, 128, 1, 1]> inputs_sq_141_cast_fp16 = mul(x = inputs_139_cast_fp16, y = inputs_139_cast_fp16)[name = string("inputs_sq_141_cast_fp16")];
+            tensor<int32, [1]> variance_141_axes_0 = const()[name = string("variance_141_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_141_keep_dims_0 = const()[name = string("variance_141_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [8, 1, 1, 1]> variance_141_cast_fp16 = reduce_mean(axes = variance_141_axes_0, keep_dims = variance_141_keep_dims_0, x = inputs_sq_141_cast_fp16)[name = string("variance_141_cast_fp16")];
+            fp16 var_6832_to_fp16 = const()[name = string("op_6832_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [8, 1, 1, 1]> var_6833_cast_fp16 = add(x = variance_141_cast_fp16, y = var_6832_to_fp16)[name = string("op_6833_cast_fp16")];
+            fp32 var_6834_epsilon_0 = const()[name = string("op_6834_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [8, 1, 1, 1]> var_6834_cast_fp16 = rsqrt(epsilon = var_6834_epsilon_0, x = var_6833_cast_fp16)[name = string("op_6834_cast_fp16")];
+            tensor<fp16, [8, 128, 1, 1]> hidden_states_175_cast_fp16 = mul(x = inputs_139_cast_fp16, y = var_6834_cast_fp16)[name = string("hidden_states_175_cast_fp16")];
+            tensor<fp16, [1, 128, 1, 1]> w_141_to_fp16 = const()[name = string("w_141_to_fp16"), val = tensor<fp16, [1, 128, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(864335040)))];
+            tensor<fp16, [8, 128, 1, 1]> current_key_normed_35_cast_fp16 = mul(x = w_141_to_fp16, y = hidden_states_175_cast_fp16)[name = string("current_key_normed_35_cast_fp16")];
+            tensor<int32, [4]> var_6852 = const()[name = string("op_6852"), val = tensor<int32, [4]>([1, 16, 128, -1])];
+            tensor<fp16, [1, 16, 128, 1]> mh_q_103_cast_fp16 = reshape(shape = var_6852, x = query_normed_35_cast_fp16)[name = string("mh_q_103_cast_fp16")];
+            tensor<int32, [4]> var_6854 = const()[name = string("op_6854"), val = tensor<int32, [4]>([1, 8, 128, -1])];
+            tensor<fp16, [1, 8, 128, 1]> mh_k_69_cast_fp16 = reshape(shape = var_6854, x = current_key_normed_35_cast_fp16)[name = string("mh_k_69_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> var_6858_cast_fp16 = mul(x = mh_q_103_cast_fp16, y = cos_1_cast_fp16)[name = string("op_6858_cast_fp16")];
+            tensor<int32, [4]> var_6863_begin_0 = const()[name = string("op_6863_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_6863_end_0 = const()[name = string("op_6863_end_0"), val = tensor<int32, [4]>([1, 16, 64, 1])];
+            tensor<bool, [4]> var_6863_end_mask_0 = const()[name = string("op_6863_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 16, 64, 1]> var_6863_cast_fp16 = slice_by_index(begin = var_6863_begin_0, end = var_6863_end_0, end_mask = var_6863_end_mask_0, x = mh_q_103_cast_fp16)[name = string("op_6863_cast_fp16")];
+            tensor<int32, [4]> var_6869_begin_0 = const()[name = string("op_6869_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_6869_end_0 = const()[name = string("op_6869_end_0"), val = tensor<int32, [4]>([1, 16, 128, 1])];
+            tensor<bool, [4]> var_6869_end_mask_0 = const()[name = string("op_6869_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 16, 64, 1]> var_6869_cast_fp16 = slice_by_index(begin = var_6869_begin_0, end = var_6869_end_0, end_mask = var_6869_end_mask_0, x = mh_q_103_cast_fp16)[name = string("op_6869_cast_fp16")];
+            fp16 const_408_promoted_to_fp16 = const()[name = string("const_408_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 16, 64, 1]> var_6871_cast_fp16 = mul(x = var_6869_cast_fp16, y = const_408_promoted_to_fp16)[name = string("op_6871_cast_fp16")];
+            bool var_6873_interleave_0 = const()[name = string("op_6873_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 1]> var_6873_cast_fp16 = concat(axis = var_6751, interleave = var_6873_interleave_0, values = (var_6871_cast_fp16, var_6863_cast_fp16))[name = string("op_6873_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> var_6874_cast_fp16 = mul(x = var_6873_cast_fp16, y = sin_1_cast_fp16)[name = string("op_6874_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> mh_q_105_cast_fp16 = add(x = var_6858_cast_fp16, y = var_6874_cast_fp16)[name = string("mh_q_105_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> var_6876_cast_fp16 = mul(x = mh_k_69_cast_fp16, y = cos_1_cast_fp16)[name = string("op_6876_cast_fp16")];
+            tensor<int32, [4]> var_6881_begin_0 = const()[name = string("op_6881_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_6881_end_0 = const()[name = string("op_6881_end_0"), val = tensor<int32, [4]>([1, 8, 64, 1])];
+            tensor<bool, [4]> var_6881_end_mask_0 = const()[name = string("op_6881_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 8, 64, 1]> var_6881_cast_fp16 = slice_by_index(begin = var_6881_begin_0, end = var_6881_end_0, end_mask = var_6881_end_mask_0, x = mh_k_69_cast_fp16)[name = string("op_6881_cast_fp16")];
+            tensor<int32, [4]> var_6887_begin_0 = const()[name = string("op_6887_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_6887_end_0 = const()[name = string("op_6887_end_0"), val = tensor<int32, [4]>([1, 8, 128, 1])];
+            tensor<bool, [4]> var_6887_end_mask_0 = const()[name = string("op_6887_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 8, 64, 1]> var_6887_cast_fp16 = slice_by_index(begin = var_6887_begin_0, end = var_6887_end_0, end_mask = var_6887_end_mask_0, x = mh_k_69_cast_fp16)[name = string("op_6887_cast_fp16")];
+            fp16 const_411_promoted_to_fp16 = const()[name = string("const_411_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 64, 1]> var_6889_cast_fp16 = mul(x = var_6887_cast_fp16, y = const_411_promoted_to_fp16)[name = string("op_6889_cast_fp16")];
+            bool var_6891_interleave_0 = const()[name = string("op_6891_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 128, 1]> var_6891_cast_fp16 = concat(axis = var_6751, interleave = var_6891_interleave_0, values = (var_6889_cast_fp16, var_6881_cast_fp16))[name = string("op_6891_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> var_6892_cast_fp16 = mul(x = var_6891_cast_fp16, y = sin_1_cast_fp16)[name = string("op_6892_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> mh_k_71_cast_fp16 = add(x = var_6876_cast_fp16, y = var_6892_cast_fp16)[name = string("mh_k_71_cast_fp16")];
+            tensor<int32, [4]> var_6896 = const()[name = string("op_6896"), val = tensor<int32, [4]>([1, 1024, 1, 1])];
+            tensor<fp16, [1, 1024, 1, 1]> current_key_71_cast_fp16 = reshape(shape = var_6896, x = mh_k_71_cast_fp16)[name = string("current_key_71_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_6903_cast_fp16 = mul(x = var_101_cast_fp16_17, y = var_323_cast_fp16)[name = string("op_6903_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_6904_cast_fp16 = mul(x = current_key_71_cast_fp16, y = var_321_cast_fp16)[name = string("op_6904_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> key_105_cast_fp16 = add(x = var_6903_cast_fp16, y = var_6904_cast_fp16)[name = string("key_105_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_6907_cast_fp16 = mul(x = var_132_cast_fp16_17, y = var_323_cast_fp16)[name = string("op_6907_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_6908_cast_fp16 = mul(x = current_value_35_cast_fp16, y = var_321_cast_fp16)[name = string("op_6908_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> value_69_cast_fp16 = add(x = var_6907_cast_fp16, y = var_6908_cast_fp16)[name = string("value_69_cast_fp16")];
+            tensor<int32, [4]> var_6912 = const()[name = string("op_6912"), val = tensor<int32, [4]>([1, 8, 128, 256])];
+            tensor<fp16, [1, 8, 128, 256]> key_heads_69_cast_fp16 = reshape(shape = var_6912, x = key_105_cast_fp16)[name = string("key_heads_69_cast_fp16")];
+            tensor<int32, [4]> var_6914 = const()[name = string("op_6914"), val = tensor<int32, [4]>([1, 8, 128, 256])];
+            tensor<fp16, [1, 8, 128, 256]> value_heads_69_cast_fp16 = reshape(shape = var_6914, x = value_69_cast_fp16)[name = string("value_heads_69_cast_fp16")];
+            tensor<int32, [4]> var_6917_begin_0 = const()[name = string("op_6917_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_6917_end_0 = const()[name = string("op_6917_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_6917_end_mask_0 = const()[name = string("op_6917_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_6917_cast_fp16 = slice_by_index(begin = var_6917_begin_0, end = var_6917_end_0, end_mask = var_6917_end_mask_0, x = key_heads_69_cast_fp16)[name = string("op_6917_cast_fp16")];
+            tensor<int32, [4]> var_6921_begin_0 = const()[name = string("op_6921_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_6921_end_0 = const()[name = string("op_6921_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_6921_end_mask_0 = const()[name = string("op_6921_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_6921_cast_fp16 = slice_by_index(begin = var_6921_begin_0, end = var_6921_end_0, end_mask = var_6921_end_mask_0, x = value_heads_69_cast_fp16)[name = string("op_6921_cast_fp16")];
+            tensor<int32, [4]> var_6933_begin_0 = const()[name = string("op_6933_begin_0"), val = tensor<int32, [4]>([0, 1, 0, 0])];
+            tensor<int32, [4]> var_6933_end_0 = const()[name = string("op_6933_end_0"), val = tensor<int32, [4]>([1, 2, 128, 256])];
+            tensor<bool, [4]> var_6933_end_mask_0 = const()[name = string("op_6933_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_6933_cast_fp16 = slice_by_index(begin = var_6933_begin_0, end = var_6933_end_0, end_mask = var_6933_end_mask_0, x = key_heads_69_cast_fp16)[name = string("op_6933_cast_fp16")];
+            tensor<int32, [4]> var_6937_begin_0 = const()[name = string("op_6937_begin_0"), val = tensor<int32, [4]>([0, 1, 0, 0])];
+            tensor<int32, [4]> var_6937_end_0 = const()[name = string("op_6937_end_0"), val = tensor<int32, [4]>([1, 2, 128, 256])];
+            tensor<bool, [4]> var_6937_end_mask_0 = const()[name = string("op_6937_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_6937_cast_fp16 = slice_by_index(begin = var_6937_begin_0, end = var_6937_end_0, end_mask = var_6937_end_mask_0, x = value_heads_69_cast_fp16)[name = string("op_6937_cast_fp16")];
+            tensor<int32, [4]> var_6949_begin_0 = const()[name = string("op_6949_begin_0"), val = tensor<int32, [4]>([0, 2, 0, 0])];
+            tensor<int32, [4]> var_6949_end_0 = const()[name = string("op_6949_end_0"), val = tensor<int32, [4]>([1, 3, 128, 256])];
+            tensor<bool, [4]> var_6949_end_mask_0 = const()[name = string("op_6949_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_6949_cast_fp16 = slice_by_index(begin = var_6949_begin_0, end = var_6949_end_0, end_mask = var_6949_end_mask_0, x = key_heads_69_cast_fp16)[name = string("op_6949_cast_fp16")];
+            tensor<int32, [4]> var_6953_begin_0 = const()[name = string("op_6953_begin_0"), val = tensor<int32, [4]>([0, 2, 0, 0])];
+            tensor<int32, [4]> var_6953_end_0 = const()[name = string("op_6953_end_0"), val = tensor<int32, [4]>([1, 3, 128, 256])];
+            tensor<bool, [4]> var_6953_end_mask_0 = const()[name = string("op_6953_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_6953_cast_fp16 = slice_by_index(begin = var_6953_begin_0, end = var_6953_end_0, end_mask = var_6953_end_mask_0, x = value_heads_69_cast_fp16)[name = string("op_6953_cast_fp16")];
+            tensor<int32, [4]> var_6965_begin_0 = const()[name = string("op_6965_begin_0"), val = tensor<int32, [4]>([0, 3, 0, 0])];
+            tensor<int32, [4]> var_6965_end_0 = const()[name = string("op_6965_end_0"), val = tensor<int32, [4]>([1, 4, 128, 256])];
+            tensor<bool, [4]> var_6965_end_mask_0 = const()[name = string("op_6965_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_6965_cast_fp16 = slice_by_index(begin = var_6965_begin_0, end = var_6965_end_0, end_mask = var_6965_end_mask_0, x = key_heads_69_cast_fp16)[name = string("op_6965_cast_fp16")];
+            tensor<int32, [4]> var_6969_begin_0 = const()[name = string("op_6969_begin_0"), val = tensor<int32, [4]>([0, 3, 0, 0])];
+            tensor<int32, [4]> var_6969_end_0 = const()[name = string("op_6969_end_0"), val = tensor<int32, [4]>([1, 4, 128, 256])];
+            tensor<bool, [4]> var_6969_end_mask_0 = const()[name = string("op_6969_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_6969_cast_fp16 = slice_by_index(begin = var_6969_begin_0, end = var_6969_end_0, end_mask = var_6969_end_mask_0, x = value_heads_69_cast_fp16)[name = string("op_6969_cast_fp16")];
+            tensor<int32, [4]> var_6981_begin_0 = const()[name = string("op_6981_begin_0"), val = tensor<int32, [4]>([0, 4, 0, 0])];
+            tensor<int32, [4]> var_6981_end_0 = const()[name = string("op_6981_end_0"), val = tensor<int32, [4]>([1, 5, 128, 256])];
+            tensor<bool, [4]> var_6981_end_mask_0 = const()[name = string("op_6981_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_6981_cast_fp16 = slice_by_index(begin = var_6981_begin_0, end = var_6981_end_0, end_mask = var_6981_end_mask_0, x = key_heads_69_cast_fp16)[name = string("op_6981_cast_fp16")];
+            tensor<int32, [4]> var_6985_begin_0 = const()[name = string("op_6985_begin_0"), val = tensor<int32, [4]>([0, 4, 0, 0])];
+            tensor<int32, [4]> var_6985_end_0 = const()[name = string("op_6985_end_0"), val = tensor<int32, [4]>([1, 5, 128, 256])];
+            tensor<bool, [4]> var_6985_end_mask_0 = const()[name = string("op_6985_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_6985_cast_fp16 = slice_by_index(begin = var_6985_begin_0, end = var_6985_end_0, end_mask = var_6985_end_mask_0, x = value_heads_69_cast_fp16)[name = string("op_6985_cast_fp16")];
+            tensor<int32, [4]> var_6997_begin_0 = const()[name = string("op_6997_begin_0"), val = tensor<int32, [4]>([0, 5, 0, 0])];
+            tensor<int32, [4]> var_6997_end_0 = const()[name = string("op_6997_end_0"), val = tensor<int32, [4]>([1, 6, 128, 256])];
+            tensor<bool, [4]> var_6997_end_mask_0 = const()[name = string("op_6997_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_6997_cast_fp16 = slice_by_index(begin = var_6997_begin_0, end = var_6997_end_0, end_mask = var_6997_end_mask_0, x = key_heads_69_cast_fp16)[name = string("op_6997_cast_fp16")];
+            tensor<int32, [4]> var_7001_begin_0 = const()[name = string("op_7001_begin_0"), val = tensor<int32, [4]>([0, 5, 0, 0])];
+            tensor<int32, [4]> var_7001_end_0 = const()[name = string("op_7001_end_0"), val = tensor<int32, [4]>([1, 6, 128, 256])];
+            tensor<bool, [4]> var_7001_end_mask_0 = const()[name = string("op_7001_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_7001_cast_fp16 = slice_by_index(begin = var_7001_begin_0, end = var_7001_end_0, end_mask = var_7001_end_mask_0, x = value_heads_69_cast_fp16)[name = string("op_7001_cast_fp16")];
+            tensor<int32, [4]> var_7013_begin_0 = const()[name = string("op_7013_begin_0"), val = tensor<int32, [4]>([0, 6, 0, 0])];
+            tensor<int32, [4]> var_7013_end_0 = const()[name = string("op_7013_end_0"), val = tensor<int32, [4]>([1, 7, 128, 256])];
+            tensor<bool, [4]> var_7013_end_mask_0 = const()[name = string("op_7013_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_7013_cast_fp16 = slice_by_index(begin = var_7013_begin_0, end = var_7013_end_0, end_mask = var_7013_end_mask_0, x = key_heads_69_cast_fp16)[name = string("op_7013_cast_fp16")];
+            tensor<int32, [4]> var_7017_begin_0 = const()[name = string("op_7017_begin_0"), val = tensor<int32, [4]>([0, 6, 0, 0])];
+            tensor<int32, [4]> var_7017_end_0 = const()[name = string("op_7017_end_0"), val = tensor<int32, [4]>([1, 7, 128, 256])];
+            tensor<bool, [4]> var_7017_end_mask_0 = const()[name = string("op_7017_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_7017_cast_fp16 = slice_by_index(begin = var_7017_begin_0, end = var_7017_end_0, end_mask = var_7017_end_mask_0, x = value_heads_69_cast_fp16)[name = string("op_7017_cast_fp16")];
+            tensor<int32, [4]> var_7029_begin_0 = const()[name = string("op_7029_begin_0"), val = tensor<int32, [4]>([0, 7, 0, 0])];
+            tensor<int32, [4]> var_7029_end_0 = const()[name = string("op_7029_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_7029_end_mask_0 = const()[name = string("op_7029_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_7029_cast_fp16 = slice_by_index(begin = var_7029_begin_0, end = var_7029_end_0, end_mask = var_7029_end_mask_0, x = key_heads_69_cast_fp16)[name = string("op_7029_cast_fp16")];
+            tensor<int32, [4]> var_7033_begin_0 = const()[name = string("op_7033_begin_0"), val = tensor<int32, [4]>([0, 7, 0, 0])];
+            tensor<int32, [4]> var_7033_end_0 = const()[name = string("op_7033_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_7033_end_mask_0 = const()[name = string("op_7033_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_7033_cast_fp16 = slice_by_index(begin = var_7033_begin_0, end = var_7033_end_0, end_mask = var_7033_end_mask_0, x = value_heads_69_cast_fp16)[name = string("op_7033_cast_fp16")];
+            bool key_heads_71_interleave_0 = const()[name = string("key_heads_71_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 256]> key_heads_71_cast_fp16 = concat(axis = var_6759, interleave = key_heads_71_interleave_0, values = (var_6917_cast_fp16, var_6917_cast_fp16, var_6933_cast_fp16, var_6933_cast_fp16, var_6949_cast_fp16, var_6949_cast_fp16, var_6965_cast_fp16, var_6965_cast_fp16, var_6981_cast_fp16, var_6981_cast_fp16, var_6997_cast_fp16, var_6997_cast_fp16, var_7013_cast_fp16, var_7013_cast_fp16, var_7029_cast_fp16, var_7029_cast_fp16))[name = string("key_heads_71_cast_fp16")];
+            bool value_heads_71_interleave_0 = const()[name = string("value_heads_71_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 256]> value_heads_71_cast_fp16 = concat(axis = var_6759, interleave = value_heads_71_interleave_0, values = (var_6921_cast_fp16, var_6921_cast_fp16, var_6937_cast_fp16, var_6937_cast_fp16, var_6953_cast_fp16, var_6953_cast_fp16, var_6969_cast_fp16, var_6969_cast_fp16, var_6985_cast_fp16, var_6985_cast_fp16, var_7001_cast_fp16, var_7001_cast_fp16, var_7017_cast_fp16, var_7017_cast_fp16, var_7033_cast_fp16, var_7033_cast_fp16))[name = string("value_heads_71_cast_fp16")];
+            fp16 var_7056_to_fp16 = const()[name = string("op_7056_to_fp16"), val = fp16(0x1.6ap-4)];
+            tensor<fp16, [1, 16, 128, 1]> var_7057_cast_fp16 = mul(x = mh_q_105_cast_fp16, y = var_7056_to_fp16)[name = string("op_7057_cast_fp16")];
+            bool mh_w_69_transpose_x_0 = const()[name = string("mh_w_69_transpose_x_0"), val = bool(true)];
+            bool mh_w_69_transpose_y_0 = const()[name = string("mh_w_69_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 1, 256]> mh_w_69_cast_fp16 = matmul(transpose_x = mh_w_69_transpose_x_0, transpose_y = mh_w_69_transpose_y_0, x = var_7057_cast_fp16, y = key_heads_71_cast_fp16)[name = string("mh_w_69_cast_fp16")];
+            tensor<fp16, [1, 16, 1, 256]> mh_w_71_cast_fp16 = add(x = mh_w_69_cast_fp16, y = var_487_cast_fp16)[name = string("mh_w_71_cast_fp16")];
+            tensor<fp16, [1, 16, 1, 256]> var_7069_cast_fp16 = softmax(axis = var_6741, x = mh_w_71_cast_fp16)[name = string("op_7069_cast_fp16")];
+            bool attn_35_transpose_x_0 = const()[name = string("attn_35_transpose_x_0"), val = bool(false)];
+            bool attn_35_transpose_y_0 = const()[name = string("attn_35_transpose_y_0"), val = bool(true)];
+            tensor<fp16, [1, 16, 128, 1]> attn_35_cast_fp16 = matmul(transpose_x = attn_35_transpose_x_0, transpose_y = attn_35_transpose_y_0, x = value_heads_71_cast_fp16, y = var_7069_cast_fp16)[name = string("attn_35_cast_fp16")];
+            tensor<int32, [4]> var_7074 = const()[name = string("op_7074"), val = tensor<int32, [4]>([1, -1, 1, 1])];
+            tensor<fp16, [1, 2048, 1, 1]> input_137_cast_fp16 = reshape(shape = var_7074, x = attn_35_cast_fp16)[name = string("input_137_cast_fp16")];
+            string obj_147_pad_type_0 = const()[name = string("obj_147_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> obj_147_strides_0 = const()[name = string("obj_147_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> obj_147_pad_0 = const()[name = string("obj_147_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> obj_147_dilations_0 = const()[name = string("obj_147_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 obj_147_groups_0 = const()[name = string("obj_147_groups_0"), val = int32(1)];
+            tensor<fp16, [2048, 2048, 1, 1]> layers_17_self_attn_o_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [2048, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(864335360))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(868529728))))[name = string("layers_17_self_attn_o_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1, 1]> obj_147_cast_fp16 = conv(bias = layers_0_self_attn_q_proj_bias_to_fp16, dilations = obj_147_dilations_0, groups = obj_147_groups_0, pad = obj_147_pad_0, pad_type = obj_147_pad_type_0, strides = obj_147_strides_0, weight = layers_17_self_attn_o_proj_weight_to_fp16_palettized, x = input_137_cast_fp16)[name = string("obj_147_cast_fp16")];
+            tensor<fp16, [1, 2048, 1, 1]> inputs_141_cast_fp16 = add(x = inputs_135_cast_fp16, y = obj_147_cast_fp16)[name = string("inputs_141_cast_fp16")];
+            tensor<fp16, [1, 2048, 1, 1]> inputs_sq_143_cast_fp16 = mul(x = inputs_141_cast_fp16, y = inputs_141_cast_fp16)[name = string("inputs_sq_143_cast_fp16")];
+            tensor<int32, [1]> variance_143_axes_0 = const()[name = string("variance_143_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_143_keep_dims_0 = const()[name = string("variance_143_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 1, 1, 1]> variance_143_cast_fp16 = reduce_mean(axes = variance_143_axes_0, keep_dims = variance_143_keep_dims_0, x = inputs_sq_143_cast_fp16)[name = string("variance_143_cast_fp16")];
+            fp16 var_7092_to_fp16 = const()[name = string("op_7092_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 1, 1]> var_7093_cast_fp16 = add(x = variance_143_cast_fp16, y = var_7092_to_fp16)[name = string("op_7093_cast_fp16")];
+            fp32 var_7094_epsilon_0 = const()[name = string("op_7094_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 1, 1, 1]> var_7094_cast_fp16 = rsqrt(epsilon = var_7094_epsilon_0, x = var_7093_cast_fp16)[name = string("op_7094_cast_fp16")];
+            tensor<fp16, [1, 2048, 1, 1]> hidden_states_177_cast_fp16 = mul(x = inputs_141_cast_fp16, y = var_7094_cast_fp16)[name = string("hidden_states_177_cast_fp16")];
+            tensor<fp16, [1, 2048, 1, 1]> w_143_to_fp16 = const()[name = string("w_143_to_fp16"), val = tensor<fp16, [1, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(868530304)))];
+            tensor<fp16, [1, 2048, 1, 1]> input_139_cast_fp16 = mul(x = w_143_to_fp16, y = hidden_states_177_cast_fp16)[name = string("input_139_cast_fp16")];
+            string input_141_pad_type_0 = const()[name = string("input_141_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> input_141_strides_0 = const()[name = string("input_141_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> input_141_pad_0 = const()[name = string("input_141_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> input_141_dilations_0 = const()[name = string("input_141_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 input_141_groups_0 = const()[name = string("input_141_groups_0"), val = int32(1)];
+            tensor<fp16, [6144, 2048, 1, 1]> layers_17_mlp_gate_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [6144, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(868534464))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(881117440))))[name = string("layers_17_mlp_gate_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 6144, 1, 1]> input_141_cast_fp16 = conv(dilations = input_141_dilations_0, groups = input_141_groups_0, pad = input_141_pad_0, pad_type = input_141_pad_type_0, strides = input_141_strides_0, weight = layers_17_mlp_gate_proj_weight_to_fp16_palettized, x = input_139_cast_fp16)[name = string("input_141_cast_fp16")];
+            tensor<fp16, [1, 6144, 1, 1]> var_7108_cast_fp16 = silu(x = input_141_cast_fp16)[name = string("op_7108_cast_fp16")];
+            string var_7114_pad_type_0 = const()[name = string("op_7114_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_7114_strides_0 = const()[name = string("op_7114_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_7114_pad_0 = const()[name = string("op_7114_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_7114_dilations_0 = const()[name = string("op_7114_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_7114_groups_0 = const()[name = string("op_7114_groups_0"), val = int32(1)];
+            tensor<fp16, [6144, 2048, 1, 1]> layers_17_mlp_up_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [6144, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(881118016))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(893700992))))[name = string("layers_17_mlp_up_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 6144, 1, 1]> var_7114_cast_fp16 = conv(dilations = var_7114_dilations_0, groups = var_7114_groups_0, pad = var_7114_pad_0, pad_type = var_7114_pad_type_0, strides = var_7114_strides_0, weight = layers_17_mlp_up_proj_weight_to_fp16_palettized, x = input_139_cast_fp16)[name = string("op_7114_cast_fp16")];
+            tensor<fp16, [1, 6144, 1, 1]> input_143_cast_fp16 = mul(x = var_7108_cast_fp16, y = var_7114_cast_fp16)[name = string("input_143_cast_fp16")];
+            string hidden_states_179_pad_type_0 = const()[name = string("hidden_states_179_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> hidden_states_179_strides_0 = const()[name = string("hidden_states_179_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> hidden_states_179_pad_0 = const()[name = string("hidden_states_179_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> hidden_states_179_dilations_0 = const()[name = string("hidden_states_179_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 hidden_states_179_groups_0 = const()[name = string("hidden_states_179_groups_0"), val = int32(1)];
+            tensor<fp16, [2048, 6144, 1, 1]> layers_17_mlp_down_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [2048, 6144, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(893701568))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(906284544))))[name = string("layers_17_mlp_down_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1, 1]> hidden_states_179_cast_fp16 = conv(dilations = hidden_states_179_dilations_0, groups = hidden_states_179_groups_0, pad = hidden_states_179_pad_0, pad_type = hidden_states_179_pad_type_0, strides = hidden_states_179_strides_0, weight = layers_17_mlp_down_proj_weight_to_fp16_palettized, x = input_143_cast_fp16)[name = string("hidden_states_179_cast_fp16")];
+            tensor<fp16, [1, 2048, 1, 1]> inputs_143_cast_fp16 = add(x = inputs_141_cast_fp16, y = hidden_states_179_cast_fp16)[name = string("inputs_143_cast_fp16")];
+            int32 var_7128 = const()[name = string("op_7128"), val = int32(3)];
+            int32 var_7138 = const()[name = string("op_7138"), val = int32(-2)];
+            int32 var_7146 = const()[name = string("op_7146"), val = int32(1)];
+            tensor<fp16, [1, 2048, 1, 1]> inputs_sq_145_cast_fp16 = mul(x = inputs_143_cast_fp16, y = inputs_143_cast_fp16)[name = string("inputs_sq_145_cast_fp16")];
+            tensor<int32, [1]> variance_145_axes_0 = const()[name = string("variance_145_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_145_keep_dims_0 = const()[name = string("variance_145_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 1, 1, 1]> variance_145_cast_fp16 = reduce_mean(axes = variance_145_axes_0, keep_dims = variance_145_keep_dims_0, x = inputs_sq_145_cast_fp16)[name = string("variance_145_cast_fp16")];
+            fp16 var_7158_to_fp16 = const()[name = string("op_7158_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 1, 1]> var_7159_cast_fp16 = add(x = variance_145_cast_fp16, y = var_7158_to_fp16)[name = string("op_7159_cast_fp16")];
+            fp32 var_7160_epsilon_0 = const()[name = string("op_7160_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 1, 1, 1]> var_7160_cast_fp16 = rsqrt(epsilon = var_7160_epsilon_0, x = var_7159_cast_fp16)[name = string("op_7160_cast_fp16")];
+            tensor<fp16, [1, 2048, 1, 1]> hidden_states_181_cast_fp16 = mul(x = inputs_143_cast_fp16, y = var_7160_cast_fp16)[name = string("hidden_states_181_cast_fp16")];
+            tensor<fp16, [1, 2048, 1, 1]> w_145_to_fp16 = const()[name = string("w_145_to_fp16"), val = tensor<fp16, [1, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(906285120)))];
+            tensor<fp16, [1, 2048, 1, 1]> obj_149_cast_fp16 = mul(x = w_145_to_fp16, y = hidden_states_181_cast_fp16)[name = string("obj_149_cast_fp16")];
+            string query_109_pad_type_0 = const()[name = string("query_109_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> query_109_strides_0 = const()[name = string("query_109_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> query_109_pad_0 = const()[name = string("query_109_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> query_109_dilations_0 = const()[name = string("query_109_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 query_109_groups_0 = const()[name = string("query_109_groups_0"), val = int32(1)];
+            tensor<fp16, [2048, 2048, 1, 1]> layers_18_self_attn_q_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [2048, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(906289280))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(910483648))))[name = string("layers_18_self_attn_q_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1, 1]> query_109_cast_fp16 = conv(bias = layers_0_self_attn_q_proj_bias_to_fp16, dilations = query_109_dilations_0, groups = query_109_groups_0, pad = query_109_pad_0, pad_type = query_109_pad_type_0, strides = query_109_strides_0, weight = layers_18_self_attn_q_proj_weight_to_fp16_palettized, x = obj_149_cast_fp16)[name = string("query_109_cast_fp16")];
+            string current_key_73_pad_type_0 = const()[name = string("current_key_73_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> current_key_73_strides_0 = const()[name = string("current_key_73_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> current_key_73_pad_0 = const()[name = string("current_key_73_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> current_key_73_dilations_0 = const()[name = string("current_key_73_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 current_key_73_groups_0 = const()[name = string("current_key_73_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 2048, 1, 1]> layers_18_self_attn_k_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(910484224))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(912581440))))[name = string("layers_18_self_attn_k_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> current_key_73_cast_fp16 = conv(dilations = current_key_73_dilations_0, groups = current_key_73_groups_0, pad = current_key_73_pad_0, pad_type = current_key_73_pad_type_0, strides = current_key_73_strides_0, weight = layers_18_self_attn_k_proj_weight_to_fp16_palettized, x = obj_149_cast_fp16)[name = string("current_key_73_cast_fp16")];
+            string current_value_37_pad_type_0 = const()[name = string("current_value_37_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> current_value_37_strides_0 = const()[name = string("current_value_37_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> current_value_37_pad_0 = const()[name = string("current_value_37_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> current_value_37_dilations_0 = const()[name = string("current_value_37_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 current_value_37_groups_0 = const()[name = string("current_value_37_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 2048, 1, 1]> layers_18_self_attn_v_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(912582016))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(914679232))))[name = string("layers_18_self_attn_v_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> current_value_37_cast_fp16 = conv(bias = layers_0_self_attn_v_proj_bias_to_fp16, dilations = current_value_37_dilations_0, groups = current_value_37_groups_0, pad = current_value_37_pad_0, pad_type = current_value_37_pad_type_0, strides = current_value_37_strides_0, weight = layers_18_self_attn_v_proj_weight_to_fp16_palettized, x = obj_149_cast_fp16)[name = string("current_value_37_cast_fp16")];
+            tensor<int32, [4]> var_7197 = const()[name = string("op_7197"), val = tensor<int32, [4]>([16, 128, 1, 1])];
+            tensor<fp16, [16, 128, 1, 1]> inputs_145_cast_fp16 = reshape(shape = var_7197, x = query_109_cast_fp16)[name = string("inputs_145_cast_fp16")];
+            tensor<fp16, [16, 128, 1, 1]> inputs_sq_147_cast_fp16 = mul(x = inputs_145_cast_fp16, y = inputs_145_cast_fp16)[name = string("inputs_sq_147_cast_fp16")];
+            tensor<int32, [1]> variance_147_axes_0 = const()[name = string("variance_147_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_147_keep_dims_0 = const()[name = string("variance_147_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [16, 1, 1, 1]> variance_147_cast_fp16 = reduce_mean(axes = variance_147_axes_0, keep_dims = variance_147_keep_dims_0, x = inputs_sq_147_cast_fp16)[name = string("variance_147_cast_fp16")];
+            fp16 var_7203_to_fp16 = const()[name = string("op_7203_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [16, 1, 1, 1]> var_7204_cast_fp16 = add(x = variance_147_cast_fp16, y = var_7203_to_fp16)[name = string("op_7204_cast_fp16")];
+            fp32 var_7205_epsilon_0 = const()[name = string("op_7205_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [16, 1, 1, 1]> var_7205_cast_fp16 = rsqrt(epsilon = var_7205_epsilon_0, x = var_7204_cast_fp16)[name = string("op_7205_cast_fp16")];
+            tensor<fp16, [16, 128, 1, 1]> hidden_states_183_cast_fp16 = mul(x = inputs_145_cast_fp16, y = var_7205_cast_fp16)[name = string("hidden_states_183_cast_fp16")];
+            tensor<fp16, [1, 128, 1, 1]> w_147_to_fp16 = const()[name = string("w_147_to_fp16"), val = tensor<fp16, [1, 128, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(914679808)))];
+            tensor<fp16, [16, 128, 1, 1]> query_normed_37_cast_fp16 = mul(x = w_147_to_fp16, y = hidden_states_183_cast_fp16)[name = string("query_normed_37_cast_fp16")];
+            tensor<int32, [4]> var_7213 = const()[name = string("op_7213"), val = tensor<int32, [4]>([8, 128, 1, 1])];
+            tensor<fp16, [8, 128, 1, 1]> inputs_147_cast_fp16 = reshape(shape = var_7213, x = current_key_73_cast_fp16)[name = string("inputs_147_cast_fp16")];
+            tensor<fp16, [8, 128, 1, 1]> inputs_sq_149_cast_fp16 = mul(x = inputs_147_cast_fp16, y = inputs_147_cast_fp16)[name = string("inputs_sq_149_cast_fp16")];
+            tensor<int32, [1]> variance_149_axes_0 = const()[name = string("variance_149_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_149_keep_dims_0 = const()[name = string("variance_149_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [8, 1, 1, 1]> variance_149_cast_fp16 = reduce_mean(axes = variance_149_axes_0, keep_dims = variance_149_keep_dims_0, x = inputs_sq_149_cast_fp16)[name = string("variance_149_cast_fp16")];
+            fp16 var_7219_to_fp16 = const()[name = string("op_7219_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [8, 1, 1, 1]> var_7220_cast_fp16 = add(x = variance_149_cast_fp16, y = var_7219_to_fp16)[name = string("op_7220_cast_fp16")];
+            fp32 var_7221_epsilon_0 = const()[name = string("op_7221_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [8, 1, 1, 1]> var_7221_cast_fp16 = rsqrt(epsilon = var_7221_epsilon_0, x = var_7220_cast_fp16)[name = string("op_7221_cast_fp16")];
+            tensor<fp16, [8, 128, 1, 1]> hidden_states_185_cast_fp16 = mul(x = inputs_147_cast_fp16, y = var_7221_cast_fp16)[name = string("hidden_states_185_cast_fp16")];
+            tensor<fp16, [1, 128, 1, 1]> w_149_to_fp16 = const()[name = string("w_149_to_fp16"), val = tensor<fp16, [1, 128, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(914680128)))];
+            tensor<fp16, [8, 128, 1, 1]> current_key_normed_37_cast_fp16 = mul(x = w_149_to_fp16, y = hidden_states_185_cast_fp16)[name = string("current_key_normed_37_cast_fp16")];
+            tensor<int32, [4]> var_7239 = const()[name = string("op_7239"), val = tensor<int32, [4]>([1, 16, 128, -1])];
+            tensor<fp16, [1, 16, 128, 1]> mh_q_109_cast_fp16 = reshape(shape = var_7239, x = query_normed_37_cast_fp16)[name = string("mh_q_109_cast_fp16")];
+            tensor<int32, [4]> var_7241 = const()[name = string("op_7241"), val = tensor<int32, [4]>([1, 8, 128, -1])];
+            tensor<fp16, [1, 8, 128, 1]> mh_k_73_cast_fp16 = reshape(shape = var_7241, x = current_key_normed_37_cast_fp16)[name = string("mh_k_73_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> var_7245_cast_fp16 = mul(x = mh_q_109_cast_fp16, y = cos_1_cast_fp16)[name = string("op_7245_cast_fp16")];
+            tensor<int32, [4]> var_7250_begin_0 = const()[name = string("op_7250_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_7250_end_0 = const()[name = string("op_7250_end_0"), val = tensor<int32, [4]>([1, 16, 64, 1])];
+            tensor<bool, [4]> var_7250_end_mask_0 = const()[name = string("op_7250_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 16, 64, 1]> var_7250_cast_fp16 = slice_by_index(begin = var_7250_begin_0, end = var_7250_end_0, end_mask = var_7250_end_mask_0, x = mh_q_109_cast_fp16)[name = string("op_7250_cast_fp16")];
+            tensor<int32, [4]> var_7256_begin_0 = const()[name = string("op_7256_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_7256_end_0 = const()[name = string("op_7256_end_0"), val = tensor<int32, [4]>([1, 16, 128, 1])];
+            tensor<bool, [4]> var_7256_end_mask_0 = const()[name = string("op_7256_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 16, 64, 1]> var_7256_cast_fp16 = slice_by_index(begin = var_7256_begin_0, end = var_7256_end_0, end_mask = var_7256_end_mask_0, x = mh_q_109_cast_fp16)[name = string("op_7256_cast_fp16")];
+            fp16 const_431_promoted_to_fp16 = const()[name = string("const_431_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 16, 64, 1]> var_7258_cast_fp16 = mul(x = var_7256_cast_fp16, y = const_431_promoted_to_fp16)[name = string("op_7258_cast_fp16")];
+            bool var_7260_interleave_0 = const()[name = string("op_7260_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 1]> var_7260_cast_fp16 = concat(axis = var_7138, interleave = var_7260_interleave_0, values = (var_7258_cast_fp16, var_7250_cast_fp16))[name = string("op_7260_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> var_7261_cast_fp16 = mul(x = var_7260_cast_fp16, y = sin_1_cast_fp16)[name = string("op_7261_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> mh_q_111_cast_fp16 = add(x = var_7245_cast_fp16, y = var_7261_cast_fp16)[name = string("mh_q_111_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> var_7263_cast_fp16 = mul(x = mh_k_73_cast_fp16, y = cos_1_cast_fp16)[name = string("op_7263_cast_fp16")];
+            tensor<int32, [4]> var_7268_begin_0 = const()[name = string("op_7268_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_7268_end_0 = const()[name = string("op_7268_end_0"), val = tensor<int32, [4]>([1, 8, 64, 1])];
+            tensor<bool, [4]> var_7268_end_mask_0 = const()[name = string("op_7268_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 8, 64, 1]> var_7268_cast_fp16 = slice_by_index(begin = var_7268_begin_0, end = var_7268_end_0, end_mask = var_7268_end_mask_0, x = mh_k_73_cast_fp16)[name = string("op_7268_cast_fp16")];
+            tensor<int32, [4]> var_7274_begin_0 = const()[name = string("op_7274_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_7274_end_0 = const()[name = string("op_7274_end_0"), val = tensor<int32, [4]>([1, 8, 128, 1])];
+            tensor<bool, [4]> var_7274_end_mask_0 = const()[name = string("op_7274_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 8, 64, 1]> var_7274_cast_fp16 = slice_by_index(begin = var_7274_begin_0, end = var_7274_end_0, end_mask = var_7274_end_mask_0, x = mh_k_73_cast_fp16)[name = string("op_7274_cast_fp16")];
+            fp16 const_434_promoted_to_fp16 = const()[name = string("const_434_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 64, 1]> var_7276_cast_fp16 = mul(x = var_7274_cast_fp16, y = const_434_promoted_to_fp16)[name = string("op_7276_cast_fp16")];
+            bool var_7278_interleave_0 = const()[name = string("op_7278_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 128, 1]> var_7278_cast_fp16 = concat(axis = var_7138, interleave = var_7278_interleave_0, values = (var_7276_cast_fp16, var_7268_cast_fp16))[name = string("op_7278_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> var_7279_cast_fp16 = mul(x = var_7278_cast_fp16, y = sin_1_cast_fp16)[name = string("op_7279_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> mh_k_75_cast_fp16 = add(x = var_7263_cast_fp16, y = var_7279_cast_fp16)[name = string("mh_k_75_cast_fp16")];
+            tensor<int32, [4]> var_7283 = const()[name = string("op_7283"), val = tensor<int32, [4]>([1, 1024, 1, 1])];
+            tensor<fp16, [1, 1024, 1, 1]> current_key_75_cast_fp16 = reshape(shape = var_7283, x = mh_k_75_cast_fp16)[name = string("current_key_75_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_7290_cast_fp16 = mul(x = var_101_cast_fp16_18, y = var_323_cast_fp16)[name = string("op_7290_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_7291_cast_fp16 = mul(x = current_key_75_cast_fp16, y = var_321_cast_fp16)[name = string("op_7291_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> key_111_cast_fp16 = add(x = var_7290_cast_fp16, y = var_7291_cast_fp16)[name = string("key_111_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_7294_cast_fp16 = mul(x = var_132_cast_fp16_18, y = var_323_cast_fp16)[name = string("op_7294_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_7295_cast_fp16 = mul(x = current_value_37_cast_fp16, y = var_321_cast_fp16)[name = string("op_7295_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> value_73_cast_fp16 = add(x = var_7294_cast_fp16, y = var_7295_cast_fp16)[name = string("value_73_cast_fp16")];
+            tensor<int32, [4]> var_7299 = const()[name = string("op_7299"), val = tensor<int32, [4]>([1, 8, 128, 256])];
+            tensor<fp16, [1, 8, 128, 256]> key_heads_73_cast_fp16 = reshape(shape = var_7299, x = key_111_cast_fp16)[name = string("key_heads_73_cast_fp16")];
+            tensor<int32, [4]> var_7301 = const()[name = string("op_7301"), val = tensor<int32, [4]>([1, 8, 128, 256])];
+            tensor<fp16, [1, 8, 128, 256]> value_heads_73_cast_fp16 = reshape(shape = var_7301, x = value_73_cast_fp16)[name = string("value_heads_73_cast_fp16")];
+            tensor<int32, [4]> var_7304_begin_0 = const()[name = string("op_7304_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_7304_end_0 = const()[name = string("op_7304_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_7304_end_mask_0 = const()[name = string("op_7304_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_7304_cast_fp16 = slice_by_index(begin = var_7304_begin_0, end = var_7304_end_0, end_mask = var_7304_end_mask_0, x = key_heads_73_cast_fp16)[name = string("op_7304_cast_fp16")];
+            tensor<int32, [4]> var_7308_begin_0 = const()[name = string("op_7308_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_7308_end_0 = const()[name = string("op_7308_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_7308_end_mask_0 = const()[name = string("op_7308_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_7308_cast_fp16 = slice_by_index(begin = var_7308_begin_0, end = var_7308_end_0, end_mask = var_7308_end_mask_0, x = value_heads_73_cast_fp16)[name = string("op_7308_cast_fp16")];
+            tensor<int32, [4]> var_7320_begin_0 = const()[name = string("op_7320_begin_0"), val = tensor<int32, [4]>([0, 1, 0, 0])];
+            tensor<int32, [4]> var_7320_end_0 = const()[name = string("op_7320_end_0"), val = tensor<int32, [4]>([1, 2, 128, 256])];
+            tensor<bool, [4]> var_7320_end_mask_0 = const()[name = string("op_7320_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_7320_cast_fp16 = slice_by_index(begin = var_7320_begin_0, end = var_7320_end_0, end_mask = var_7320_end_mask_0, x = key_heads_73_cast_fp16)[name = string("op_7320_cast_fp16")];
+            tensor<int32, [4]> var_7324_begin_0 = const()[name = string("op_7324_begin_0"), val = tensor<int32, [4]>([0, 1, 0, 0])];
+            tensor<int32, [4]> var_7324_end_0 = const()[name = string("op_7324_end_0"), val = tensor<int32, [4]>([1, 2, 128, 256])];
+            tensor<bool, [4]> var_7324_end_mask_0 = const()[name = string("op_7324_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_7324_cast_fp16 = slice_by_index(begin = var_7324_begin_0, end = var_7324_end_0, end_mask = var_7324_end_mask_0, x = value_heads_73_cast_fp16)[name = string("op_7324_cast_fp16")];
+            tensor<int32, [4]> var_7336_begin_0 = const()[name = string("op_7336_begin_0"), val = tensor<int32, [4]>([0, 2, 0, 0])];
+            tensor<int32, [4]> var_7336_end_0 = const()[name = string("op_7336_end_0"), val = tensor<int32, [4]>([1, 3, 128, 256])];
+            tensor<bool, [4]> var_7336_end_mask_0 = const()[name = string("op_7336_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_7336_cast_fp16 = slice_by_index(begin = var_7336_begin_0, end = var_7336_end_0, end_mask = var_7336_end_mask_0, x = key_heads_73_cast_fp16)[name = string("op_7336_cast_fp16")];
+            tensor<int32, [4]> var_7340_begin_0 = const()[name = string("op_7340_begin_0"), val = tensor<int32, [4]>([0, 2, 0, 0])];
+            tensor<int32, [4]> var_7340_end_0 = const()[name = string("op_7340_end_0"), val = tensor<int32, [4]>([1, 3, 128, 256])];
+            tensor<bool, [4]> var_7340_end_mask_0 = const()[name = string("op_7340_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_7340_cast_fp16 = slice_by_index(begin = var_7340_begin_0, end = var_7340_end_0, end_mask = var_7340_end_mask_0, x = value_heads_73_cast_fp16)[name = string("op_7340_cast_fp16")];
+            tensor<int32, [4]> var_7352_begin_0 = const()[name = string("op_7352_begin_0"), val = tensor<int32, [4]>([0, 3, 0, 0])];
+            tensor<int32, [4]> var_7352_end_0 = const()[name = string("op_7352_end_0"), val = tensor<int32, [4]>([1, 4, 128, 256])];
+            tensor<bool, [4]> var_7352_end_mask_0 = const()[name = string("op_7352_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_7352_cast_fp16 = slice_by_index(begin = var_7352_begin_0, end = var_7352_end_0, end_mask = var_7352_end_mask_0, x = key_heads_73_cast_fp16)[name = string("op_7352_cast_fp16")];
+            tensor<int32, [4]> var_7356_begin_0 = const()[name = string("op_7356_begin_0"), val = tensor<int32, [4]>([0, 3, 0, 0])];
+            tensor<int32, [4]> var_7356_end_0 = const()[name = string("op_7356_end_0"), val = tensor<int32, [4]>([1, 4, 128, 256])];
+            tensor<bool, [4]> var_7356_end_mask_0 = const()[name = string("op_7356_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_7356_cast_fp16 = slice_by_index(begin = var_7356_begin_0, end = var_7356_end_0, end_mask = var_7356_end_mask_0, x = value_heads_73_cast_fp16)[name = string("op_7356_cast_fp16")];
+            tensor<int32, [4]> var_7368_begin_0 = const()[name = string("op_7368_begin_0"), val = tensor<int32, [4]>([0, 4, 0, 0])];
+            tensor<int32, [4]> var_7368_end_0 = const()[name = string("op_7368_end_0"), val = tensor<int32, [4]>([1, 5, 128, 256])];
+            tensor<bool, [4]> var_7368_end_mask_0 = const()[name = string("op_7368_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_7368_cast_fp16 = slice_by_index(begin = var_7368_begin_0, end = var_7368_end_0, end_mask = var_7368_end_mask_0, x = key_heads_73_cast_fp16)[name = string("op_7368_cast_fp16")];
+            tensor<int32, [4]> var_7372_begin_0 = const()[name = string("op_7372_begin_0"), val = tensor<int32, [4]>([0, 4, 0, 0])];
+            tensor<int32, [4]> var_7372_end_0 = const()[name = string("op_7372_end_0"), val = tensor<int32, [4]>([1, 5, 128, 256])];
+            tensor<bool, [4]> var_7372_end_mask_0 = const()[name = string("op_7372_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_7372_cast_fp16 = slice_by_index(begin = var_7372_begin_0, end = var_7372_end_0, end_mask = var_7372_end_mask_0, x = value_heads_73_cast_fp16)[name = string("op_7372_cast_fp16")];
+            tensor<int32, [4]> var_7384_begin_0 = const()[name = string("op_7384_begin_0"), val = tensor<int32, [4]>([0, 5, 0, 0])];
+            tensor<int32, [4]> var_7384_end_0 = const()[name = string("op_7384_end_0"), val = tensor<int32, [4]>([1, 6, 128, 256])];
+            tensor<bool, [4]> var_7384_end_mask_0 = const()[name = string("op_7384_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_7384_cast_fp16 = slice_by_index(begin = var_7384_begin_0, end = var_7384_end_0, end_mask = var_7384_end_mask_0, x = key_heads_73_cast_fp16)[name = string("op_7384_cast_fp16")];
+            tensor<int32, [4]> var_7388_begin_0 = const()[name = string("op_7388_begin_0"), val = tensor<int32, [4]>([0, 5, 0, 0])];
+            tensor<int32, [4]> var_7388_end_0 = const()[name = string("op_7388_end_0"), val = tensor<int32, [4]>([1, 6, 128, 256])];
+            tensor<bool, [4]> var_7388_end_mask_0 = const()[name = string("op_7388_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_7388_cast_fp16 = slice_by_index(begin = var_7388_begin_0, end = var_7388_end_0, end_mask = var_7388_end_mask_0, x = value_heads_73_cast_fp16)[name = string("op_7388_cast_fp16")];
+            tensor<int32, [4]> var_7400_begin_0 = const()[name = string("op_7400_begin_0"), val = tensor<int32, [4]>([0, 6, 0, 0])];
+            tensor<int32, [4]> var_7400_end_0 = const()[name = string("op_7400_end_0"), val = tensor<int32, [4]>([1, 7, 128, 256])];
+            tensor<bool, [4]> var_7400_end_mask_0 = const()[name = string("op_7400_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_7400_cast_fp16 = slice_by_index(begin = var_7400_begin_0, end = var_7400_end_0, end_mask = var_7400_end_mask_0, x = key_heads_73_cast_fp16)[name = string("op_7400_cast_fp16")];
+            tensor<int32, [4]> var_7404_begin_0 = const()[name = string("op_7404_begin_0"), val = tensor<int32, [4]>([0, 6, 0, 0])];
+            tensor<int32, [4]> var_7404_end_0 = const()[name = string("op_7404_end_0"), val = tensor<int32, [4]>([1, 7, 128, 256])];
+            tensor<bool, [4]> var_7404_end_mask_0 = const()[name = string("op_7404_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_7404_cast_fp16 = slice_by_index(begin = var_7404_begin_0, end = var_7404_end_0, end_mask = var_7404_end_mask_0, x = value_heads_73_cast_fp16)[name = string("op_7404_cast_fp16")];
+            tensor<int32, [4]> var_7416_begin_0 = const()[name = string("op_7416_begin_0"), val = tensor<int32, [4]>([0, 7, 0, 0])];
+            tensor<int32, [4]> var_7416_end_0 = const()[name = string("op_7416_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_7416_end_mask_0 = const()[name = string("op_7416_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_7416_cast_fp16 = slice_by_index(begin = var_7416_begin_0, end = var_7416_end_0, end_mask = var_7416_end_mask_0, x = key_heads_73_cast_fp16)[name = string("op_7416_cast_fp16")];
+            tensor<int32, [4]> var_7420_begin_0 = const()[name = string("op_7420_begin_0"), val = tensor<int32, [4]>([0, 7, 0, 0])];
+            tensor<int32, [4]> var_7420_end_0 = const()[name = string("op_7420_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_7420_end_mask_0 = const()[name = string("op_7420_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_7420_cast_fp16 = slice_by_index(begin = var_7420_begin_0, end = var_7420_end_0, end_mask = var_7420_end_mask_0, x = value_heads_73_cast_fp16)[name = string("op_7420_cast_fp16")];
+            bool key_heads_75_interleave_0 = const()[name = string("key_heads_75_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 256]> key_heads_75_cast_fp16 = concat(axis = var_7146, interleave = key_heads_75_interleave_0, values = (var_7304_cast_fp16, var_7304_cast_fp16, var_7320_cast_fp16, var_7320_cast_fp16, var_7336_cast_fp16, var_7336_cast_fp16, var_7352_cast_fp16, var_7352_cast_fp16, var_7368_cast_fp16, var_7368_cast_fp16, var_7384_cast_fp16, var_7384_cast_fp16, var_7400_cast_fp16, var_7400_cast_fp16, var_7416_cast_fp16, var_7416_cast_fp16))[name = string("key_heads_75_cast_fp16")];
+            bool value_heads_75_interleave_0 = const()[name = string("value_heads_75_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 256]> value_heads_75_cast_fp16 = concat(axis = var_7146, interleave = value_heads_75_interleave_0, values = (var_7308_cast_fp16, var_7308_cast_fp16, var_7324_cast_fp16, var_7324_cast_fp16, var_7340_cast_fp16, var_7340_cast_fp16, var_7356_cast_fp16, var_7356_cast_fp16, var_7372_cast_fp16, var_7372_cast_fp16, var_7388_cast_fp16, var_7388_cast_fp16, var_7404_cast_fp16, var_7404_cast_fp16, var_7420_cast_fp16, var_7420_cast_fp16))[name = string("value_heads_75_cast_fp16")];
+            fp16 var_7443_to_fp16 = const()[name = string("op_7443_to_fp16"), val = fp16(0x1.6ap-4)];
+            tensor<fp16, [1, 16, 128, 1]> var_7444_cast_fp16 = mul(x = mh_q_111_cast_fp16, y = var_7443_to_fp16)[name = string("op_7444_cast_fp16")];
+            bool mh_w_73_transpose_x_0 = const()[name = string("mh_w_73_transpose_x_0"), val = bool(true)];
+            bool mh_w_73_transpose_y_0 = const()[name = string("mh_w_73_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 1, 256]> mh_w_73_cast_fp16 = matmul(transpose_x = mh_w_73_transpose_x_0, transpose_y = mh_w_73_transpose_y_0, x = var_7444_cast_fp16, y = key_heads_75_cast_fp16)[name = string("mh_w_73_cast_fp16")];
+            tensor<fp16, [1, 16, 1, 256]> mh_w_75_cast_fp16 = add(x = mh_w_73_cast_fp16, y = var_487_cast_fp16)[name = string("mh_w_75_cast_fp16")];
+            tensor<fp16, [1, 16, 1, 256]> var_7456_cast_fp16 = softmax(axis = var_7128, x = mh_w_75_cast_fp16)[name = string("op_7456_cast_fp16")];
+            bool attn_37_transpose_x_0 = const()[name = string("attn_37_transpose_x_0"), val = bool(false)];
+            bool attn_37_transpose_y_0 = const()[name = string("attn_37_transpose_y_0"), val = bool(true)];
+            tensor<fp16, [1, 16, 128, 1]> attn_37_cast_fp16 = matmul(transpose_x = attn_37_transpose_x_0, transpose_y = attn_37_transpose_y_0, x = value_heads_75_cast_fp16, y = var_7456_cast_fp16)[name = string("attn_37_cast_fp16")];
+            tensor<int32, [4]> var_7461 = const()[name = string("op_7461"), val = tensor<int32, [4]>([1, -1, 1, 1])];
+            tensor<fp16, [1, 2048, 1, 1]> input_145_cast_fp16 = reshape(shape = var_7461, x = attn_37_cast_fp16)[name = string("input_145_cast_fp16")];
+            string obj_155_pad_type_0 = const()[name = string("obj_155_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> obj_155_strides_0 = const()[name = string("obj_155_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> obj_155_pad_0 = const()[name = string("obj_155_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> obj_155_dilations_0 = const()[name = string("obj_155_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 obj_155_groups_0 = const()[name = string("obj_155_groups_0"), val = int32(1)];
+            tensor<fp16, [2048, 2048, 1, 1]> layers_18_self_attn_o_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [2048, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(914680448))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(918874816))))[name = string("layers_18_self_attn_o_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1, 1]> obj_155_cast_fp16 = conv(bias = layers_0_self_attn_q_proj_bias_to_fp16, dilations = obj_155_dilations_0, groups = obj_155_groups_0, pad = obj_155_pad_0, pad_type = obj_155_pad_type_0, strides = obj_155_strides_0, weight = layers_18_self_attn_o_proj_weight_to_fp16_palettized, x = input_145_cast_fp16)[name = string("obj_155_cast_fp16")];
+            tensor<fp16, [1, 2048, 1, 1]> inputs_149_cast_fp16 = add(x = inputs_143_cast_fp16, y = obj_155_cast_fp16)[name = string("inputs_149_cast_fp16")];
+            tensor<fp16, [1, 2048, 1, 1]> inputs_sq_151_cast_fp16 = mul(x = inputs_149_cast_fp16, y = inputs_149_cast_fp16)[name = string("inputs_sq_151_cast_fp16")];
+            tensor<int32, [1]> variance_151_axes_0 = const()[name = string("variance_151_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_151_keep_dims_0 = const()[name = string("variance_151_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 1, 1, 1]> variance_151_cast_fp16 = reduce_mean(axes = variance_151_axes_0, keep_dims = variance_151_keep_dims_0, x = inputs_sq_151_cast_fp16)[name = string("variance_151_cast_fp16")];
+            fp16 var_7479_to_fp16 = const()[name = string("op_7479_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 1, 1]> var_7480_cast_fp16 = add(x = variance_151_cast_fp16, y = var_7479_to_fp16)[name = string("op_7480_cast_fp16")];
+            fp32 var_7481_epsilon_0 = const()[name = string("op_7481_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 1, 1, 1]> var_7481_cast_fp16 = rsqrt(epsilon = var_7481_epsilon_0, x = var_7480_cast_fp16)[name = string("op_7481_cast_fp16")];
+            tensor<fp16, [1, 2048, 1, 1]> hidden_states_187_cast_fp16 = mul(x = inputs_149_cast_fp16, y = var_7481_cast_fp16)[name = string("hidden_states_187_cast_fp16")];
+            tensor<fp16, [1, 2048, 1, 1]> w_151_to_fp16 = const()[name = string("w_151_to_fp16"), val = tensor<fp16, [1, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(918875392)))];
+            tensor<fp16, [1, 2048, 1, 1]> input_147_cast_fp16 = mul(x = w_151_to_fp16, y = hidden_states_187_cast_fp16)[name = string("input_147_cast_fp16")];
+            string input_149_pad_type_0 = const()[name = string("input_149_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> input_149_strides_0 = const()[name = string("input_149_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> input_149_pad_0 = const()[name = string("input_149_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> input_149_dilations_0 = const()[name = string("input_149_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 input_149_groups_0 = const()[name = string("input_149_groups_0"), val = int32(1)];
+            tensor<fp16, [6144, 2048, 1, 1]> layers_18_mlp_gate_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [6144, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(918879552))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(931462528))))[name = string("layers_18_mlp_gate_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 6144, 1, 1]> input_149_cast_fp16 = conv(dilations = input_149_dilations_0, groups = input_149_groups_0, pad = input_149_pad_0, pad_type = input_149_pad_type_0, strides = input_149_strides_0, weight = layers_18_mlp_gate_proj_weight_to_fp16_palettized, x = input_147_cast_fp16)[name = string("input_149_cast_fp16")];
+            tensor<fp16, [1, 6144, 1, 1]> var_7495_cast_fp16 = silu(x = input_149_cast_fp16)[name = string("op_7495_cast_fp16")];
+            string var_7501_pad_type_0 = const()[name = string("op_7501_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_7501_strides_0 = const()[name = string("op_7501_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_7501_pad_0 = const()[name = string("op_7501_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_7501_dilations_0 = const()[name = string("op_7501_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_7501_groups_0 = const()[name = string("op_7501_groups_0"), val = int32(1)];
+            tensor<fp16, [6144, 2048, 1, 1]> layers_18_mlp_up_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [6144, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(931463104))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(944046080))))[name = string("layers_18_mlp_up_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 6144, 1, 1]> var_7501_cast_fp16 = conv(dilations = var_7501_dilations_0, groups = var_7501_groups_0, pad = var_7501_pad_0, pad_type = var_7501_pad_type_0, strides = var_7501_strides_0, weight = layers_18_mlp_up_proj_weight_to_fp16_palettized, x = input_147_cast_fp16)[name = string("op_7501_cast_fp16")];
+            tensor<fp16, [1, 6144, 1, 1]> input_151_cast_fp16 = mul(x = var_7495_cast_fp16, y = var_7501_cast_fp16)[name = string("input_151_cast_fp16")];
+            string hidden_states_189_pad_type_0 = const()[name = string("hidden_states_189_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> hidden_states_189_strides_0 = const()[name = string("hidden_states_189_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> hidden_states_189_pad_0 = const()[name = string("hidden_states_189_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> hidden_states_189_dilations_0 = const()[name = string("hidden_states_189_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 hidden_states_189_groups_0 = const()[name = string("hidden_states_189_groups_0"), val = int32(1)];
+            tensor<fp16, [2048, 6144, 1, 1]> layers_18_mlp_down_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [2048, 6144, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(944046656))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(956629632))))[name = string("layers_18_mlp_down_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1, 1]> hidden_states_189_cast_fp16 = conv(dilations = hidden_states_189_dilations_0, groups = hidden_states_189_groups_0, pad = hidden_states_189_pad_0, pad_type = hidden_states_189_pad_type_0, strides = hidden_states_189_strides_0, weight = layers_18_mlp_down_proj_weight_to_fp16_palettized, x = input_151_cast_fp16)[name = string("hidden_states_189_cast_fp16")];
+            tensor<fp16, [1, 2048, 1, 1]> inputs_151_cast_fp16 = add(x = inputs_149_cast_fp16, y = hidden_states_189_cast_fp16)[name = string("inputs_151_cast_fp16")];
+            int32 var_7515 = const()[name = string("op_7515"), val = int32(3)];
+            int32 var_7525 = const()[name = string("op_7525"), val = int32(-2)];
+            int32 var_7533 = const()[name = string("op_7533"), val = int32(1)];
+            tensor<fp16, [1, 2048, 1, 1]> inputs_sq_153_cast_fp16 = mul(x = inputs_151_cast_fp16, y = inputs_151_cast_fp16)[name = string("inputs_sq_153_cast_fp16")];
+            tensor<int32, [1]> variance_153_axes_0 = const()[name = string("variance_153_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_153_keep_dims_0 = const()[name = string("variance_153_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 1, 1, 1]> variance_153_cast_fp16 = reduce_mean(axes = variance_153_axes_0, keep_dims = variance_153_keep_dims_0, x = inputs_sq_153_cast_fp16)[name = string("variance_153_cast_fp16")];
+            fp16 var_7545_to_fp16 = const()[name = string("op_7545_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 1, 1]> var_7546_cast_fp16 = add(x = variance_153_cast_fp16, y = var_7545_to_fp16)[name = string("op_7546_cast_fp16")];
+            fp32 var_7547_epsilon_0 = const()[name = string("op_7547_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 1, 1, 1]> var_7547_cast_fp16 = rsqrt(epsilon = var_7547_epsilon_0, x = var_7546_cast_fp16)[name = string("op_7547_cast_fp16")];
+            tensor<fp16, [1, 2048, 1, 1]> hidden_states_191_cast_fp16 = mul(x = inputs_151_cast_fp16, y = var_7547_cast_fp16)[name = string("hidden_states_191_cast_fp16")];
+            tensor<fp16, [1, 2048, 1, 1]> w_153_to_fp16 = const()[name = string("w_153_to_fp16"), val = tensor<fp16, [1, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(956630208)))];
+            tensor<fp16, [1, 2048, 1, 1]> obj_157_cast_fp16 = mul(x = w_153_to_fp16, y = hidden_states_191_cast_fp16)[name = string("obj_157_cast_fp16")];
+            string query_115_pad_type_0 = const()[name = string("query_115_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> query_115_strides_0 = const()[name = string("query_115_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> query_115_pad_0 = const()[name = string("query_115_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> query_115_dilations_0 = const()[name = string("query_115_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 query_115_groups_0 = const()[name = string("query_115_groups_0"), val = int32(1)];
+            tensor<fp16, [2048, 2048, 1, 1]> layers_19_self_attn_q_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [2048, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(956634368))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(960828736))))[name = string("layers_19_self_attn_q_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1, 1]> query_115_cast_fp16 = conv(bias = layers_0_self_attn_q_proj_bias_to_fp16, dilations = query_115_dilations_0, groups = query_115_groups_0, pad = query_115_pad_0, pad_type = query_115_pad_type_0, strides = query_115_strides_0, weight = layers_19_self_attn_q_proj_weight_to_fp16_palettized, x = obj_157_cast_fp16)[name = string("query_115_cast_fp16")];
+            string current_key_77_pad_type_0 = const()[name = string("current_key_77_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> current_key_77_strides_0 = const()[name = string("current_key_77_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> current_key_77_pad_0 = const()[name = string("current_key_77_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> current_key_77_dilations_0 = const()[name = string("current_key_77_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 current_key_77_groups_0 = const()[name = string("current_key_77_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 2048, 1, 1]> layers_19_self_attn_k_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(960829312))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(962926528))))[name = string("layers_19_self_attn_k_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> current_key_77_cast_fp16 = conv(dilations = current_key_77_dilations_0, groups = current_key_77_groups_0, pad = current_key_77_pad_0, pad_type = current_key_77_pad_type_0, strides = current_key_77_strides_0, weight = layers_19_self_attn_k_proj_weight_to_fp16_palettized, x = obj_157_cast_fp16)[name = string("current_key_77_cast_fp16")];
+            string current_value_39_pad_type_0 = const()[name = string("current_value_39_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> current_value_39_strides_0 = const()[name = string("current_value_39_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> current_value_39_pad_0 = const()[name = string("current_value_39_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> current_value_39_dilations_0 = const()[name = string("current_value_39_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 current_value_39_groups_0 = const()[name = string("current_value_39_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 2048, 1, 1]> layers_19_self_attn_v_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(962927104))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(965024320))))[name = string("layers_19_self_attn_v_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> current_value_39_cast_fp16 = conv(bias = layers_0_self_attn_v_proj_bias_to_fp16, dilations = current_value_39_dilations_0, groups = current_value_39_groups_0, pad = current_value_39_pad_0, pad_type = current_value_39_pad_type_0, strides = current_value_39_strides_0, weight = layers_19_self_attn_v_proj_weight_to_fp16_palettized, x = obj_157_cast_fp16)[name = string("current_value_39_cast_fp16")];
+            tensor<int32, [4]> var_7584 = const()[name = string("op_7584"), val = tensor<int32, [4]>([16, 128, 1, 1])];
+            tensor<fp16, [16, 128, 1, 1]> inputs_153_cast_fp16 = reshape(shape = var_7584, x = query_115_cast_fp16)[name = string("inputs_153_cast_fp16")];
+            tensor<fp16, [16, 128, 1, 1]> inputs_sq_155_cast_fp16 = mul(x = inputs_153_cast_fp16, y = inputs_153_cast_fp16)[name = string("inputs_sq_155_cast_fp16")];
+            tensor<int32, [1]> variance_155_axes_0 = const()[name = string("variance_155_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_155_keep_dims_0 = const()[name = string("variance_155_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [16, 1, 1, 1]> variance_155_cast_fp16 = reduce_mean(axes = variance_155_axes_0, keep_dims = variance_155_keep_dims_0, x = inputs_sq_155_cast_fp16)[name = string("variance_155_cast_fp16")];
+            fp16 var_7590_to_fp16 = const()[name = string("op_7590_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [16, 1, 1, 1]> var_7591_cast_fp16 = add(x = variance_155_cast_fp16, y = var_7590_to_fp16)[name = string("op_7591_cast_fp16")];
+            fp32 var_7592_epsilon_0 = const()[name = string("op_7592_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [16, 1, 1, 1]> var_7592_cast_fp16 = rsqrt(epsilon = var_7592_epsilon_0, x = var_7591_cast_fp16)[name = string("op_7592_cast_fp16")];
+            tensor<fp16, [16, 128, 1, 1]> hidden_states_193_cast_fp16 = mul(x = inputs_153_cast_fp16, y = var_7592_cast_fp16)[name = string("hidden_states_193_cast_fp16")];
+            tensor<fp16, [1, 128, 1, 1]> w_155_to_fp16 = const()[name = string("w_155_to_fp16"), val = tensor<fp16, [1, 128, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(965024896)))];
+            tensor<fp16, [16, 128, 1, 1]> query_normed_39_cast_fp16 = mul(x = w_155_to_fp16, y = hidden_states_193_cast_fp16)[name = string("query_normed_39_cast_fp16")];
+            tensor<int32, [4]> var_7600 = const()[name = string("op_7600"), val = tensor<int32, [4]>([8, 128, 1, 1])];
+            tensor<fp16, [8, 128, 1, 1]> inputs_155_cast_fp16 = reshape(shape = var_7600, x = current_key_77_cast_fp16)[name = string("inputs_155_cast_fp16")];
+            tensor<fp16, [8, 128, 1, 1]> inputs_sq_157_cast_fp16 = mul(x = inputs_155_cast_fp16, y = inputs_155_cast_fp16)[name = string("inputs_sq_157_cast_fp16")];
+            tensor<int32, [1]> variance_157_axes_0 = const()[name = string("variance_157_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_157_keep_dims_0 = const()[name = string("variance_157_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [8, 1, 1, 1]> variance_157_cast_fp16 = reduce_mean(axes = variance_157_axes_0, keep_dims = variance_157_keep_dims_0, x = inputs_sq_157_cast_fp16)[name = string("variance_157_cast_fp16")];
+            fp16 var_7606_to_fp16 = const()[name = string("op_7606_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [8, 1, 1, 1]> var_7607_cast_fp16 = add(x = variance_157_cast_fp16, y = var_7606_to_fp16)[name = string("op_7607_cast_fp16")];
+            fp32 var_7608_epsilon_0 = const()[name = string("op_7608_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [8, 1, 1, 1]> var_7608_cast_fp16 = rsqrt(epsilon = var_7608_epsilon_0, x = var_7607_cast_fp16)[name = string("op_7608_cast_fp16")];
+            tensor<fp16, [8, 128, 1, 1]> hidden_states_195_cast_fp16 = mul(x = inputs_155_cast_fp16, y = var_7608_cast_fp16)[name = string("hidden_states_195_cast_fp16")];
+            tensor<fp16, [1, 128, 1, 1]> w_157_to_fp16 = const()[name = string("w_157_to_fp16"), val = tensor<fp16, [1, 128, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(965025216)))];
+            tensor<fp16, [8, 128, 1, 1]> current_key_normed_39_cast_fp16 = mul(x = w_157_to_fp16, y = hidden_states_195_cast_fp16)[name = string("current_key_normed_39_cast_fp16")];
+            tensor<int32, [4]> var_7626 = const()[name = string("op_7626"), val = tensor<int32, [4]>([1, 16, 128, -1])];
+            tensor<fp16, [1, 16, 128, 1]> mh_q_115_cast_fp16 = reshape(shape = var_7626, x = query_normed_39_cast_fp16)[name = string("mh_q_115_cast_fp16")];
+            tensor<int32, [4]> var_7628 = const()[name = string("op_7628"), val = tensor<int32, [4]>([1, 8, 128, -1])];
+            tensor<fp16, [1, 8, 128, 1]> mh_k_77_cast_fp16 = reshape(shape = var_7628, x = current_key_normed_39_cast_fp16)[name = string("mh_k_77_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> var_7632_cast_fp16 = mul(x = mh_q_115_cast_fp16, y = cos_1_cast_fp16)[name = string("op_7632_cast_fp16")];
+            tensor<int32, [4]> var_7637_begin_0 = const()[name = string("op_7637_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_7637_end_0 = const()[name = string("op_7637_end_0"), val = tensor<int32, [4]>([1, 16, 64, 1])];
+            tensor<bool, [4]> var_7637_end_mask_0 = const()[name = string("op_7637_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 16, 64, 1]> var_7637_cast_fp16 = slice_by_index(begin = var_7637_begin_0, end = var_7637_end_0, end_mask = var_7637_end_mask_0, x = mh_q_115_cast_fp16)[name = string("op_7637_cast_fp16")];
+            tensor<int32, [4]> var_7643_begin_0 = const()[name = string("op_7643_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_7643_end_0 = const()[name = string("op_7643_end_0"), val = tensor<int32, [4]>([1, 16, 128, 1])];
+            tensor<bool, [4]> var_7643_end_mask_0 = const()[name = string("op_7643_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 16, 64, 1]> var_7643_cast_fp16 = slice_by_index(begin = var_7643_begin_0, end = var_7643_end_0, end_mask = var_7643_end_mask_0, x = mh_q_115_cast_fp16)[name = string("op_7643_cast_fp16")];
+            fp16 const_454_promoted_to_fp16 = const()[name = string("const_454_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 16, 64, 1]> var_7645_cast_fp16 = mul(x = var_7643_cast_fp16, y = const_454_promoted_to_fp16)[name = string("op_7645_cast_fp16")];
+            bool var_7647_interleave_0 = const()[name = string("op_7647_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 1]> var_7647_cast_fp16 = concat(axis = var_7525, interleave = var_7647_interleave_0, values = (var_7645_cast_fp16, var_7637_cast_fp16))[name = string("op_7647_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> var_7648_cast_fp16 = mul(x = var_7647_cast_fp16, y = sin_1_cast_fp16)[name = string("op_7648_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> mh_q_117_cast_fp16 = add(x = var_7632_cast_fp16, y = var_7648_cast_fp16)[name = string("mh_q_117_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> var_7650_cast_fp16 = mul(x = mh_k_77_cast_fp16, y = cos_1_cast_fp16)[name = string("op_7650_cast_fp16")];
+            tensor<int32, [4]> var_7655_begin_0 = const()[name = string("op_7655_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_7655_end_0 = const()[name = string("op_7655_end_0"), val = tensor<int32, [4]>([1, 8, 64, 1])];
+            tensor<bool, [4]> var_7655_end_mask_0 = const()[name = string("op_7655_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 8, 64, 1]> var_7655_cast_fp16 = slice_by_index(begin = var_7655_begin_0, end = var_7655_end_0, end_mask = var_7655_end_mask_0, x = mh_k_77_cast_fp16)[name = string("op_7655_cast_fp16")];
+            tensor<int32, [4]> var_7661_begin_0 = const()[name = string("op_7661_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_7661_end_0 = const()[name = string("op_7661_end_0"), val = tensor<int32, [4]>([1, 8, 128, 1])];
+            tensor<bool, [4]> var_7661_end_mask_0 = const()[name = string("op_7661_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 8, 64, 1]> var_7661_cast_fp16 = slice_by_index(begin = var_7661_begin_0, end = var_7661_end_0, end_mask = var_7661_end_mask_0, x = mh_k_77_cast_fp16)[name = string("op_7661_cast_fp16")];
+            fp16 const_457_promoted_to_fp16 = const()[name = string("const_457_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 64, 1]> var_7663_cast_fp16 = mul(x = var_7661_cast_fp16, y = const_457_promoted_to_fp16)[name = string("op_7663_cast_fp16")];
+            bool var_7665_interleave_0 = const()[name = string("op_7665_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 128, 1]> var_7665_cast_fp16 = concat(axis = var_7525, interleave = var_7665_interleave_0, values = (var_7663_cast_fp16, var_7655_cast_fp16))[name = string("op_7665_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> var_7666_cast_fp16 = mul(x = var_7665_cast_fp16, y = sin_1_cast_fp16)[name = string("op_7666_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> mh_k_79_cast_fp16 = add(x = var_7650_cast_fp16, y = var_7666_cast_fp16)[name = string("mh_k_79_cast_fp16")];
+            tensor<int32, [4]> var_7670 = const()[name = string("op_7670"), val = tensor<int32, [4]>([1, 1024, 1, 1])];
+            tensor<fp16, [1, 1024, 1, 1]> current_key_79_cast_fp16 = reshape(shape = var_7670, x = mh_k_79_cast_fp16)[name = string("current_key_79_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_7677_cast_fp16 = mul(x = var_101_cast_fp16_19, y = var_323_cast_fp16)[name = string("op_7677_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_7678_cast_fp16 = mul(x = current_key_79_cast_fp16, y = var_321_cast_fp16)[name = string("op_7678_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> key_117_cast_fp16 = add(x = var_7677_cast_fp16, y = var_7678_cast_fp16)[name = string("key_117_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_7681_cast_fp16 = mul(x = var_132_cast_fp16_19, y = var_323_cast_fp16)[name = string("op_7681_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_7682_cast_fp16 = mul(x = current_value_39_cast_fp16, y = var_321_cast_fp16)[name = string("op_7682_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> value_77_cast_fp16 = add(x = var_7681_cast_fp16, y = var_7682_cast_fp16)[name = string("value_77_cast_fp16")];
+            tensor<int32, [4]> var_7686 = const()[name = string("op_7686"), val = tensor<int32, [4]>([1, 8, 128, 256])];
+            tensor<fp16, [1, 8, 128, 256]> key_heads_77_cast_fp16 = reshape(shape = var_7686, x = key_117_cast_fp16)[name = string("key_heads_77_cast_fp16")];
+            tensor<int32, [4]> var_7688 = const()[name = string("op_7688"), val = tensor<int32, [4]>([1, 8, 128, 256])];
+            tensor<fp16, [1, 8, 128, 256]> value_heads_77_cast_fp16 = reshape(shape = var_7688, x = value_77_cast_fp16)[name = string("value_heads_77_cast_fp16")];
+            tensor<int32, [4]> var_7691_begin_0 = const()[name = string("op_7691_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_7691_end_0 = const()[name = string("op_7691_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_7691_end_mask_0 = const()[name = string("op_7691_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_7691_cast_fp16 = slice_by_index(begin = var_7691_begin_0, end = var_7691_end_0, end_mask = var_7691_end_mask_0, x = key_heads_77_cast_fp16)[name = string("op_7691_cast_fp16")];
+            tensor<int32, [4]> var_7695_begin_0 = const()[name = string("op_7695_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_7695_end_0 = const()[name = string("op_7695_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_7695_end_mask_0 = const()[name = string("op_7695_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_7695_cast_fp16 = slice_by_index(begin = var_7695_begin_0, end = var_7695_end_0, end_mask = var_7695_end_mask_0, x = value_heads_77_cast_fp16)[name = string("op_7695_cast_fp16")];
+            tensor<int32, [4]> var_7707_begin_0 = const()[name = string("op_7707_begin_0"), val = tensor<int32, [4]>([0, 1, 0, 0])];
+            tensor<int32, [4]> var_7707_end_0 = const()[name = string("op_7707_end_0"), val = tensor<int32, [4]>([1, 2, 128, 256])];
+            tensor<bool, [4]> var_7707_end_mask_0 = const()[name = string("op_7707_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_7707_cast_fp16 = slice_by_index(begin = var_7707_begin_0, end = var_7707_end_0, end_mask = var_7707_end_mask_0, x = key_heads_77_cast_fp16)[name = string("op_7707_cast_fp16")];
+            tensor<int32, [4]> var_7711_begin_0 = const()[name = string("op_7711_begin_0"), val = tensor<int32, [4]>([0, 1, 0, 0])];
+            tensor<int32, [4]> var_7711_end_0 = const()[name = string("op_7711_end_0"), val = tensor<int32, [4]>([1, 2, 128, 256])];
+            tensor<bool, [4]> var_7711_end_mask_0 = const()[name = string("op_7711_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_7711_cast_fp16 = slice_by_index(begin = var_7711_begin_0, end = var_7711_end_0, end_mask = var_7711_end_mask_0, x = value_heads_77_cast_fp16)[name = string("op_7711_cast_fp16")];
+            tensor<int32, [4]> var_7723_begin_0 = const()[name = string("op_7723_begin_0"), val = tensor<int32, [4]>([0, 2, 0, 0])];
+            tensor<int32, [4]> var_7723_end_0 = const()[name = string("op_7723_end_0"), val = tensor<int32, [4]>([1, 3, 128, 256])];
+            tensor<bool, [4]> var_7723_end_mask_0 = const()[name = string("op_7723_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_7723_cast_fp16 = slice_by_index(begin = var_7723_begin_0, end = var_7723_end_0, end_mask = var_7723_end_mask_0, x = key_heads_77_cast_fp16)[name = string("op_7723_cast_fp16")];
+            tensor<int32, [4]> var_7727_begin_0 = const()[name = string("op_7727_begin_0"), val = tensor<int32, [4]>([0, 2, 0, 0])];
+            tensor<int32, [4]> var_7727_end_0 = const()[name = string("op_7727_end_0"), val = tensor<int32, [4]>([1, 3, 128, 256])];
+            tensor<bool, [4]> var_7727_end_mask_0 = const()[name = string("op_7727_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_7727_cast_fp16 = slice_by_index(begin = var_7727_begin_0, end = var_7727_end_0, end_mask = var_7727_end_mask_0, x = value_heads_77_cast_fp16)[name = string("op_7727_cast_fp16")];
+            tensor<int32, [4]> var_7739_begin_0 = const()[name = string("op_7739_begin_0"), val = tensor<int32, [4]>([0, 3, 0, 0])];
+            tensor<int32, [4]> var_7739_end_0 = const()[name = string("op_7739_end_0"), val = tensor<int32, [4]>([1, 4, 128, 256])];
+            tensor<bool, [4]> var_7739_end_mask_0 = const()[name = string("op_7739_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_7739_cast_fp16 = slice_by_index(begin = var_7739_begin_0, end = var_7739_end_0, end_mask = var_7739_end_mask_0, x = key_heads_77_cast_fp16)[name = string("op_7739_cast_fp16")];
+            tensor<int32, [4]> var_7743_begin_0 = const()[name = string("op_7743_begin_0"), val = tensor<int32, [4]>([0, 3, 0, 0])];
+            tensor<int32, [4]> var_7743_end_0 = const()[name = string("op_7743_end_0"), val = tensor<int32, [4]>([1, 4, 128, 256])];
+            tensor<bool, [4]> var_7743_end_mask_0 = const()[name = string("op_7743_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_7743_cast_fp16 = slice_by_index(begin = var_7743_begin_0, end = var_7743_end_0, end_mask = var_7743_end_mask_0, x = value_heads_77_cast_fp16)[name = string("op_7743_cast_fp16")];
+            tensor<int32, [4]> var_7755_begin_0 = const()[name = string("op_7755_begin_0"), val = tensor<int32, [4]>([0, 4, 0, 0])];
+            tensor<int32, [4]> var_7755_end_0 = const()[name = string("op_7755_end_0"), val = tensor<int32, [4]>([1, 5, 128, 256])];
+            tensor<bool, [4]> var_7755_end_mask_0 = const()[name = string("op_7755_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_7755_cast_fp16 = slice_by_index(begin = var_7755_begin_0, end = var_7755_end_0, end_mask = var_7755_end_mask_0, x = key_heads_77_cast_fp16)[name = string("op_7755_cast_fp16")];
+            tensor<int32, [4]> var_7759_begin_0 = const()[name = string("op_7759_begin_0"), val = tensor<int32, [4]>([0, 4, 0, 0])];
+            tensor<int32, [4]> var_7759_end_0 = const()[name = string("op_7759_end_0"), val = tensor<int32, [4]>([1, 5, 128, 256])];
+            tensor<bool, [4]> var_7759_end_mask_0 = const()[name = string("op_7759_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_7759_cast_fp16 = slice_by_index(begin = var_7759_begin_0, end = var_7759_end_0, end_mask = var_7759_end_mask_0, x = value_heads_77_cast_fp16)[name = string("op_7759_cast_fp16")];
+            tensor<int32, [4]> var_7771_begin_0 = const()[name = string("op_7771_begin_0"), val = tensor<int32, [4]>([0, 5, 0, 0])];
+            tensor<int32, [4]> var_7771_end_0 = const()[name = string("op_7771_end_0"), val = tensor<int32, [4]>([1, 6, 128, 256])];
+            tensor<bool, [4]> var_7771_end_mask_0 = const()[name = string("op_7771_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_7771_cast_fp16 = slice_by_index(begin = var_7771_begin_0, end = var_7771_end_0, end_mask = var_7771_end_mask_0, x = key_heads_77_cast_fp16)[name = string("op_7771_cast_fp16")];
+            tensor<int32, [4]> var_7775_begin_0 = const()[name = string("op_7775_begin_0"), val = tensor<int32, [4]>([0, 5, 0, 0])];
+            tensor<int32, [4]> var_7775_end_0 = const()[name = string("op_7775_end_0"), val = tensor<int32, [4]>([1, 6, 128, 256])];
+            tensor<bool, [4]> var_7775_end_mask_0 = const()[name = string("op_7775_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_7775_cast_fp16 = slice_by_index(begin = var_7775_begin_0, end = var_7775_end_0, end_mask = var_7775_end_mask_0, x = value_heads_77_cast_fp16)[name = string("op_7775_cast_fp16")];
+            tensor<int32, [4]> var_7787_begin_0 = const()[name = string("op_7787_begin_0"), val = tensor<int32, [4]>([0, 6, 0, 0])];
+            tensor<int32, [4]> var_7787_end_0 = const()[name = string("op_7787_end_0"), val = tensor<int32, [4]>([1, 7, 128, 256])];
+            tensor<bool, [4]> var_7787_end_mask_0 = const()[name = string("op_7787_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_7787_cast_fp16 = slice_by_index(begin = var_7787_begin_0, end = var_7787_end_0, end_mask = var_7787_end_mask_0, x = key_heads_77_cast_fp16)[name = string("op_7787_cast_fp16")];
+            tensor<int32, [4]> var_7791_begin_0 = const()[name = string("op_7791_begin_0"), val = tensor<int32, [4]>([0, 6, 0, 0])];
+            tensor<int32, [4]> var_7791_end_0 = const()[name = string("op_7791_end_0"), val = tensor<int32, [4]>([1, 7, 128, 256])];
+            tensor<bool, [4]> var_7791_end_mask_0 = const()[name = string("op_7791_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_7791_cast_fp16 = slice_by_index(begin = var_7791_begin_0, end = var_7791_end_0, end_mask = var_7791_end_mask_0, x = value_heads_77_cast_fp16)[name = string("op_7791_cast_fp16")];
+            tensor<int32, [4]> var_7803_begin_0 = const()[name = string("op_7803_begin_0"), val = tensor<int32, [4]>([0, 7, 0, 0])];
+            tensor<int32, [4]> var_7803_end_0 = const()[name = string("op_7803_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_7803_end_mask_0 = const()[name = string("op_7803_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_7803_cast_fp16 = slice_by_index(begin = var_7803_begin_0, end = var_7803_end_0, end_mask = var_7803_end_mask_0, x = key_heads_77_cast_fp16)[name = string("op_7803_cast_fp16")];
+            tensor<int32, [4]> var_7807_begin_0 = const()[name = string("op_7807_begin_0"), val = tensor<int32, [4]>([0, 7, 0, 0])];
+            tensor<int32, [4]> var_7807_end_0 = const()[name = string("op_7807_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_7807_end_mask_0 = const()[name = string("op_7807_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_7807_cast_fp16 = slice_by_index(begin = var_7807_begin_0, end = var_7807_end_0, end_mask = var_7807_end_mask_0, x = value_heads_77_cast_fp16)[name = string("op_7807_cast_fp16")];
+            bool key_heads_79_interleave_0 = const()[name = string("key_heads_79_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 256]> key_heads_79_cast_fp16 = concat(axis = var_7533, interleave = key_heads_79_interleave_0, values = (var_7691_cast_fp16, var_7691_cast_fp16, var_7707_cast_fp16, var_7707_cast_fp16, var_7723_cast_fp16, var_7723_cast_fp16, var_7739_cast_fp16, var_7739_cast_fp16, var_7755_cast_fp16, var_7755_cast_fp16, var_7771_cast_fp16, var_7771_cast_fp16, var_7787_cast_fp16, var_7787_cast_fp16, var_7803_cast_fp16, var_7803_cast_fp16))[name = string("key_heads_79_cast_fp16")];
+            bool value_heads_79_interleave_0 = const()[name = string("value_heads_79_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 256]> value_heads_79_cast_fp16 = concat(axis = var_7533, interleave = value_heads_79_interleave_0, values = (var_7695_cast_fp16, var_7695_cast_fp16, var_7711_cast_fp16, var_7711_cast_fp16, var_7727_cast_fp16, var_7727_cast_fp16, var_7743_cast_fp16, var_7743_cast_fp16, var_7759_cast_fp16, var_7759_cast_fp16, var_7775_cast_fp16, var_7775_cast_fp16, var_7791_cast_fp16, var_7791_cast_fp16, var_7807_cast_fp16, var_7807_cast_fp16))[name = string("value_heads_79_cast_fp16")];
+            fp16 var_7830_to_fp16 = const()[name = string("op_7830_to_fp16"), val = fp16(0x1.6ap-4)];
+            tensor<fp16, [1, 16, 128, 1]> var_7831_cast_fp16 = mul(x = mh_q_117_cast_fp16, y = var_7830_to_fp16)[name = string("op_7831_cast_fp16")];
+            bool mh_w_77_transpose_x_0 = const()[name = string("mh_w_77_transpose_x_0"), val = bool(true)];
+            bool mh_w_77_transpose_y_0 = const()[name = string("mh_w_77_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 1, 256]> mh_w_77_cast_fp16 = matmul(transpose_x = mh_w_77_transpose_x_0, transpose_y = mh_w_77_transpose_y_0, x = var_7831_cast_fp16, y = key_heads_79_cast_fp16)[name = string("mh_w_77_cast_fp16")];
+            tensor<fp16, [1, 16, 1, 256]> mh_w_79_cast_fp16 = add(x = mh_w_77_cast_fp16, y = var_487_cast_fp16)[name = string("mh_w_79_cast_fp16")];
+            tensor<fp16, [1, 16, 1, 256]> var_7843_cast_fp16 = softmax(axis = var_7515, x = mh_w_79_cast_fp16)[name = string("op_7843_cast_fp16")];
+            bool attn_39_transpose_x_0 = const()[name = string("attn_39_transpose_x_0"), val = bool(false)];
+            bool attn_39_transpose_y_0 = const()[name = string("attn_39_transpose_y_0"), val = bool(true)];
+            tensor<fp16, [1, 16, 128, 1]> attn_39_cast_fp16 = matmul(transpose_x = attn_39_transpose_x_0, transpose_y = attn_39_transpose_y_0, x = value_heads_79_cast_fp16, y = var_7843_cast_fp16)[name = string("attn_39_cast_fp16")];
+            tensor<int32, [4]> var_7848 = const()[name = string("op_7848"), val = tensor<int32, [4]>([1, -1, 1, 1])];
+            tensor<fp16, [1, 2048, 1, 1]> input_153_cast_fp16 = reshape(shape = var_7848, x = attn_39_cast_fp16)[name = string("input_153_cast_fp16")];
+            string obj_163_pad_type_0 = const()[name = string("obj_163_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> obj_163_strides_0 = const()[name = string("obj_163_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> obj_163_pad_0 = const()[name = string("obj_163_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> obj_163_dilations_0 = const()[name = string("obj_163_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 obj_163_groups_0 = const()[name = string("obj_163_groups_0"), val = int32(1)];
+            tensor<fp16, [2048, 2048, 1, 1]> layers_19_self_attn_o_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [2048, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(965025536))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(969219904))))[name = string("layers_19_self_attn_o_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1, 1]> obj_163_cast_fp16 = conv(bias = layers_0_self_attn_q_proj_bias_to_fp16, dilations = obj_163_dilations_0, groups = obj_163_groups_0, pad = obj_163_pad_0, pad_type = obj_163_pad_type_0, strides = obj_163_strides_0, weight = layers_19_self_attn_o_proj_weight_to_fp16_palettized, x = input_153_cast_fp16)[name = string("obj_163_cast_fp16")];
+            tensor<fp16, [1, 2048, 1, 1]> inputs_157_cast_fp16 = add(x = inputs_151_cast_fp16, y = obj_163_cast_fp16)[name = string("inputs_157_cast_fp16")];
+            tensor<fp16, [1, 2048, 1, 1]> inputs_sq_159_cast_fp16 = mul(x = inputs_157_cast_fp16, y = inputs_157_cast_fp16)[name = string("inputs_sq_159_cast_fp16")];
+            tensor<int32, [1]> variance_159_axes_0 = const()[name = string("variance_159_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_159_keep_dims_0 = const()[name = string("variance_159_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 1, 1, 1]> variance_159_cast_fp16 = reduce_mean(axes = variance_159_axes_0, keep_dims = variance_159_keep_dims_0, x = inputs_sq_159_cast_fp16)[name = string("variance_159_cast_fp16")];
+            fp16 var_7866_to_fp16 = const()[name = string("op_7866_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 1, 1]> var_7867_cast_fp16 = add(x = variance_159_cast_fp16, y = var_7866_to_fp16)[name = string("op_7867_cast_fp16")];
+            fp32 var_7868_epsilon_0 = const()[name = string("op_7868_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 1, 1, 1]> var_7868_cast_fp16 = rsqrt(epsilon = var_7868_epsilon_0, x = var_7867_cast_fp16)[name = string("op_7868_cast_fp16")];
+            tensor<fp16, [1, 2048, 1, 1]> hidden_states_197_cast_fp16 = mul(x = inputs_157_cast_fp16, y = var_7868_cast_fp16)[name = string("hidden_states_197_cast_fp16")];
+            tensor<fp16, [1, 2048, 1, 1]> w_159_to_fp16 = const()[name = string("w_159_to_fp16"), val = tensor<fp16, [1, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(969220480)))];
+            tensor<fp16, [1, 2048, 1, 1]> input_155_cast_fp16 = mul(x = w_159_to_fp16, y = hidden_states_197_cast_fp16)[name = string("input_155_cast_fp16")];
+            string input_157_pad_type_0 = const()[name = string("input_157_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> input_157_strides_0 = const()[name = string("input_157_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> input_157_pad_0 = const()[name = string("input_157_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> input_157_dilations_0 = const()[name = string("input_157_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 input_157_groups_0 = const()[name = string("input_157_groups_0"), val = int32(1)];
+            tensor<fp16, [6144, 2048, 1, 1]> layers_19_mlp_gate_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [6144, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(969224640))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(981807616))))[name = string("layers_19_mlp_gate_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 6144, 1, 1]> input_157_cast_fp16 = conv(dilations = input_157_dilations_0, groups = input_157_groups_0, pad = input_157_pad_0, pad_type = input_157_pad_type_0, strides = input_157_strides_0, weight = layers_19_mlp_gate_proj_weight_to_fp16_palettized, x = input_155_cast_fp16)[name = string("input_157_cast_fp16")];
+            tensor<fp16, [1, 6144, 1, 1]> var_7882_cast_fp16 = silu(x = input_157_cast_fp16)[name = string("op_7882_cast_fp16")];
+            string var_7888_pad_type_0 = const()[name = string("op_7888_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_7888_strides_0 = const()[name = string("op_7888_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_7888_pad_0 = const()[name = string("op_7888_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_7888_dilations_0 = const()[name = string("op_7888_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_7888_groups_0 = const()[name = string("op_7888_groups_0"), val = int32(1)];
+            tensor<fp16, [6144, 2048, 1, 1]> layers_19_mlp_up_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [6144, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(981808192))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(994391168))))[name = string("layers_19_mlp_up_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 6144, 1, 1]> var_7888_cast_fp16 = conv(dilations = var_7888_dilations_0, groups = var_7888_groups_0, pad = var_7888_pad_0, pad_type = var_7888_pad_type_0, strides = var_7888_strides_0, weight = layers_19_mlp_up_proj_weight_to_fp16_palettized, x = input_155_cast_fp16)[name = string("op_7888_cast_fp16")];
+            tensor<fp16, [1, 6144, 1, 1]> input_159_cast_fp16 = mul(x = var_7882_cast_fp16, y = var_7888_cast_fp16)[name = string("input_159_cast_fp16")];
+            string hidden_states_199_pad_type_0 = const()[name = string("hidden_states_199_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> hidden_states_199_strides_0 = const()[name = string("hidden_states_199_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> hidden_states_199_pad_0 = const()[name = string("hidden_states_199_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> hidden_states_199_dilations_0 = const()[name = string("hidden_states_199_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 hidden_states_199_groups_0 = const()[name = string("hidden_states_199_groups_0"), val = int32(1)];
+            tensor<fp16, [2048, 6144, 1, 1]> layers_19_mlp_down_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [2048, 6144, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(994391744))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(1006974720))))[name = string("layers_19_mlp_down_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1, 1]> hidden_states_199_cast_fp16 = conv(dilations = hidden_states_199_dilations_0, groups = hidden_states_199_groups_0, pad = hidden_states_199_pad_0, pad_type = hidden_states_199_pad_type_0, strides = hidden_states_199_strides_0, weight = layers_19_mlp_down_proj_weight_to_fp16_palettized, x = input_159_cast_fp16)[name = string("hidden_states_199_cast_fp16")];
+            tensor<fp16, [1, 2048, 1, 1]> inputs_159_cast_fp16 = add(x = inputs_157_cast_fp16, y = hidden_states_199_cast_fp16)[name = string("inputs_159_cast_fp16")];
+            int32 var_7902 = const()[name = string("op_7902"), val = int32(3)];
+            int32 var_7912 = const()[name = string("op_7912"), val = int32(-2)];
+            int32 var_7920 = const()[name = string("op_7920"), val = int32(1)];
+            tensor<fp16, [1, 2048, 1, 1]> inputs_sq_161_cast_fp16 = mul(x = inputs_159_cast_fp16, y = inputs_159_cast_fp16)[name = string("inputs_sq_161_cast_fp16")];
+            tensor<int32, [1]> variance_161_axes_0 = const()[name = string("variance_161_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_161_keep_dims_0 = const()[name = string("variance_161_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 1, 1, 1]> variance_161_cast_fp16 = reduce_mean(axes = variance_161_axes_0, keep_dims = variance_161_keep_dims_0, x = inputs_sq_161_cast_fp16)[name = string("variance_161_cast_fp16")];
+            fp16 var_7932_to_fp16 = const()[name = string("op_7932_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 1, 1]> var_7933_cast_fp16 = add(x = variance_161_cast_fp16, y = var_7932_to_fp16)[name = string("op_7933_cast_fp16")];
+            fp32 var_7934_epsilon_0 = const()[name = string("op_7934_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 1, 1, 1]> var_7934_cast_fp16 = rsqrt(epsilon = var_7934_epsilon_0, x = var_7933_cast_fp16)[name = string("op_7934_cast_fp16")];
+            tensor<fp16, [1, 2048, 1, 1]> hidden_states_201_cast_fp16 = mul(x = inputs_159_cast_fp16, y = var_7934_cast_fp16)[name = string("hidden_states_201_cast_fp16")];
+            tensor<fp16, [1, 2048, 1, 1]> w_161_to_fp16 = const()[name = string("w_161_to_fp16"), val = tensor<fp16, [1, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(1006975296)))];
+            tensor<fp16, [1, 2048, 1, 1]> obj_165_cast_fp16 = mul(x = w_161_to_fp16, y = hidden_states_201_cast_fp16)[name = string("obj_165_cast_fp16")];
+            string query_121_pad_type_0 = const()[name = string("query_121_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> query_121_strides_0 = const()[name = string("query_121_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> query_121_pad_0 = const()[name = string("query_121_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> query_121_dilations_0 = const()[name = string("query_121_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 query_121_groups_0 = const()[name = string("query_121_groups_0"), val = int32(1)];
+            tensor<fp16, [2048, 2048, 1, 1]> layers_20_self_attn_q_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [2048, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(1006979456))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(1011173824))))[name = string("layers_20_self_attn_q_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1, 1]> query_121_cast_fp16 = conv(bias = layers_0_self_attn_q_proj_bias_to_fp16, dilations = query_121_dilations_0, groups = query_121_groups_0, pad = query_121_pad_0, pad_type = query_121_pad_type_0, strides = query_121_strides_0, weight = layers_20_self_attn_q_proj_weight_to_fp16_palettized, x = obj_165_cast_fp16)[name = string("query_121_cast_fp16")];
+            string current_key_81_pad_type_0 = const()[name = string("current_key_81_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> current_key_81_strides_0 = const()[name = string("current_key_81_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> current_key_81_pad_0 = const()[name = string("current_key_81_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> current_key_81_dilations_0 = const()[name = string("current_key_81_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 current_key_81_groups_0 = const()[name = string("current_key_81_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 2048, 1, 1]> layers_20_self_attn_k_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(1011174400))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(1013271616))))[name = string("layers_20_self_attn_k_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> current_key_81_cast_fp16 = conv(dilations = current_key_81_dilations_0, groups = current_key_81_groups_0, pad = current_key_81_pad_0, pad_type = current_key_81_pad_type_0, strides = current_key_81_strides_0, weight = layers_20_self_attn_k_proj_weight_to_fp16_palettized, x = obj_165_cast_fp16)[name = string("current_key_81_cast_fp16")];
+            string current_value_41_pad_type_0 = const()[name = string("current_value_41_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> current_value_41_strides_0 = const()[name = string("current_value_41_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> current_value_41_pad_0 = const()[name = string("current_value_41_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> current_value_41_dilations_0 = const()[name = string("current_value_41_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 current_value_41_groups_0 = const()[name = string("current_value_41_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 2048, 1, 1]> layers_20_self_attn_v_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(1013272192))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(1015369408))))[name = string("layers_20_self_attn_v_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> current_value_41_cast_fp16 = conv(bias = layers_0_self_attn_v_proj_bias_to_fp16, dilations = current_value_41_dilations_0, groups = current_value_41_groups_0, pad = current_value_41_pad_0, pad_type = current_value_41_pad_type_0, strides = current_value_41_strides_0, weight = layers_20_self_attn_v_proj_weight_to_fp16_palettized, x = obj_165_cast_fp16)[name = string("current_value_41_cast_fp16")];
+            tensor<int32, [4]> var_7971 = const()[name = string("op_7971"), val = tensor<int32, [4]>([16, 128, 1, 1])];
+            tensor<fp16, [16, 128, 1, 1]> inputs_161_cast_fp16 = reshape(shape = var_7971, x = query_121_cast_fp16)[name = string("inputs_161_cast_fp16")];
+            tensor<fp16, [16, 128, 1, 1]> inputs_sq_163_cast_fp16 = mul(x = inputs_161_cast_fp16, y = inputs_161_cast_fp16)[name = string("inputs_sq_163_cast_fp16")];
+            tensor<int32, [1]> variance_163_axes_0 = const()[name = string("variance_163_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_163_keep_dims_0 = const()[name = string("variance_163_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [16, 1, 1, 1]> variance_163_cast_fp16 = reduce_mean(axes = variance_163_axes_0, keep_dims = variance_163_keep_dims_0, x = inputs_sq_163_cast_fp16)[name = string("variance_163_cast_fp16")];
+            fp16 var_7977_to_fp16 = const()[name = string("op_7977_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [16, 1, 1, 1]> var_7978_cast_fp16 = add(x = variance_163_cast_fp16, y = var_7977_to_fp16)[name = string("op_7978_cast_fp16")];
+            fp32 var_7979_epsilon_0 = const()[name = string("op_7979_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [16, 1, 1, 1]> var_7979_cast_fp16 = rsqrt(epsilon = var_7979_epsilon_0, x = var_7978_cast_fp16)[name = string("op_7979_cast_fp16")];
+            tensor<fp16, [16, 128, 1, 1]> hidden_states_203_cast_fp16 = mul(x = inputs_161_cast_fp16, y = var_7979_cast_fp16)[name = string("hidden_states_203_cast_fp16")];
+            tensor<fp16, [1, 128, 1, 1]> w_163_to_fp16 = const()[name = string("w_163_to_fp16"), val = tensor<fp16, [1, 128, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(1015369984)))];
+            tensor<fp16, [16, 128, 1, 1]> query_normed_41_cast_fp16 = mul(x = w_163_to_fp16, y = hidden_states_203_cast_fp16)[name = string("query_normed_41_cast_fp16")];
+            tensor<int32, [4]> var_7987 = const()[name = string("op_7987"), val = tensor<int32, [4]>([8, 128, 1, 1])];
+            tensor<fp16, [8, 128, 1, 1]> inputs_163_cast_fp16 = reshape(shape = var_7987, x = current_key_81_cast_fp16)[name = string("inputs_163_cast_fp16")];
+            tensor<fp16, [8, 128, 1, 1]> inputs_sq_165_cast_fp16 = mul(x = inputs_163_cast_fp16, y = inputs_163_cast_fp16)[name = string("inputs_sq_165_cast_fp16")];
+            tensor<int32, [1]> variance_165_axes_0 = const()[name = string("variance_165_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_165_keep_dims_0 = const()[name = string("variance_165_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [8, 1, 1, 1]> variance_165_cast_fp16 = reduce_mean(axes = variance_165_axes_0, keep_dims = variance_165_keep_dims_0, x = inputs_sq_165_cast_fp16)[name = string("variance_165_cast_fp16")];
+            fp16 var_7993_to_fp16 = const()[name = string("op_7993_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [8, 1, 1, 1]> var_7994_cast_fp16 = add(x = variance_165_cast_fp16, y = var_7993_to_fp16)[name = string("op_7994_cast_fp16")];
+            fp32 var_7995_epsilon_0 = const()[name = string("op_7995_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [8, 1, 1, 1]> var_7995_cast_fp16 = rsqrt(epsilon = var_7995_epsilon_0, x = var_7994_cast_fp16)[name = string("op_7995_cast_fp16")];
+            tensor<fp16, [8, 128, 1, 1]> hidden_states_205_cast_fp16 = mul(x = inputs_163_cast_fp16, y = var_7995_cast_fp16)[name = string("hidden_states_205_cast_fp16")];
+            tensor<fp16, [1, 128, 1, 1]> w_165_to_fp16 = const()[name = string("w_165_to_fp16"), val = tensor<fp16, [1, 128, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(1015370304)))];
+            tensor<fp16, [8, 128, 1, 1]> current_key_normed_41_cast_fp16 = mul(x = w_165_to_fp16, y = hidden_states_205_cast_fp16)[name = string("current_key_normed_41_cast_fp16")];
+            tensor<int32, [4]> var_8013 = const()[name = string("op_8013"), val = tensor<int32, [4]>([1, 16, 128, -1])];
+            tensor<fp16, [1, 16, 128, 1]> mh_q_121_cast_fp16 = reshape(shape = var_8013, x = query_normed_41_cast_fp16)[name = string("mh_q_121_cast_fp16")];
+            tensor<int32, [4]> var_8015 = const()[name = string("op_8015"), val = tensor<int32, [4]>([1, 8, 128, -1])];
+            tensor<fp16, [1, 8, 128, 1]> mh_k_81_cast_fp16 = reshape(shape = var_8015, x = current_key_normed_41_cast_fp16)[name = string("mh_k_81_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> var_8019_cast_fp16 = mul(x = mh_q_121_cast_fp16, y = cos_1_cast_fp16)[name = string("op_8019_cast_fp16")];
+            tensor<int32, [4]> var_8024_begin_0 = const()[name = string("op_8024_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_8024_end_0 = const()[name = string("op_8024_end_0"), val = tensor<int32, [4]>([1, 16, 64, 1])];
+            tensor<bool, [4]> var_8024_end_mask_0 = const()[name = string("op_8024_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 16, 64, 1]> var_8024_cast_fp16 = slice_by_index(begin = var_8024_begin_0, end = var_8024_end_0, end_mask = var_8024_end_mask_0, x = mh_q_121_cast_fp16)[name = string("op_8024_cast_fp16")];
+            tensor<int32, [4]> var_8030_begin_0 = const()[name = string("op_8030_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_8030_end_0 = const()[name = string("op_8030_end_0"), val = tensor<int32, [4]>([1, 16, 128, 1])];
+            tensor<bool, [4]> var_8030_end_mask_0 = const()[name = string("op_8030_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 16, 64, 1]> var_8030_cast_fp16 = slice_by_index(begin = var_8030_begin_0, end = var_8030_end_0, end_mask = var_8030_end_mask_0, x = mh_q_121_cast_fp16)[name = string("op_8030_cast_fp16")];
+            fp16 const_477_promoted_to_fp16 = const()[name = string("const_477_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 16, 64, 1]> var_8032_cast_fp16 = mul(x = var_8030_cast_fp16, y = const_477_promoted_to_fp16)[name = string("op_8032_cast_fp16")];
+            bool var_8034_interleave_0 = const()[name = string("op_8034_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 1]> var_8034_cast_fp16 = concat(axis = var_7912, interleave = var_8034_interleave_0, values = (var_8032_cast_fp16, var_8024_cast_fp16))[name = string("op_8034_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> var_8035_cast_fp16 = mul(x = var_8034_cast_fp16, y = sin_1_cast_fp16)[name = string("op_8035_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> mh_q_123_cast_fp16 = add(x = var_8019_cast_fp16, y = var_8035_cast_fp16)[name = string("mh_q_123_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> var_8037_cast_fp16 = mul(x = mh_k_81_cast_fp16, y = cos_1_cast_fp16)[name = string("op_8037_cast_fp16")];
+            tensor<int32, [4]> var_8042_begin_0 = const()[name = string("op_8042_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_8042_end_0 = const()[name = string("op_8042_end_0"), val = tensor<int32, [4]>([1, 8, 64, 1])];
+            tensor<bool, [4]> var_8042_end_mask_0 = const()[name = string("op_8042_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 8, 64, 1]> var_8042_cast_fp16 = slice_by_index(begin = var_8042_begin_0, end = var_8042_end_0, end_mask = var_8042_end_mask_0, x = mh_k_81_cast_fp16)[name = string("op_8042_cast_fp16")];
+            tensor<int32, [4]> var_8048_begin_0 = const()[name = string("op_8048_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_8048_end_0 = const()[name = string("op_8048_end_0"), val = tensor<int32, [4]>([1, 8, 128, 1])];
+            tensor<bool, [4]> var_8048_end_mask_0 = const()[name = string("op_8048_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 8, 64, 1]> var_8048_cast_fp16 = slice_by_index(begin = var_8048_begin_0, end = var_8048_end_0, end_mask = var_8048_end_mask_0, x = mh_k_81_cast_fp16)[name = string("op_8048_cast_fp16")];
+            fp16 const_480_promoted_to_fp16 = const()[name = string("const_480_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 64, 1]> var_8050_cast_fp16 = mul(x = var_8048_cast_fp16, y = const_480_promoted_to_fp16)[name = string("op_8050_cast_fp16")];
+            bool var_8052_interleave_0 = const()[name = string("op_8052_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 128, 1]> var_8052_cast_fp16 = concat(axis = var_7912, interleave = var_8052_interleave_0, values = (var_8050_cast_fp16, var_8042_cast_fp16))[name = string("op_8052_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> var_8053_cast_fp16 = mul(x = var_8052_cast_fp16, y = sin_1_cast_fp16)[name = string("op_8053_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> mh_k_83_cast_fp16 = add(x = var_8037_cast_fp16, y = var_8053_cast_fp16)[name = string("mh_k_83_cast_fp16")];
+            tensor<int32, [4]> var_8057 = const()[name = string("op_8057"), val = tensor<int32, [4]>([1, 1024, 1, 1])];
+            tensor<fp16, [1, 1024, 1, 1]> current_key_83_cast_fp16 = reshape(shape = var_8057, x = mh_k_83_cast_fp16)[name = string("current_key_83_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_8064_cast_fp16 = mul(x = var_101_cast_fp16_20, y = var_323_cast_fp16)[name = string("op_8064_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_8065_cast_fp16 = mul(x = current_key_83_cast_fp16, y = var_321_cast_fp16)[name = string("op_8065_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> key_123_cast_fp16 = add(x = var_8064_cast_fp16, y = var_8065_cast_fp16)[name = string("key_123_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_8068_cast_fp16 = mul(x = var_132_cast_fp16_20, y = var_323_cast_fp16)[name = string("op_8068_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_8069_cast_fp16 = mul(x = current_value_41_cast_fp16, y = var_321_cast_fp16)[name = string("op_8069_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> value_81_cast_fp16 = add(x = var_8068_cast_fp16, y = var_8069_cast_fp16)[name = string("value_81_cast_fp16")];
+            tensor<int32, [4]> var_8073 = const()[name = string("op_8073"), val = tensor<int32, [4]>([1, 8, 128, 256])];
+            tensor<fp16, [1, 8, 128, 256]> key_heads_81_cast_fp16 = reshape(shape = var_8073, x = key_123_cast_fp16)[name = string("key_heads_81_cast_fp16")];
+            tensor<int32, [4]> var_8075 = const()[name = string("op_8075"), val = tensor<int32, [4]>([1, 8, 128, 256])];
+            tensor<fp16, [1, 8, 128, 256]> value_heads_81_cast_fp16 = reshape(shape = var_8075, x = value_81_cast_fp16)[name = string("value_heads_81_cast_fp16")];
+            tensor<int32, [4]> var_8078_begin_0 = const()[name = string("op_8078_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_8078_end_0 = const()[name = string("op_8078_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_8078_end_mask_0 = const()[name = string("op_8078_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_8078_cast_fp16 = slice_by_index(begin = var_8078_begin_0, end = var_8078_end_0, end_mask = var_8078_end_mask_0, x = key_heads_81_cast_fp16)[name = string("op_8078_cast_fp16")];
+            tensor<int32, [4]> var_8082_begin_0 = const()[name = string("op_8082_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_8082_end_0 = const()[name = string("op_8082_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_8082_end_mask_0 = const()[name = string("op_8082_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_8082_cast_fp16 = slice_by_index(begin = var_8082_begin_0, end = var_8082_end_0, end_mask = var_8082_end_mask_0, x = value_heads_81_cast_fp16)[name = string("op_8082_cast_fp16")];
+            tensor<int32, [4]> var_8094_begin_0 = const()[name = string("op_8094_begin_0"), val = tensor<int32, [4]>([0, 1, 0, 0])];
+            tensor<int32, [4]> var_8094_end_0 = const()[name = string("op_8094_end_0"), val = tensor<int32, [4]>([1, 2, 128, 256])];
+            tensor<bool, [4]> var_8094_end_mask_0 = const()[name = string("op_8094_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_8094_cast_fp16 = slice_by_index(begin = var_8094_begin_0, end = var_8094_end_0, end_mask = var_8094_end_mask_0, x = key_heads_81_cast_fp16)[name = string("op_8094_cast_fp16")];
+            tensor<int32, [4]> var_8098_begin_0 = const()[name = string("op_8098_begin_0"), val = tensor<int32, [4]>([0, 1, 0, 0])];
+            tensor<int32, [4]> var_8098_end_0 = const()[name = string("op_8098_end_0"), val = tensor<int32, [4]>([1, 2, 128, 256])];
+            tensor<bool, [4]> var_8098_end_mask_0 = const()[name = string("op_8098_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_8098_cast_fp16 = slice_by_index(begin = var_8098_begin_0, end = var_8098_end_0, end_mask = var_8098_end_mask_0, x = value_heads_81_cast_fp16)[name = string("op_8098_cast_fp16")];
+            tensor<int32, [4]> var_8110_begin_0 = const()[name = string("op_8110_begin_0"), val = tensor<int32, [4]>([0, 2, 0, 0])];
+            tensor<int32, [4]> var_8110_end_0 = const()[name = string("op_8110_end_0"), val = tensor<int32, [4]>([1, 3, 128, 256])];
+            tensor<bool, [4]> var_8110_end_mask_0 = const()[name = string("op_8110_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_8110_cast_fp16 = slice_by_index(begin = var_8110_begin_0, end = var_8110_end_0, end_mask = var_8110_end_mask_0, x = key_heads_81_cast_fp16)[name = string("op_8110_cast_fp16")];
+            tensor<int32, [4]> var_8114_begin_0 = const()[name = string("op_8114_begin_0"), val = tensor<int32, [4]>([0, 2, 0, 0])];
+            tensor<int32, [4]> var_8114_end_0 = const()[name = string("op_8114_end_0"), val = tensor<int32, [4]>([1, 3, 128, 256])];
+            tensor<bool, [4]> var_8114_end_mask_0 = const()[name = string("op_8114_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_8114_cast_fp16 = slice_by_index(begin = var_8114_begin_0, end = var_8114_end_0, end_mask = var_8114_end_mask_0, x = value_heads_81_cast_fp16)[name = string("op_8114_cast_fp16")];
+            tensor<int32, [4]> var_8126_begin_0 = const()[name = string("op_8126_begin_0"), val = tensor<int32, [4]>([0, 3, 0, 0])];
+            tensor<int32, [4]> var_8126_end_0 = const()[name = string("op_8126_end_0"), val = tensor<int32, [4]>([1, 4, 128, 256])];
+            tensor<bool, [4]> var_8126_end_mask_0 = const()[name = string("op_8126_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_8126_cast_fp16 = slice_by_index(begin = var_8126_begin_0, end = var_8126_end_0, end_mask = var_8126_end_mask_0, x = key_heads_81_cast_fp16)[name = string("op_8126_cast_fp16")];
+            tensor<int32, [4]> var_8130_begin_0 = const()[name = string("op_8130_begin_0"), val = tensor<int32, [4]>([0, 3, 0, 0])];
+            tensor<int32, [4]> var_8130_end_0 = const()[name = string("op_8130_end_0"), val = tensor<int32, [4]>([1, 4, 128, 256])];
+            tensor<bool, [4]> var_8130_end_mask_0 = const()[name = string("op_8130_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_8130_cast_fp16 = slice_by_index(begin = var_8130_begin_0, end = var_8130_end_0, end_mask = var_8130_end_mask_0, x = value_heads_81_cast_fp16)[name = string("op_8130_cast_fp16")];
+            tensor<int32, [4]> var_8142_begin_0 = const()[name = string("op_8142_begin_0"), val = tensor<int32, [4]>([0, 4, 0, 0])];
+            tensor<int32, [4]> var_8142_end_0 = const()[name = string("op_8142_end_0"), val = tensor<int32, [4]>([1, 5, 128, 256])];
+            tensor<bool, [4]> var_8142_end_mask_0 = const()[name = string("op_8142_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_8142_cast_fp16 = slice_by_index(begin = var_8142_begin_0, end = var_8142_end_0, end_mask = var_8142_end_mask_0, x = key_heads_81_cast_fp16)[name = string("op_8142_cast_fp16")];
+            tensor<int32, [4]> var_8146_begin_0 = const()[name = string("op_8146_begin_0"), val = tensor<int32, [4]>([0, 4, 0, 0])];
+            tensor<int32, [4]> var_8146_end_0 = const()[name = string("op_8146_end_0"), val = tensor<int32, [4]>([1, 5, 128, 256])];
+            tensor<bool, [4]> var_8146_end_mask_0 = const()[name = string("op_8146_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_8146_cast_fp16 = slice_by_index(begin = var_8146_begin_0, end = var_8146_end_0, end_mask = var_8146_end_mask_0, x = value_heads_81_cast_fp16)[name = string("op_8146_cast_fp16")];
+            tensor<int32, [4]> var_8158_begin_0 = const()[name = string("op_8158_begin_0"), val = tensor<int32, [4]>([0, 5, 0, 0])];
+            tensor<int32, [4]> var_8158_end_0 = const()[name = string("op_8158_end_0"), val = tensor<int32, [4]>([1, 6, 128, 256])];
+            tensor<bool, [4]> var_8158_end_mask_0 = const()[name = string("op_8158_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_8158_cast_fp16 = slice_by_index(begin = var_8158_begin_0, end = var_8158_end_0, end_mask = var_8158_end_mask_0, x = key_heads_81_cast_fp16)[name = string("op_8158_cast_fp16")];
+            tensor<int32, [4]> var_8162_begin_0 = const()[name = string("op_8162_begin_0"), val = tensor<int32, [4]>([0, 5, 0, 0])];
+            tensor<int32, [4]> var_8162_end_0 = const()[name = string("op_8162_end_0"), val = tensor<int32, [4]>([1, 6, 128, 256])];
+            tensor<bool, [4]> var_8162_end_mask_0 = const()[name = string("op_8162_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_8162_cast_fp16 = slice_by_index(begin = var_8162_begin_0, end = var_8162_end_0, end_mask = var_8162_end_mask_0, x = value_heads_81_cast_fp16)[name = string("op_8162_cast_fp16")];
+            tensor<int32, [4]> var_8174_begin_0 = const()[name = string("op_8174_begin_0"), val = tensor<int32, [4]>([0, 6, 0, 0])];
+            tensor<int32, [4]> var_8174_end_0 = const()[name = string("op_8174_end_0"), val = tensor<int32, [4]>([1, 7, 128, 256])];
+            tensor<bool, [4]> var_8174_end_mask_0 = const()[name = string("op_8174_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_8174_cast_fp16 = slice_by_index(begin = var_8174_begin_0, end = var_8174_end_0, end_mask = var_8174_end_mask_0, x = key_heads_81_cast_fp16)[name = string("op_8174_cast_fp16")];
+            tensor<int32, [4]> var_8178_begin_0 = const()[name = string("op_8178_begin_0"), val = tensor<int32, [4]>([0, 6, 0, 0])];
+            tensor<int32, [4]> var_8178_end_0 = const()[name = string("op_8178_end_0"), val = tensor<int32, [4]>([1, 7, 128, 256])];
+            tensor<bool, [4]> var_8178_end_mask_0 = const()[name = string("op_8178_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_8178_cast_fp16 = slice_by_index(begin = var_8178_begin_0, end = var_8178_end_0, end_mask = var_8178_end_mask_0, x = value_heads_81_cast_fp16)[name = string("op_8178_cast_fp16")];
+            tensor<int32, [4]> var_8190_begin_0 = const()[name = string("op_8190_begin_0"), val = tensor<int32, [4]>([0, 7, 0, 0])];
+            tensor<int32, [4]> var_8190_end_0 = const()[name = string("op_8190_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_8190_end_mask_0 = const()[name = string("op_8190_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_8190_cast_fp16 = slice_by_index(begin = var_8190_begin_0, end = var_8190_end_0, end_mask = var_8190_end_mask_0, x = key_heads_81_cast_fp16)[name = string("op_8190_cast_fp16")];
+            tensor<int32, [4]> var_8194_begin_0 = const()[name = string("op_8194_begin_0"), val = tensor<int32, [4]>([0, 7, 0, 0])];
+            tensor<int32, [4]> var_8194_end_0 = const()[name = string("op_8194_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_8194_end_mask_0 = const()[name = string("op_8194_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_8194_cast_fp16 = slice_by_index(begin = var_8194_begin_0, end = var_8194_end_0, end_mask = var_8194_end_mask_0, x = value_heads_81_cast_fp16)[name = string("op_8194_cast_fp16")];
+            bool key_heads_83_interleave_0 = const()[name = string("key_heads_83_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 256]> key_heads_83_cast_fp16 = concat(axis = var_7920, interleave = key_heads_83_interleave_0, values = (var_8078_cast_fp16, var_8078_cast_fp16, var_8094_cast_fp16, var_8094_cast_fp16, var_8110_cast_fp16, var_8110_cast_fp16, var_8126_cast_fp16, var_8126_cast_fp16, var_8142_cast_fp16, var_8142_cast_fp16, var_8158_cast_fp16, var_8158_cast_fp16, var_8174_cast_fp16, var_8174_cast_fp16, var_8190_cast_fp16, var_8190_cast_fp16))[name = string("key_heads_83_cast_fp16")];
+            bool value_heads_83_interleave_0 = const()[name = string("value_heads_83_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 256]> value_heads_83_cast_fp16 = concat(axis = var_7920, interleave = value_heads_83_interleave_0, values = (var_8082_cast_fp16, var_8082_cast_fp16, var_8098_cast_fp16, var_8098_cast_fp16, var_8114_cast_fp16, var_8114_cast_fp16, var_8130_cast_fp16, var_8130_cast_fp16, var_8146_cast_fp16, var_8146_cast_fp16, var_8162_cast_fp16, var_8162_cast_fp16, var_8178_cast_fp16, var_8178_cast_fp16, var_8194_cast_fp16, var_8194_cast_fp16))[name = string("value_heads_83_cast_fp16")];
+            fp16 var_8217_to_fp16 = const()[name = string("op_8217_to_fp16"), val = fp16(0x1.6ap-4)];
+            tensor<fp16, [1, 16, 128, 1]> var_8218_cast_fp16 = mul(x = mh_q_123_cast_fp16, y = var_8217_to_fp16)[name = string("op_8218_cast_fp16")];
+            bool mh_w_81_transpose_x_0 = const()[name = string("mh_w_81_transpose_x_0"), val = bool(true)];
+            bool mh_w_81_transpose_y_0 = const()[name = string("mh_w_81_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 1, 256]> mh_w_81_cast_fp16 = matmul(transpose_x = mh_w_81_transpose_x_0, transpose_y = mh_w_81_transpose_y_0, x = var_8218_cast_fp16, y = key_heads_83_cast_fp16)[name = string("mh_w_81_cast_fp16")];
+            tensor<fp16, [1, 16, 1, 256]> mh_w_83_cast_fp16 = add(x = mh_w_81_cast_fp16, y = var_487_cast_fp16)[name = string("mh_w_83_cast_fp16")];
+            tensor<fp16, [1, 16, 1, 256]> var_8230_cast_fp16 = softmax(axis = var_7902, x = mh_w_83_cast_fp16)[name = string("op_8230_cast_fp16")];
+            bool attn_41_transpose_x_0 = const()[name = string("attn_41_transpose_x_0"), val = bool(false)];
+            bool attn_41_transpose_y_0 = const()[name = string("attn_41_transpose_y_0"), val = bool(true)];
+            tensor<fp16, [1, 16, 128, 1]> attn_41_cast_fp16 = matmul(transpose_x = attn_41_transpose_x_0, transpose_y = attn_41_transpose_y_0, x = value_heads_83_cast_fp16, y = var_8230_cast_fp16)[name = string("attn_41_cast_fp16")];
+            tensor<int32, [4]> var_8235 = const()[name = string("op_8235"), val = tensor<int32, [4]>([1, -1, 1, 1])];
+            tensor<fp16, [1, 2048, 1, 1]> input_161_cast_fp16 = reshape(shape = var_8235, x = attn_41_cast_fp16)[name = string("input_161_cast_fp16")];
+            string obj_171_pad_type_0 = const()[name = string("obj_171_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> obj_171_strides_0 = const()[name = string("obj_171_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> obj_171_pad_0 = const()[name = string("obj_171_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> obj_171_dilations_0 = const()[name = string("obj_171_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 obj_171_groups_0 = const()[name = string("obj_171_groups_0"), val = int32(1)];
+            tensor<fp16, [2048, 2048, 1, 1]> layers_20_self_attn_o_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [2048, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(1015370624))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(1019564992))))[name = string("layers_20_self_attn_o_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1, 1]> obj_171_cast_fp16 = conv(bias = layers_0_self_attn_q_proj_bias_to_fp16, dilations = obj_171_dilations_0, groups = obj_171_groups_0, pad = obj_171_pad_0, pad_type = obj_171_pad_type_0, strides = obj_171_strides_0, weight = layers_20_self_attn_o_proj_weight_to_fp16_palettized, x = input_161_cast_fp16)[name = string("obj_171_cast_fp16")];
+            tensor<fp16, [1, 2048, 1, 1]> inputs_165_cast_fp16 = add(x = inputs_159_cast_fp16, y = obj_171_cast_fp16)[name = string("inputs_165_cast_fp16")];
+            tensor<fp16, [1, 2048, 1, 1]> inputs_sq_167_cast_fp16 = mul(x = inputs_165_cast_fp16, y = inputs_165_cast_fp16)[name = string("inputs_sq_167_cast_fp16")];
+            tensor<int32, [1]> variance_167_axes_0 = const()[name = string("variance_167_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_167_keep_dims_0 = const()[name = string("variance_167_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 1, 1, 1]> variance_167_cast_fp16 = reduce_mean(axes = variance_167_axes_0, keep_dims = variance_167_keep_dims_0, x = inputs_sq_167_cast_fp16)[name = string("variance_167_cast_fp16")];
+            fp16 var_8253_to_fp16 = const()[name = string("op_8253_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 1, 1]> var_8254_cast_fp16 = add(x = variance_167_cast_fp16, y = var_8253_to_fp16)[name = string("op_8254_cast_fp16")];
+            fp32 var_8255_epsilon_0 = const()[name = string("op_8255_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 1, 1, 1]> var_8255_cast_fp16 = rsqrt(epsilon = var_8255_epsilon_0, x = var_8254_cast_fp16)[name = string("op_8255_cast_fp16")];
+            tensor<fp16, [1, 2048, 1, 1]> hidden_states_207_cast_fp16 = mul(x = inputs_165_cast_fp16, y = var_8255_cast_fp16)[name = string("hidden_states_207_cast_fp16")];
+            tensor<fp16, [1, 2048, 1, 1]> w_167_to_fp16 = const()[name = string("w_167_to_fp16"), val = tensor<fp16, [1, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(1019565568)))];
+            tensor<fp16, [1, 2048, 1, 1]> input_163_cast_fp16 = mul(x = w_167_to_fp16, y = hidden_states_207_cast_fp16)[name = string("input_163_cast_fp16")];
+            string input_165_pad_type_0 = const()[name = string("input_165_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> input_165_strides_0 = const()[name = string("input_165_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> input_165_pad_0 = const()[name = string("input_165_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> input_165_dilations_0 = const()[name = string("input_165_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 input_165_groups_0 = const()[name = string("input_165_groups_0"), val = int32(1)];
+            tensor<fp16, [6144, 2048, 1, 1]> layers_20_mlp_gate_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [6144, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(1019569728))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(1032152704))))[name = string("layers_20_mlp_gate_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 6144, 1, 1]> input_165_cast_fp16 = conv(dilations = input_165_dilations_0, groups = input_165_groups_0, pad = input_165_pad_0, pad_type = input_165_pad_type_0, strides = input_165_strides_0, weight = layers_20_mlp_gate_proj_weight_to_fp16_palettized, x = input_163_cast_fp16)[name = string("input_165_cast_fp16")];
+            tensor<fp16, [1, 6144, 1, 1]> var_8269_cast_fp16 = silu(x = input_165_cast_fp16)[name = string("op_8269_cast_fp16")];
+            string var_8275_pad_type_0 = const()[name = string("op_8275_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_8275_strides_0 = const()[name = string("op_8275_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_8275_pad_0 = const()[name = string("op_8275_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_8275_dilations_0 = const()[name = string("op_8275_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_8275_groups_0 = const()[name = string("op_8275_groups_0"), val = int32(1)];
+            tensor<fp16, [6144, 2048, 1, 1]> layers_20_mlp_up_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [6144, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(1032153280))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(1044736256))))[name = string("layers_20_mlp_up_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 6144, 1, 1]> var_8275_cast_fp16 = conv(dilations = var_8275_dilations_0, groups = var_8275_groups_0, pad = var_8275_pad_0, pad_type = var_8275_pad_type_0, strides = var_8275_strides_0, weight = layers_20_mlp_up_proj_weight_to_fp16_palettized, x = input_163_cast_fp16)[name = string("op_8275_cast_fp16")];
+            tensor<fp16, [1, 6144, 1, 1]> input_167_cast_fp16 = mul(x = var_8269_cast_fp16, y = var_8275_cast_fp16)[name = string("input_167_cast_fp16")];
+            string hidden_states_209_pad_type_0 = const()[name = string("hidden_states_209_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> hidden_states_209_strides_0 = const()[name = string("hidden_states_209_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> hidden_states_209_pad_0 = const()[name = string("hidden_states_209_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> hidden_states_209_dilations_0 = const()[name = string("hidden_states_209_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 hidden_states_209_groups_0 = const()[name = string("hidden_states_209_groups_0"), val = int32(1)];
+            tensor<fp16, [2048, 6144, 1, 1]> layers_20_mlp_down_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [2048, 6144, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(1044736832))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(1057319808))))[name = string("layers_20_mlp_down_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1, 1]> hidden_states_209_cast_fp16 = conv(dilations = hidden_states_209_dilations_0, groups = hidden_states_209_groups_0, pad = hidden_states_209_pad_0, pad_type = hidden_states_209_pad_type_0, strides = hidden_states_209_strides_0, weight = layers_20_mlp_down_proj_weight_to_fp16_palettized, x = input_167_cast_fp16)[name = string("hidden_states_209_cast_fp16")];
+            tensor<fp16, [1, 2048, 1, 1]> inputs_167_cast_fp16 = add(x = inputs_165_cast_fp16, y = hidden_states_209_cast_fp16)[name = string("inputs_167_cast_fp16")];
+            int32 var_8289 = const()[name = string("op_8289"), val = int32(3)];
+            int32 var_8299 = const()[name = string("op_8299"), val = int32(-2)];
+            int32 var_8307 = const()[name = string("op_8307"), val = int32(1)];
+            tensor<fp16, [1, 2048, 1, 1]> inputs_sq_169_cast_fp16 = mul(x = inputs_167_cast_fp16, y = inputs_167_cast_fp16)[name = string("inputs_sq_169_cast_fp16")];
+            tensor<int32, [1]> variance_169_axes_0 = const()[name = string("variance_169_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_169_keep_dims_0 = const()[name = string("variance_169_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 1, 1, 1]> variance_169_cast_fp16 = reduce_mean(axes = variance_169_axes_0, keep_dims = variance_169_keep_dims_0, x = inputs_sq_169_cast_fp16)[name = string("variance_169_cast_fp16")];
+            fp16 var_8319_to_fp16 = const()[name = string("op_8319_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 1, 1]> var_8320_cast_fp16 = add(x = variance_169_cast_fp16, y = var_8319_to_fp16)[name = string("op_8320_cast_fp16")];
+            fp32 var_8321_epsilon_0 = const()[name = string("op_8321_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 1, 1, 1]> var_8321_cast_fp16 = rsqrt(epsilon = var_8321_epsilon_0, x = var_8320_cast_fp16)[name = string("op_8321_cast_fp16")];
+            tensor<fp16, [1, 2048, 1, 1]> hidden_states_211_cast_fp16 = mul(x = inputs_167_cast_fp16, y = var_8321_cast_fp16)[name = string("hidden_states_211_cast_fp16")];
+            tensor<fp16, [1, 2048, 1, 1]> w_169_to_fp16 = const()[name = string("w_169_to_fp16"), val = tensor<fp16, [1, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(1057320384)))];
+            tensor<fp16, [1, 2048, 1, 1]> obj_173_cast_fp16 = mul(x = w_169_to_fp16, y = hidden_states_211_cast_fp16)[name = string("obj_173_cast_fp16")];
+            string query_127_pad_type_0 = const()[name = string("query_127_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> query_127_strides_0 = const()[name = string("query_127_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> query_127_pad_0 = const()[name = string("query_127_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> query_127_dilations_0 = const()[name = string("query_127_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 query_127_groups_0 = const()[name = string("query_127_groups_0"), val = int32(1)];
+            tensor<fp16, [2048, 2048, 1, 1]> layers_21_self_attn_q_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [2048, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(1057324544))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(1061518912))))[name = string("layers_21_self_attn_q_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1, 1]> query_127_cast_fp16 = conv(bias = layers_0_self_attn_q_proj_bias_to_fp16, dilations = query_127_dilations_0, groups = query_127_groups_0, pad = query_127_pad_0, pad_type = query_127_pad_type_0, strides = query_127_strides_0, weight = layers_21_self_attn_q_proj_weight_to_fp16_palettized, x = obj_173_cast_fp16)[name = string("query_127_cast_fp16")];
+            string current_key_85_pad_type_0 = const()[name = string("current_key_85_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> current_key_85_strides_0 = const()[name = string("current_key_85_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> current_key_85_pad_0 = const()[name = string("current_key_85_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> current_key_85_dilations_0 = const()[name = string("current_key_85_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 current_key_85_groups_0 = const()[name = string("current_key_85_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 2048, 1, 1]> layers_21_self_attn_k_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(1061519488))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(1063616704))))[name = string("layers_21_self_attn_k_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> current_key_85_cast_fp16 = conv(dilations = current_key_85_dilations_0, groups = current_key_85_groups_0, pad = current_key_85_pad_0, pad_type = current_key_85_pad_type_0, strides = current_key_85_strides_0, weight = layers_21_self_attn_k_proj_weight_to_fp16_palettized, x = obj_173_cast_fp16)[name = string("current_key_85_cast_fp16")];
+            string current_value_43_pad_type_0 = const()[name = string("current_value_43_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> current_value_43_strides_0 = const()[name = string("current_value_43_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> current_value_43_pad_0 = const()[name = string("current_value_43_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> current_value_43_dilations_0 = const()[name = string("current_value_43_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 current_value_43_groups_0 = const()[name = string("current_value_43_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 2048, 1, 1]> layers_21_self_attn_v_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(1063617280))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(1065714496))))[name = string("layers_21_self_attn_v_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> current_value_43_cast_fp16 = conv(bias = layers_0_self_attn_v_proj_bias_to_fp16, dilations = current_value_43_dilations_0, groups = current_value_43_groups_0, pad = current_value_43_pad_0, pad_type = current_value_43_pad_type_0, strides = current_value_43_strides_0, weight = layers_21_self_attn_v_proj_weight_to_fp16_palettized, x = obj_173_cast_fp16)[name = string("current_value_43_cast_fp16")];
+            tensor<int32, [4]> var_8358 = const()[name = string("op_8358"), val = tensor<int32, [4]>([16, 128, 1, 1])];
+            tensor<fp16, [16, 128, 1, 1]> inputs_169_cast_fp16 = reshape(shape = var_8358, x = query_127_cast_fp16)[name = string("inputs_169_cast_fp16")];
+            tensor<fp16, [16, 128, 1, 1]> inputs_sq_171_cast_fp16 = mul(x = inputs_169_cast_fp16, y = inputs_169_cast_fp16)[name = string("inputs_sq_171_cast_fp16")];
+            tensor<int32, [1]> variance_171_axes_0 = const()[name = string("variance_171_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_171_keep_dims_0 = const()[name = string("variance_171_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [16, 1, 1, 1]> variance_171_cast_fp16 = reduce_mean(axes = variance_171_axes_0, keep_dims = variance_171_keep_dims_0, x = inputs_sq_171_cast_fp16)[name = string("variance_171_cast_fp16")];
+            fp16 var_8364_to_fp16 = const()[name = string("op_8364_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [16, 1, 1, 1]> var_8365_cast_fp16 = add(x = variance_171_cast_fp16, y = var_8364_to_fp16)[name = string("op_8365_cast_fp16")];
+            fp32 var_8366_epsilon_0 = const()[name = string("op_8366_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [16, 1, 1, 1]> var_8366_cast_fp16 = rsqrt(epsilon = var_8366_epsilon_0, x = var_8365_cast_fp16)[name = string("op_8366_cast_fp16")];
+            tensor<fp16, [16, 128, 1, 1]> hidden_states_213_cast_fp16 = mul(x = inputs_169_cast_fp16, y = var_8366_cast_fp16)[name = string("hidden_states_213_cast_fp16")];
+            tensor<fp16, [1, 128, 1, 1]> w_171_to_fp16 = const()[name = string("w_171_to_fp16"), val = tensor<fp16, [1, 128, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(1065715072)))];
+            tensor<fp16, [16, 128, 1, 1]> query_normed_43_cast_fp16 = mul(x = w_171_to_fp16, y = hidden_states_213_cast_fp16)[name = string("query_normed_43_cast_fp16")];
+            tensor<int32, [4]> var_8374 = const()[name = string("op_8374"), val = tensor<int32, [4]>([8, 128, 1, 1])];
+            tensor<fp16, [8, 128, 1, 1]> inputs_171_cast_fp16 = reshape(shape = var_8374, x = current_key_85_cast_fp16)[name = string("inputs_171_cast_fp16")];
+            tensor<fp16, [8, 128, 1, 1]> inputs_sq_173_cast_fp16 = mul(x = inputs_171_cast_fp16, y = inputs_171_cast_fp16)[name = string("inputs_sq_173_cast_fp16")];
+            tensor<int32, [1]> variance_173_axes_0 = const()[name = string("variance_173_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_173_keep_dims_0 = const()[name = string("variance_173_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [8, 1, 1, 1]> variance_173_cast_fp16 = reduce_mean(axes = variance_173_axes_0, keep_dims = variance_173_keep_dims_0, x = inputs_sq_173_cast_fp16)[name = string("variance_173_cast_fp16")];
+            fp16 var_8380_to_fp16 = const()[name = string("op_8380_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [8, 1, 1, 1]> var_8381_cast_fp16 = add(x = variance_173_cast_fp16, y = var_8380_to_fp16)[name = string("op_8381_cast_fp16")];
+            fp32 var_8382_epsilon_0 = const()[name = string("op_8382_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [8, 1, 1, 1]> var_8382_cast_fp16 = rsqrt(epsilon = var_8382_epsilon_0, x = var_8381_cast_fp16)[name = string("op_8382_cast_fp16")];
+            tensor<fp16, [8, 128, 1, 1]> hidden_states_215_cast_fp16 = mul(x = inputs_171_cast_fp16, y = var_8382_cast_fp16)[name = string("hidden_states_215_cast_fp16")];
+            tensor<fp16, [1, 128, 1, 1]> w_173_to_fp16 = const()[name = string("w_173_to_fp16"), val = tensor<fp16, [1, 128, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(1065715392)))];
+            tensor<fp16, [8, 128, 1, 1]> current_key_normed_43_cast_fp16 = mul(x = w_173_to_fp16, y = hidden_states_215_cast_fp16)[name = string("current_key_normed_43_cast_fp16")];
+            tensor<int32, [4]> var_8400 = const()[name = string("op_8400"), val = tensor<int32, [4]>([1, 16, 128, -1])];
+            tensor<fp16, [1, 16, 128, 1]> mh_q_127_cast_fp16 = reshape(shape = var_8400, x = query_normed_43_cast_fp16)[name = string("mh_q_127_cast_fp16")];
+            tensor<int32, [4]> var_8402 = const()[name = string("op_8402"), val = tensor<int32, [4]>([1, 8, 128, -1])];
+            tensor<fp16, [1, 8, 128, 1]> mh_k_85_cast_fp16 = reshape(shape = var_8402, x = current_key_normed_43_cast_fp16)[name = string("mh_k_85_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> var_8406_cast_fp16 = mul(x = mh_q_127_cast_fp16, y = cos_1_cast_fp16)[name = string("op_8406_cast_fp16")];
+            tensor<int32, [4]> var_8411_begin_0 = const()[name = string("op_8411_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_8411_end_0 = const()[name = string("op_8411_end_0"), val = tensor<int32, [4]>([1, 16, 64, 1])];
+            tensor<bool, [4]> var_8411_end_mask_0 = const()[name = string("op_8411_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 16, 64, 1]> var_8411_cast_fp16 = slice_by_index(begin = var_8411_begin_0, end = var_8411_end_0, end_mask = var_8411_end_mask_0, x = mh_q_127_cast_fp16)[name = string("op_8411_cast_fp16")];
+            tensor<int32, [4]> var_8417_begin_0 = const()[name = string("op_8417_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_8417_end_0 = const()[name = string("op_8417_end_0"), val = tensor<int32, [4]>([1, 16, 128, 1])];
+            tensor<bool, [4]> var_8417_end_mask_0 = const()[name = string("op_8417_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 16, 64, 1]> var_8417_cast_fp16 = slice_by_index(begin = var_8417_begin_0, end = var_8417_end_0, end_mask = var_8417_end_mask_0, x = mh_q_127_cast_fp16)[name = string("op_8417_cast_fp16")];
+            fp16 const_500_promoted_to_fp16 = const()[name = string("const_500_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 16, 64, 1]> var_8419_cast_fp16 = mul(x = var_8417_cast_fp16, y = const_500_promoted_to_fp16)[name = string("op_8419_cast_fp16")];
+            bool var_8421_interleave_0 = const()[name = string("op_8421_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 1]> var_8421_cast_fp16 = concat(axis = var_8299, interleave = var_8421_interleave_0, values = (var_8419_cast_fp16, var_8411_cast_fp16))[name = string("op_8421_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> var_8422_cast_fp16 = mul(x = var_8421_cast_fp16, y = sin_1_cast_fp16)[name = string("op_8422_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> mh_q_129_cast_fp16 = add(x = var_8406_cast_fp16, y = var_8422_cast_fp16)[name = string("mh_q_129_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> var_8424_cast_fp16 = mul(x = mh_k_85_cast_fp16, y = cos_1_cast_fp16)[name = string("op_8424_cast_fp16")];
+            tensor<int32, [4]> var_8429_begin_0 = const()[name = string("op_8429_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_8429_end_0 = const()[name = string("op_8429_end_0"), val = tensor<int32, [4]>([1, 8, 64, 1])];
+            tensor<bool, [4]> var_8429_end_mask_0 = const()[name = string("op_8429_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 8, 64, 1]> var_8429_cast_fp16 = slice_by_index(begin = var_8429_begin_0, end = var_8429_end_0, end_mask = var_8429_end_mask_0, x = mh_k_85_cast_fp16)[name = string("op_8429_cast_fp16")];
+            tensor<int32, [4]> var_8435_begin_0 = const()[name = string("op_8435_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_8435_end_0 = const()[name = string("op_8435_end_0"), val = tensor<int32, [4]>([1, 8, 128, 1])];
+            tensor<bool, [4]> var_8435_end_mask_0 = const()[name = string("op_8435_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 8, 64, 1]> var_8435_cast_fp16 = slice_by_index(begin = var_8435_begin_0, end = var_8435_end_0, end_mask = var_8435_end_mask_0, x = mh_k_85_cast_fp16)[name = string("op_8435_cast_fp16")];
+            fp16 const_503_promoted_to_fp16 = const()[name = string("const_503_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 64, 1]> var_8437_cast_fp16 = mul(x = var_8435_cast_fp16, y = const_503_promoted_to_fp16)[name = string("op_8437_cast_fp16")];
+            bool var_8439_interleave_0 = const()[name = string("op_8439_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 128, 1]> var_8439_cast_fp16 = concat(axis = var_8299, interleave = var_8439_interleave_0, values = (var_8437_cast_fp16, var_8429_cast_fp16))[name = string("op_8439_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> var_8440_cast_fp16 = mul(x = var_8439_cast_fp16, y = sin_1_cast_fp16)[name = string("op_8440_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> mh_k_87_cast_fp16 = add(x = var_8424_cast_fp16, y = var_8440_cast_fp16)[name = string("mh_k_87_cast_fp16")];
+            tensor<int32, [4]> var_8444 = const()[name = string("op_8444"), val = tensor<int32, [4]>([1, 1024, 1, 1])];
+            tensor<fp16, [1, 1024, 1, 1]> current_key_87_cast_fp16 = reshape(shape = var_8444, x = mh_k_87_cast_fp16)[name = string("current_key_87_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_8451_cast_fp16 = mul(x = var_101_cast_fp16_21, y = var_323_cast_fp16)[name = string("op_8451_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_8452_cast_fp16 = mul(x = current_key_87_cast_fp16, y = var_321_cast_fp16)[name = string("op_8452_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> key_129_cast_fp16 = add(x = var_8451_cast_fp16, y = var_8452_cast_fp16)[name = string("key_129_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_8455_cast_fp16 = mul(x = var_132_cast_fp16_21, y = var_323_cast_fp16)[name = string("op_8455_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_8456_cast_fp16 = mul(x = current_value_43_cast_fp16, y = var_321_cast_fp16)[name = string("op_8456_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> value_85_cast_fp16 = add(x = var_8455_cast_fp16, y = var_8456_cast_fp16)[name = string("value_85_cast_fp16")];
+            tensor<int32, [4]> var_8460 = const()[name = string("op_8460"), val = tensor<int32, [4]>([1, 8, 128, 256])];
+            tensor<fp16, [1, 8, 128, 256]> key_heads_85_cast_fp16 = reshape(shape = var_8460, x = key_129_cast_fp16)[name = string("key_heads_85_cast_fp16")];
+            tensor<int32, [4]> var_8462 = const()[name = string("op_8462"), val = tensor<int32, [4]>([1, 8, 128, 256])];
+            tensor<fp16, [1, 8, 128, 256]> value_heads_85_cast_fp16 = reshape(shape = var_8462, x = value_85_cast_fp16)[name = string("value_heads_85_cast_fp16")];
+            tensor<int32, [4]> var_8465_begin_0 = const()[name = string("op_8465_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_8465_end_0 = const()[name = string("op_8465_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_8465_end_mask_0 = const()[name = string("op_8465_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_8465_cast_fp16 = slice_by_index(begin = var_8465_begin_0, end = var_8465_end_0, end_mask = var_8465_end_mask_0, x = key_heads_85_cast_fp16)[name = string("op_8465_cast_fp16")];
+            tensor<int32, [4]> var_8469_begin_0 = const()[name = string("op_8469_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_8469_end_0 = const()[name = string("op_8469_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_8469_end_mask_0 = const()[name = string("op_8469_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_8469_cast_fp16 = slice_by_index(begin = var_8469_begin_0, end = var_8469_end_0, end_mask = var_8469_end_mask_0, x = value_heads_85_cast_fp16)[name = string("op_8469_cast_fp16")];
+            tensor<int32, [4]> var_8481_begin_0 = const()[name = string("op_8481_begin_0"), val = tensor<int32, [4]>([0, 1, 0, 0])];
+            tensor<int32, [4]> var_8481_end_0 = const()[name = string("op_8481_end_0"), val = tensor<int32, [4]>([1, 2, 128, 256])];
+            tensor<bool, [4]> var_8481_end_mask_0 = const()[name = string("op_8481_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_8481_cast_fp16 = slice_by_index(begin = var_8481_begin_0, end = var_8481_end_0, end_mask = var_8481_end_mask_0, x = key_heads_85_cast_fp16)[name = string("op_8481_cast_fp16")];
+            tensor<int32, [4]> var_8485_begin_0 = const()[name = string("op_8485_begin_0"), val = tensor<int32, [4]>([0, 1, 0, 0])];
+            tensor<int32, [4]> var_8485_end_0 = const()[name = string("op_8485_end_0"), val = tensor<int32, [4]>([1, 2, 128, 256])];
+            tensor<bool, [4]> var_8485_end_mask_0 = const()[name = string("op_8485_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_8485_cast_fp16 = slice_by_index(begin = var_8485_begin_0, end = var_8485_end_0, end_mask = var_8485_end_mask_0, x = value_heads_85_cast_fp16)[name = string("op_8485_cast_fp16")];
+            tensor<int32, [4]> var_8497_begin_0 = const()[name = string("op_8497_begin_0"), val = tensor<int32, [4]>([0, 2, 0, 0])];
+            tensor<int32, [4]> var_8497_end_0 = const()[name = string("op_8497_end_0"), val = tensor<int32, [4]>([1, 3, 128, 256])];
+            tensor<bool, [4]> var_8497_end_mask_0 = const()[name = string("op_8497_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_8497_cast_fp16 = slice_by_index(begin = var_8497_begin_0, end = var_8497_end_0, end_mask = var_8497_end_mask_0, x = key_heads_85_cast_fp16)[name = string("op_8497_cast_fp16")];
+            tensor<int32, [4]> var_8501_begin_0 = const()[name = string("op_8501_begin_0"), val = tensor<int32, [4]>([0, 2, 0, 0])];
+            tensor<int32, [4]> var_8501_end_0 = const()[name = string("op_8501_end_0"), val = tensor<int32, [4]>([1, 3, 128, 256])];
+            tensor<bool, [4]> var_8501_end_mask_0 = const()[name = string("op_8501_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_8501_cast_fp16 = slice_by_index(begin = var_8501_begin_0, end = var_8501_end_0, end_mask = var_8501_end_mask_0, x = value_heads_85_cast_fp16)[name = string("op_8501_cast_fp16")];
+            tensor<int32, [4]> var_8513_begin_0 = const()[name = string("op_8513_begin_0"), val = tensor<int32, [4]>([0, 3, 0, 0])];
+            tensor<int32, [4]> var_8513_end_0 = const()[name = string("op_8513_end_0"), val = tensor<int32, [4]>([1, 4, 128, 256])];
+            tensor<bool, [4]> var_8513_end_mask_0 = const()[name = string("op_8513_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_8513_cast_fp16 = slice_by_index(begin = var_8513_begin_0, end = var_8513_end_0, end_mask = var_8513_end_mask_0, x = key_heads_85_cast_fp16)[name = string("op_8513_cast_fp16")];
+            tensor<int32, [4]> var_8517_begin_0 = const()[name = string("op_8517_begin_0"), val = tensor<int32, [4]>([0, 3, 0, 0])];
+            tensor<int32, [4]> var_8517_end_0 = const()[name = string("op_8517_end_0"), val = tensor<int32, [4]>([1, 4, 128, 256])];
+            tensor<bool, [4]> var_8517_end_mask_0 = const()[name = string("op_8517_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_8517_cast_fp16 = slice_by_index(begin = var_8517_begin_0, end = var_8517_end_0, end_mask = var_8517_end_mask_0, x = value_heads_85_cast_fp16)[name = string("op_8517_cast_fp16")];
+            tensor<int32, [4]> var_8529_begin_0 = const()[name = string("op_8529_begin_0"), val = tensor<int32, [4]>([0, 4, 0, 0])];
+            tensor<int32, [4]> var_8529_end_0 = const()[name = string("op_8529_end_0"), val = tensor<int32, [4]>([1, 5, 128, 256])];
+            tensor<bool, [4]> var_8529_end_mask_0 = const()[name = string("op_8529_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_8529_cast_fp16 = slice_by_index(begin = var_8529_begin_0, end = var_8529_end_0, end_mask = var_8529_end_mask_0, x = key_heads_85_cast_fp16)[name = string("op_8529_cast_fp16")];
+            tensor<int32, [4]> var_8533_begin_0 = const()[name = string("op_8533_begin_0"), val = tensor<int32, [4]>([0, 4, 0, 0])];
+            tensor<int32, [4]> var_8533_end_0 = const()[name = string("op_8533_end_0"), val = tensor<int32, [4]>([1, 5, 128, 256])];
+            tensor<bool, [4]> var_8533_end_mask_0 = const()[name = string("op_8533_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_8533_cast_fp16 = slice_by_index(begin = var_8533_begin_0, end = var_8533_end_0, end_mask = var_8533_end_mask_0, x = value_heads_85_cast_fp16)[name = string("op_8533_cast_fp16")];
+            tensor<int32, [4]> var_8545_begin_0 = const()[name = string("op_8545_begin_0"), val = tensor<int32, [4]>([0, 5, 0, 0])];
+            tensor<int32, [4]> var_8545_end_0 = const()[name = string("op_8545_end_0"), val = tensor<int32, [4]>([1, 6, 128, 256])];
+            tensor<bool, [4]> var_8545_end_mask_0 = const()[name = string("op_8545_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_8545_cast_fp16 = slice_by_index(begin = var_8545_begin_0, end = var_8545_end_0, end_mask = var_8545_end_mask_0, x = key_heads_85_cast_fp16)[name = string("op_8545_cast_fp16")];
+            tensor<int32, [4]> var_8549_begin_0 = const()[name = string("op_8549_begin_0"), val = tensor<int32, [4]>([0, 5, 0, 0])];
+            tensor<int32, [4]> var_8549_end_0 = const()[name = string("op_8549_end_0"), val = tensor<int32, [4]>([1, 6, 128, 256])];
+            tensor<bool, [4]> var_8549_end_mask_0 = const()[name = string("op_8549_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_8549_cast_fp16 = slice_by_index(begin = var_8549_begin_0, end = var_8549_end_0, end_mask = var_8549_end_mask_0, x = value_heads_85_cast_fp16)[name = string("op_8549_cast_fp16")];
+            tensor<int32, [4]> var_8561_begin_0 = const()[name = string("op_8561_begin_0"), val = tensor<int32, [4]>([0, 6, 0, 0])];
+            tensor<int32, [4]> var_8561_end_0 = const()[name = string("op_8561_end_0"), val = tensor<int32, [4]>([1, 7, 128, 256])];
+            tensor<bool, [4]> var_8561_end_mask_0 = const()[name = string("op_8561_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_8561_cast_fp16 = slice_by_index(begin = var_8561_begin_0, end = var_8561_end_0, end_mask = var_8561_end_mask_0, x = key_heads_85_cast_fp16)[name = string("op_8561_cast_fp16")];
+            tensor<int32, [4]> var_8565_begin_0 = const()[name = string("op_8565_begin_0"), val = tensor<int32, [4]>([0, 6, 0, 0])];
+            tensor<int32, [4]> var_8565_end_0 = const()[name = string("op_8565_end_0"), val = tensor<int32, [4]>([1, 7, 128, 256])];
+            tensor<bool, [4]> var_8565_end_mask_0 = const()[name = string("op_8565_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_8565_cast_fp16 = slice_by_index(begin = var_8565_begin_0, end = var_8565_end_0, end_mask = var_8565_end_mask_0, x = value_heads_85_cast_fp16)[name = string("op_8565_cast_fp16")];
+            tensor<int32, [4]> var_8577_begin_0 = const()[name = string("op_8577_begin_0"), val = tensor<int32, [4]>([0, 7, 0, 0])];
+            tensor<int32, [4]> var_8577_end_0 = const()[name = string("op_8577_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_8577_end_mask_0 = const()[name = string("op_8577_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_8577_cast_fp16 = slice_by_index(begin = var_8577_begin_0, end = var_8577_end_0, end_mask = var_8577_end_mask_0, x = key_heads_85_cast_fp16)[name = string("op_8577_cast_fp16")];
+            tensor<int32, [4]> var_8581_begin_0 = const()[name = string("op_8581_begin_0"), val = tensor<int32, [4]>([0, 7, 0, 0])];
+            tensor<int32, [4]> var_8581_end_0 = const()[name = string("op_8581_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_8581_end_mask_0 = const()[name = string("op_8581_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_8581_cast_fp16 = slice_by_index(begin = var_8581_begin_0, end = var_8581_end_0, end_mask = var_8581_end_mask_0, x = value_heads_85_cast_fp16)[name = string("op_8581_cast_fp16")];
+            bool key_heads_87_interleave_0 = const()[name = string("key_heads_87_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 256]> key_heads_87_cast_fp16 = concat(axis = var_8307, interleave = key_heads_87_interleave_0, values = (var_8465_cast_fp16, var_8465_cast_fp16, var_8481_cast_fp16, var_8481_cast_fp16, var_8497_cast_fp16, var_8497_cast_fp16, var_8513_cast_fp16, var_8513_cast_fp16, var_8529_cast_fp16, var_8529_cast_fp16, var_8545_cast_fp16, var_8545_cast_fp16, var_8561_cast_fp16, var_8561_cast_fp16, var_8577_cast_fp16, var_8577_cast_fp16))[name = string("key_heads_87_cast_fp16")];
+            bool value_heads_87_interleave_0 = const()[name = string("value_heads_87_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 256]> value_heads_87_cast_fp16 = concat(axis = var_8307, interleave = value_heads_87_interleave_0, values = (var_8469_cast_fp16, var_8469_cast_fp16, var_8485_cast_fp16, var_8485_cast_fp16, var_8501_cast_fp16, var_8501_cast_fp16, var_8517_cast_fp16, var_8517_cast_fp16, var_8533_cast_fp16, var_8533_cast_fp16, var_8549_cast_fp16, var_8549_cast_fp16, var_8565_cast_fp16, var_8565_cast_fp16, var_8581_cast_fp16, var_8581_cast_fp16))[name = string("value_heads_87_cast_fp16")];
+            fp16 var_8604_to_fp16 = const()[name = string("op_8604_to_fp16"), val = fp16(0x1.6ap-4)];
+            tensor<fp16, [1, 16, 128, 1]> var_8605_cast_fp16 = mul(x = mh_q_129_cast_fp16, y = var_8604_to_fp16)[name = string("op_8605_cast_fp16")];
+            bool mh_w_85_transpose_x_0 = const()[name = string("mh_w_85_transpose_x_0"), val = bool(true)];
+            bool mh_w_85_transpose_y_0 = const()[name = string("mh_w_85_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 1, 256]> mh_w_85_cast_fp16 = matmul(transpose_x = mh_w_85_transpose_x_0, transpose_y = mh_w_85_transpose_y_0, x = var_8605_cast_fp16, y = key_heads_87_cast_fp16)[name = string("mh_w_85_cast_fp16")];
+            tensor<fp16, [1, 16, 1, 256]> mh_w_87_cast_fp16 = add(x = mh_w_85_cast_fp16, y = var_487_cast_fp16)[name = string("mh_w_87_cast_fp16")];
+            tensor<fp16, [1, 16, 1, 256]> var_8617_cast_fp16 = softmax(axis = var_8289, x = mh_w_87_cast_fp16)[name = string("op_8617_cast_fp16")];
+            bool attn_43_transpose_x_0 = const()[name = string("attn_43_transpose_x_0"), val = bool(false)];
+            bool attn_43_transpose_y_0 = const()[name = string("attn_43_transpose_y_0"), val = bool(true)];
+            tensor<fp16, [1, 16, 128, 1]> attn_43_cast_fp16 = matmul(transpose_x = attn_43_transpose_x_0, transpose_y = attn_43_transpose_y_0, x = value_heads_87_cast_fp16, y = var_8617_cast_fp16)[name = string("attn_43_cast_fp16")];
+            tensor<int32, [4]> var_8622 = const()[name = string("op_8622"), val = tensor<int32, [4]>([1, -1, 1, 1])];
+            tensor<fp16, [1, 2048, 1, 1]> input_169_cast_fp16 = reshape(shape = var_8622, x = attn_43_cast_fp16)[name = string("input_169_cast_fp16")];
+            string obj_179_pad_type_0 = const()[name = string("obj_179_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> obj_179_strides_0 = const()[name = string("obj_179_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> obj_179_pad_0 = const()[name = string("obj_179_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> obj_179_dilations_0 = const()[name = string("obj_179_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 obj_179_groups_0 = const()[name = string("obj_179_groups_0"), val = int32(1)];
+            tensor<fp16, [2048, 2048, 1, 1]> layers_21_self_attn_o_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [2048, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(1065715712))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(1069910080))))[name = string("layers_21_self_attn_o_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1, 1]> obj_179_cast_fp16 = conv(bias = layers_0_self_attn_q_proj_bias_to_fp16, dilations = obj_179_dilations_0, groups = obj_179_groups_0, pad = obj_179_pad_0, pad_type = obj_179_pad_type_0, strides = obj_179_strides_0, weight = layers_21_self_attn_o_proj_weight_to_fp16_palettized, x = input_169_cast_fp16)[name = string("obj_179_cast_fp16")];
+            tensor<fp16, [1, 2048, 1, 1]> inputs_173_cast_fp16 = add(x = inputs_167_cast_fp16, y = obj_179_cast_fp16)[name = string("inputs_173_cast_fp16")];
+            tensor<fp16, [1, 2048, 1, 1]> inputs_sq_175_cast_fp16 = mul(x = inputs_173_cast_fp16, y = inputs_173_cast_fp16)[name = string("inputs_sq_175_cast_fp16")];
+            tensor<int32, [1]> variance_175_axes_0 = const()[name = string("variance_175_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_175_keep_dims_0 = const()[name = string("variance_175_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 1, 1, 1]> variance_175_cast_fp16 = reduce_mean(axes = variance_175_axes_0, keep_dims = variance_175_keep_dims_0, x = inputs_sq_175_cast_fp16)[name = string("variance_175_cast_fp16")];
+            fp16 var_8640_to_fp16 = const()[name = string("op_8640_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 1, 1]> var_8641_cast_fp16 = add(x = variance_175_cast_fp16, y = var_8640_to_fp16)[name = string("op_8641_cast_fp16")];
+            fp32 var_8642_epsilon_0 = const()[name = string("op_8642_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 1, 1, 1]> var_8642_cast_fp16 = rsqrt(epsilon = var_8642_epsilon_0, x = var_8641_cast_fp16)[name = string("op_8642_cast_fp16")];
+            tensor<fp16, [1, 2048, 1, 1]> hidden_states_217_cast_fp16 = mul(x = inputs_173_cast_fp16, y = var_8642_cast_fp16)[name = string("hidden_states_217_cast_fp16")];
+            tensor<fp16, [1, 2048, 1, 1]> w_175_to_fp16 = const()[name = string("w_175_to_fp16"), val = tensor<fp16, [1, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(1069910656)))];
+            tensor<fp16, [1, 2048, 1, 1]> input_171_cast_fp16 = mul(x = w_175_to_fp16, y = hidden_states_217_cast_fp16)[name = string("input_171_cast_fp16")];
+            string input_173_pad_type_0 = const()[name = string("input_173_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> input_173_strides_0 = const()[name = string("input_173_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> input_173_pad_0 = const()[name = string("input_173_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> input_173_dilations_0 = const()[name = string("input_173_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 input_173_groups_0 = const()[name = string("input_173_groups_0"), val = int32(1)];
+            tensor<fp16, [6144, 2048, 1, 1]> layers_21_mlp_gate_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [6144, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(1069914816))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(1082497792))))[name = string("layers_21_mlp_gate_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 6144, 1, 1]> input_173_cast_fp16 = conv(dilations = input_173_dilations_0, groups = input_173_groups_0, pad = input_173_pad_0, pad_type = input_173_pad_type_0, strides = input_173_strides_0, weight = layers_21_mlp_gate_proj_weight_to_fp16_palettized, x = input_171_cast_fp16)[name = string("input_173_cast_fp16")];
+            tensor<fp16, [1, 6144, 1, 1]> var_8656_cast_fp16 = silu(x = input_173_cast_fp16)[name = string("op_8656_cast_fp16")];
+            string var_8662_pad_type_0 = const()[name = string("op_8662_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_8662_strides_0 = const()[name = string("op_8662_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_8662_pad_0 = const()[name = string("op_8662_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_8662_dilations_0 = const()[name = string("op_8662_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_8662_groups_0 = const()[name = string("op_8662_groups_0"), val = int32(1)];
+            tensor<fp16, [6144, 2048, 1, 1]> layers_21_mlp_up_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [6144, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(1082498368))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(1095081344))))[name = string("layers_21_mlp_up_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 6144, 1, 1]> var_8662_cast_fp16 = conv(dilations = var_8662_dilations_0, groups = var_8662_groups_0, pad = var_8662_pad_0, pad_type = var_8662_pad_type_0, strides = var_8662_strides_0, weight = layers_21_mlp_up_proj_weight_to_fp16_palettized, x = input_171_cast_fp16)[name = string("op_8662_cast_fp16")];
+            tensor<fp16, [1, 6144, 1, 1]> input_175_cast_fp16 = mul(x = var_8656_cast_fp16, y = var_8662_cast_fp16)[name = string("input_175_cast_fp16")];
+            string hidden_states_219_pad_type_0 = const()[name = string("hidden_states_219_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> hidden_states_219_strides_0 = const()[name = string("hidden_states_219_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> hidden_states_219_pad_0 = const()[name = string("hidden_states_219_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> hidden_states_219_dilations_0 = const()[name = string("hidden_states_219_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 hidden_states_219_groups_0 = const()[name = string("hidden_states_219_groups_0"), val = int32(1)];
+            tensor<fp16, [2048, 6144, 1, 1]> layers_21_mlp_down_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [2048, 6144, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(1095081920))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(1107664896))))[name = string("layers_21_mlp_down_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1, 1]> hidden_states_219_cast_fp16 = conv(dilations = hidden_states_219_dilations_0, groups = hidden_states_219_groups_0, pad = hidden_states_219_pad_0, pad_type = hidden_states_219_pad_type_0, strides = hidden_states_219_strides_0, weight = layers_21_mlp_down_proj_weight_to_fp16_palettized, x = input_175_cast_fp16)[name = string("hidden_states_219_cast_fp16")];
+            tensor<fp16, [1, 2048, 1, 1]> inputs_175_cast_fp16 = add(x = inputs_173_cast_fp16, y = hidden_states_219_cast_fp16)[name = string("inputs_175_cast_fp16")];
+            int32 var_8676 = const()[name = string("op_8676"), val = int32(3)];
+            int32 var_8686 = const()[name = string("op_8686"), val = int32(-2)];
+            int32 var_8694 = const()[name = string("op_8694"), val = int32(1)];
+            tensor<fp16, [1, 2048, 1, 1]> inputs_sq_177_cast_fp16 = mul(x = inputs_175_cast_fp16, y = inputs_175_cast_fp16)[name = string("inputs_sq_177_cast_fp16")];
+            tensor<int32, [1]> variance_177_axes_0 = const()[name = string("variance_177_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_177_keep_dims_0 = const()[name = string("variance_177_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 1, 1, 1]> variance_177_cast_fp16 = reduce_mean(axes = variance_177_axes_0, keep_dims = variance_177_keep_dims_0, x = inputs_sq_177_cast_fp16)[name = string("variance_177_cast_fp16")];
+            fp16 var_8706_to_fp16 = const()[name = string("op_8706_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 1, 1]> var_8707_cast_fp16 = add(x = variance_177_cast_fp16, y = var_8706_to_fp16)[name = string("op_8707_cast_fp16")];
+            fp32 var_8708_epsilon_0 = const()[name = string("op_8708_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 1, 1, 1]> var_8708_cast_fp16 = rsqrt(epsilon = var_8708_epsilon_0, x = var_8707_cast_fp16)[name = string("op_8708_cast_fp16")];
+            tensor<fp16, [1, 2048, 1, 1]> hidden_states_221_cast_fp16 = mul(x = inputs_175_cast_fp16, y = var_8708_cast_fp16)[name = string("hidden_states_221_cast_fp16")];
+            tensor<fp16, [1, 2048, 1, 1]> w_177_to_fp16 = const()[name = string("w_177_to_fp16"), val = tensor<fp16, [1, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(1107665472)))];
+            tensor<fp16, [1, 2048, 1, 1]> obj_181_cast_fp16 = mul(x = w_177_to_fp16, y = hidden_states_221_cast_fp16)[name = string("obj_181_cast_fp16")];
+            string query_133_pad_type_0 = const()[name = string("query_133_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> query_133_strides_0 = const()[name = string("query_133_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> query_133_pad_0 = const()[name = string("query_133_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> query_133_dilations_0 = const()[name = string("query_133_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 query_133_groups_0 = const()[name = string("query_133_groups_0"), val = int32(1)];
+            tensor<fp16, [2048, 2048, 1, 1]> layers_22_self_attn_q_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [2048, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(1107669632))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(1111864000))))[name = string("layers_22_self_attn_q_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1, 1]> query_133_cast_fp16 = conv(bias = layers_0_self_attn_q_proj_bias_to_fp16, dilations = query_133_dilations_0, groups = query_133_groups_0, pad = query_133_pad_0, pad_type = query_133_pad_type_0, strides = query_133_strides_0, weight = layers_22_self_attn_q_proj_weight_to_fp16_palettized, x = obj_181_cast_fp16)[name = string("query_133_cast_fp16")];
+            string current_key_89_pad_type_0 = const()[name = string("current_key_89_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> current_key_89_strides_0 = const()[name = string("current_key_89_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> current_key_89_pad_0 = const()[name = string("current_key_89_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> current_key_89_dilations_0 = const()[name = string("current_key_89_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 current_key_89_groups_0 = const()[name = string("current_key_89_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 2048, 1, 1]> layers_22_self_attn_k_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(1111864576))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(1113961792))))[name = string("layers_22_self_attn_k_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> current_key_89_cast_fp16 = conv(dilations = current_key_89_dilations_0, groups = current_key_89_groups_0, pad = current_key_89_pad_0, pad_type = current_key_89_pad_type_0, strides = current_key_89_strides_0, weight = layers_22_self_attn_k_proj_weight_to_fp16_palettized, x = obj_181_cast_fp16)[name = string("current_key_89_cast_fp16")];
+            string current_value_45_pad_type_0 = const()[name = string("current_value_45_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> current_value_45_strides_0 = const()[name = string("current_value_45_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> current_value_45_pad_0 = const()[name = string("current_value_45_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> current_value_45_dilations_0 = const()[name = string("current_value_45_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 current_value_45_groups_0 = const()[name = string("current_value_45_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 2048, 1, 1]> layers_22_self_attn_v_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(1113962368))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(1116059584))))[name = string("layers_22_self_attn_v_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> current_value_45_cast_fp16 = conv(bias = layers_0_self_attn_v_proj_bias_to_fp16, dilations = current_value_45_dilations_0, groups = current_value_45_groups_0, pad = current_value_45_pad_0, pad_type = current_value_45_pad_type_0, strides = current_value_45_strides_0, weight = layers_22_self_attn_v_proj_weight_to_fp16_palettized, x = obj_181_cast_fp16)[name = string("current_value_45_cast_fp16")];
+            tensor<int32, [4]> var_8745 = const()[name = string("op_8745"), val = tensor<int32, [4]>([16, 128, 1, 1])];
+            tensor<fp16, [16, 128, 1, 1]> inputs_177_cast_fp16 = reshape(shape = var_8745, x = query_133_cast_fp16)[name = string("inputs_177_cast_fp16")];
+            tensor<fp16, [16, 128, 1, 1]> inputs_sq_179_cast_fp16 = mul(x = inputs_177_cast_fp16, y = inputs_177_cast_fp16)[name = string("inputs_sq_179_cast_fp16")];
+            tensor<int32, [1]> variance_179_axes_0 = const()[name = string("variance_179_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_179_keep_dims_0 = const()[name = string("variance_179_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [16, 1, 1, 1]> variance_179_cast_fp16 = reduce_mean(axes = variance_179_axes_0, keep_dims = variance_179_keep_dims_0, x = inputs_sq_179_cast_fp16)[name = string("variance_179_cast_fp16")];
+            fp16 var_8751_to_fp16 = const()[name = string("op_8751_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [16, 1, 1, 1]> var_8752_cast_fp16 = add(x = variance_179_cast_fp16, y = var_8751_to_fp16)[name = string("op_8752_cast_fp16")];
+            fp32 var_8753_epsilon_0 = const()[name = string("op_8753_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [16, 1, 1, 1]> var_8753_cast_fp16 = rsqrt(epsilon = var_8753_epsilon_0, x = var_8752_cast_fp16)[name = string("op_8753_cast_fp16")];
+            tensor<fp16, [16, 128, 1, 1]> hidden_states_223_cast_fp16 = mul(x = inputs_177_cast_fp16, y = var_8753_cast_fp16)[name = string("hidden_states_223_cast_fp16")];
+            tensor<fp16, [1, 128, 1, 1]> w_179_to_fp16 = const()[name = string("w_179_to_fp16"), val = tensor<fp16, [1, 128, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(1116060160)))];
+            tensor<fp16, [16, 128, 1, 1]> query_normed_45_cast_fp16 = mul(x = w_179_to_fp16, y = hidden_states_223_cast_fp16)[name = string("query_normed_45_cast_fp16")];
+            tensor<int32, [4]> var_8761 = const()[name = string("op_8761"), val = tensor<int32, [4]>([8, 128, 1, 1])];
+            tensor<fp16, [8, 128, 1, 1]> inputs_179_cast_fp16 = reshape(shape = var_8761, x = current_key_89_cast_fp16)[name = string("inputs_179_cast_fp16")];
+            tensor<fp16, [8, 128, 1, 1]> inputs_sq_181_cast_fp16 = mul(x = inputs_179_cast_fp16, y = inputs_179_cast_fp16)[name = string("inputs_sq_181_cast_fp16")];
+            tensor<int32, [1]> variance_181_axes_0 = const()[name = string("variance_181_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_181_keep_dims_0 = const()[name = string("variance_181_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [8, 1, 1, 1]> variance_181_cast_fp16 = reduce_mean(axes = variance_181_axes_0, keep_dims = variance_181_keep_dims_0, x = inputs_sq_181_cast_fp16)[name = string("variance_181_cast_fp16")];
+            fp16 var_8767_to_fp16 = const()[name = string("op_8767_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [8, 1, 1, 1]> var_8768_cast_fp16 = add(x = variance_181_cast_fp16, y = var_8767_to_fp16)[name = string("op_8768_cast_fp16")];
+            fp32 var_8769_epsilon_0 = const()[name = string("op_8769_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [8, 1, 1, 1]> var_8769_cast_fp16 = rsqrt(epsilon = var_8769_epsilon_0, x = var_8768_cast_fp16)[name = string("op_8769_cast_fp16")];
+            tensor<fp16, [8, 128, 1, 1]> hidden_states_225_cast_fp16 = mul(x = inputs_179_cast_fp16, y = var_8769_cast_fp16)[name = string("hidden_states_225_cast_fp16")];
+            tensor<fp16, [1, 128, 1, 1]> w_181_to_fp16 = const()[name = string("w_181_to_fp16"), val = tensor<fp16, [1, 128, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(1116060480)))];
+            tensor<fp16, [8, 128, 1, 1]> current_key_normed_45_cast_fp16 = mul(x = w_181_to_fp16, y = hidden_states_225_cast_fp16)[name = string("current_key_normed_45_cast_fp16")];
+            tensor<int32, [4]> var_8787 = const()[name = string("op_8787"), val = tensor<int32, [4]>([1, 16, 128, -1])];
+            tensor<fp16, [1, 16, 128, 1]> mh_q_133_cast_fp16 = reshape(shape = var_8787, x = query_normed_45_cast_fp16)[name = string("mh_q_133_cast_fp16")];
+            tensor<int32, [4]> var_8789 = const()[name = string("op_8789"), val = tensor<int32, [4]>([1, 8, 128, -1])];
+            tensor<fp16, [1, 8, 128, 1]> mh_k_89_cast_fp16 = reshape(shape = var_8789, x = current_key_normed_45_cast_fp16)[name = string("mh_k_89_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> var_8793_cast_fp16 = mul(x = mh_q_133_cast_fp16, y = cos_1_cast_fp16)[name = string("op_8793_cast_fp16")];
+            tensor<int32, [4]> var_8798_begin_0 = const()[name = string("op_8798_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_8798_end_0 = const()[name = string("op_8798_end_0"), val = tensor<int32, [4]>([1, 16, 64, 1])];
+            tensor<bool, [4]> var_8798_end_mask_0 = const()[name = string("op_8798_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 16, 64, 1]> var_8798_cast_fp16 = slice_by_index(begin = var_8798_begin_0, end = var_8798_end_0, end_mask = var_8798_end_mask_0, x = mh_q_133_cast_fp16)[name = string("op_8798_cast_fp16")];
+            tensor<int32, [4]> var_8804_begin_0 = const()[name = string("op_8804_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_8804_end_0 = const()[name = string("op_8804_end_0"), val = tensor<int32, [4]>([1, 16, 128, 1])];
+            tensor<bool, [4]> var_8804_end_mask_0 = const()[name = string("op_8804_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 16, 64, 1]> var_8804_cast_fp16 = slice_by_index(begin = var_8804_begin_0, end = var_8804_end_0, end_mask = var_8804_end_mask_0, x = mh_q_133_cast_fp16)[name = string("op_8804_cast_fp16")];
+            fp16 const_523_promoted_to_fp16 = const()[name = string("const_523_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 16, 64, 1]> var_8806_cast_fp16 = mul(x = var_8804_cast_fp16, y = const_523_promoted_to_fp16)[name = string("op_8806_cast_fp16")];
+            bool var_8808_interleave_0 = const()[name = string("op_8808_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 1]> var_8808_cast_fp16 = concat(axis = var_8686, interleave = var_8808_interleave_0, values = (var_8806_cast_fp16, var_8798_cast_fp16))[name = string("op_8808_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> var_8809_cast_fp16 = mul(x = var_8808_cast_fp16, y = sin_1_cast_fp16)[name = string("op_8809_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> mh_q_135_cast_fp16 = add(x = var_8793_cast_fp16, y = var_8809_cast_fp16)[name = string("mh_q_135_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> var_8811_cast_fp16 = mul(x = mh_k_89_cast_fp16, y = cos_1_cast_fp16)[name = string("op_8811_cast_fp16")];
+            tensor<int32, [4]> var_8816_begin_0 = const()[name = string("op_8816_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_8816_end_0 = const()[name = string("op_8816_end_0"), val = tensor<int32, [4]>([1, 8, 64, 1])];
+            tensor<bool, [4]> var_8816_end_mask_0 = const()[name = string("op_8816_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 8, 64, 1]> var_8816_cast_fp16 = slice_by_index(begin = var_8816_begin_0, end = var_8816_end_0, end_mask = var_8816_end_mask_0, x = mh_k_89_cast_fp16)[name = string("op_8816_cast_fp16")];
+            tensor<int32, [4]> var_8822_begin_0 = const()[name = string("op_8822_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_8822_end_0 = const()[name = string("op_8822_end_0"), val = tensor<int32, [4]>([1, 8, 128, 1])];
+            tensor<bool, [4]> var_8822_end_mask_0 = const()[name = string("op_8822_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 8, 64, 1]> var_8822_cast_fp16 = slice_by_index(begin = var_8822_begin_0, end = var_8822_end_0, end_mask = var_8822_end_mask_0, x = mh_k_89_cast_fp16)[name = string("op_8822_cast_fp16")];
+            fp16 const_526_promoted_to_fp16 = const()[name = string("const_526_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 64, 1]> var_8824_cast_fp16 = mul(x = var_8822_cast_fp16, y = const_526_promoted_to_fp16)[name = string("op_8824_cast_fp16")];
+            bool var_8826_interleave_0 = const()[name = string("op_8826_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 128, 1]> var_8826_cast_fp16 = concat(axis = var_8686, interleave = var_8826_interleave_0, values = (var_8824_cast_fp16, var_8816_cast_fp16))[name = string("op_8826_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> var_8827_cast_fp16 = mul(x = var_8826_cast_fp16, y = sin_1_cast_fp16)[name = string("op_8827_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> mh_k_91_cast_fp16 = add(x = var_8811_cast_fp16, y = var_8827_cast_fp16)[name = string("mh_k_91_cast_fp16")];
+            tensor<int32, [4]> var_8831 = const()[name = string("op_8831"), val = tensor<int32, [4]>([1, 1024, 1, 1])];
+            tensor<fp16, [1, 1024, 1, 1]> current_key_91_cast_fp16 = reshape(shape = var_8831, x = mh_k_91_cast_fp16)[name = string("current_key_91_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_8838_cast_fp16 = mul(x = var_101_cast_fp16_22, y = var_323_cast_fp16)[name = string("op_8838_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_8839_cast_fp16 = mul(x = current_key_91_cast_fp16, y = var_321_cast_fp16)[name = string("op_8839_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> key_135_cast_fp16 = add(x = var_8838_cast_fp16, y = var_8839_cast_fp16)[name = string("key_135_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_8842_cast_fp16 = mul(x = var_132_cast_fp16_22, y = var_323_cast_fp16)[name = string("op_8842_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_8843_cast_fp16 = mul(x = current_value_45_cast_fp16, y = var_321_cast_fp16)[name = string("op_8843_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> value_89_cast_fp16 = add(x = var_8842_cast_fp16, y = var_8843_cast_fp16)[name = string("value_89_cast_fp16")];
+            tensor<int32, [4]> var_8847 = const()[name = string("op_8847"), val = tensor<int32, [4]>([1, 8, 128, 256])];
+            tensor<fp16, [1, 8, 128, 256]> key_heads_89_cast_fp16 = reshape(shape = var_8847, x = key_135_cast_fp16)[name = string("key_heads_89_cast_fp16")];
+            tensor<int32, [4]> var_8849 = const()[name = string("op_8849"), val = tensor<int32, [4]>([1, 8, 128, 256])];
+            tensor<fp16, [1, 8, 128, 256]> value_heads_89_cast_fp16 = reshape(shape = var_8849, x = value_89_cast_fp16)[name = string("value_heads_89_cast_fp16")];
+            tensor<int32, [4]> var_8852_begin_0 = const()[name = string("op_8852_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_8852_end_0 = const()[name = string("op_8852_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_8852_end_mask_0 = const()[name = string("op_8852_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_8852_cast_fp16 = slice_by_index(begin = var_8852_begin_0, end = var_8852_end_0, end_mask = var_8852_end_mask_0, x = key_heads_89_cast_fp16)[name = string("op_8852_cast_fp16")];
+            tensor<int32, [4]> var_8856_begin_0 = const()[name = string("op_8856_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_8856_end_0 = const()[name = string("op_8856_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_8856_end_mask_0 = const()[name = string("op_8856_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_8856_cast_fp16 = slice_by_index(begin = var_8856_begin_0, end = var_8856_end_0, end_mask = var_8856_end_mask_0, x = value_heads_89_cast_fp16)[name = string("op_8856_cast_fp16")];
+            tensor<int32, [4]> var_8868_begin_0 = const()[name = string("op_8868_begin_0"), val = tensor<int32, [4]>([0, 1, 0, 0])];
+            tensor<int32, [4]> var_8868_end_0 = const()[name = string("op_8868_end_0"), val = tensor<int32, [4]>([1, 2, 128, 256])];
+            tensor<bool, [4]> var_8868_end_mask_0 = const()[name = string("op_8868_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_8868_cast_fp16 = slice_by_index(begin = var_8868_begin_0, end = var_8868_end_0, end_mask = var_8868_end_mask_0, x = key_heads_89_cast_fp16)[name = string("op_8868_cast_fp16")];
+            tensor<int32, [4]> var_8872_begin_0 = const()[name = string("op_8872_begin_0"), val = tensor<int32, [4]>([0, 1, 0, 0])];
+            tensor<int32, [4]> var_8872_end_0 = const()[name = string("op_8872_end_0"), val = tensor<int32, [4]>([1, 2, 128, 256])];
+            tensor<bool, [4]> var_8872_end_mask_0 = const()[name = string("op_8872_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_8872_cast_fp16 = slice_by_index(begin = var_8872_begin_0, end = var_8872_end_0, end_mask = var_8872_end_mask_0, x = value_heads_89_cast_fp16)[name = string("op_8872_cast_fp16")];
+            tensor<int32, [4]> var_8884_begin_0 = const()[name = string("op_8884_begin_0"), val = tensor<int32, [4]>([0, 2, 0, 0])];
+            tensor<int32, [4]> var_8884_end_0 = const()[name = string("op_8884_end_0"), val = tensor<int32, [4]>([1, 3, 128, 256])];
+            tensor<bool, [4]> var_8884_end_mask_0 = const()[name = string("op_8884_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_8884_cast_fp16 = slice_by_index(begin = var_8884_begin_0, end = var_8884_end_0, end_mask = var_8884_end_mask_0, x = key_heads_89_cast_fp16)[name = string("op_8884_cast_fp16")];
+            tensor<int32, [4]> var_8888_begin_0 = const()[name = string("op_8888_begin_0"), val = tensor<int32, [4]>([0, 2, 0, 0])];
+            tensor<int32, [4]> var_8888_end_0 = const()[name = string("op_8888_end_0"), val = tensor<int32, [4]>([1, 3, 128, 256])];
+            tensor<bool, [4]> var_8888_end_mask_0 = const()[name = string("op_8888_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_8888_cast_fp16 = slice_by_index(begin = var_8888_begin_0, end = var_8888_end_0, end_mask = var_8888_end_mask_0, x = value_heads_89_cast_fp16)[name = string("op_8888_cast_fp16")];
+            tensor<int32, [4]> var_8900_begin_0 = const()[name = string("op_8900_begin_0"), val = tensor<int32, [4]>([0, 3, 0, 0])];
+            tensor<int32, [4]> var_8900_end_0 = const()[name = string("op_8900_end_0"), val = tensor<int32, [4]>([1, 4, 128, 256])];
+            tensor<bool, [4]> var_8900_end_mask_0 = const()[name = string("op_8900_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_8900_cast_fp16 = slice_by_index(begin = var_8900_begin_0, end = var_8900_end_0, end_mask = var_8900_end_mask_0, x = key_heads_89_cast_fp16)[name = string("op_8900_cast_fp16")];
+            tensor<int32, [4]> var_8904_begin_0 = const()[name = string("op_8904_begin_0"), val = tensor<int32, [4]>([0, 3, 0, 0])];
+            tensor<int32, [4]> var_8904_end_0 = const()[name = string("op_8904_end_0"), val = tensor<int32, [4]>([1, 4, 128, 256])];
+            tensor<bool, [4]> var_8904_end_mask_0 = const()[name = string("op_8904_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_8904_cast_fp16 = slice_by_index(begin = var_8904_begin_0, end = var_8904_end_0, end_mask = var_8904_end_mask_0, x = value_heads_89_cast_fp16)[name = string("op_8904_cast_fp16")];
+            tensor<int32, [4]> var_8916_begin_0 = const()[name = string("op_8916_begin_0"), val = tensor<int32, [4]>([0, 4, 0, 0])];
+            tensor<int32, [4]> var_8916_end_0 = const()[name = string("op_8916_end_0"), val = tensor<int32, [4]>([1, 5, 128, 256])];
+            tensor<bool, [4]> var_8916_end_mask_0 = const()[name = string("op_8916_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_8916_cast_fp16 = slice_by_index(begin = var_8916_begin_0, end = var_8916_end_0, end_mask = var_8916_end_mask_0, x = key_heads_89_cast_fp16)[name = string("op_8916_cast_fp16")];
+            tensor<int32, [4]> var_8920_begin_0 = const()[name = string("op_8920_begin_0"), val = tensor<int32, [4]>([0, 4, 0, 0])];
+            tensor<int32, [4]> var_8920_end_0 = const()[name = string("op_8920_end_0"), val = tensor<int32, [4]>([1, 5, 128, 256])];
+            tensor<bool, [4]> var_8920_end_mask_0 = const()[name = string("op_8920_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_8920_cast_fp16 = slice_by_index(begin = var_8920_begin_0, end = var_8920_end_0, end_mask = var_8920_end_mask_0, x = value_heads_89_cast_fp16)[name = string("op_8920_cast_fp16")];
+            tensor<int32, [4]> var_8932_begin_0 = const()[name = string("op_8932_begin_0"), val = tensor<int32, [4]>([0, 5, 0, 0])];
+            tensor<int32, [4]> var_8932_end_0 = const()[name = string("op_8932_end_0"), val = tensor<int32, [4]>([1, 6, 128, 256])];
+            tensor<bool, [4]> var_8932_end_mask_0 = const()[name = string("op_8932_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_8932_cast_fp16 = slice_by_index(begin = var_8932_begin_0, end = var_8932_end_0, end_mask = var_8932_end_mask_0, x = key_heads_89_cast_fp16)[name = string("op_8932_cast_fp16")];
+            tensor<int32, [4]> var_8936_begin_0 = const()[name = string("op_8936_begin_0"), val = tensor<int32, [4]>([0, 5, 0, 0])];
+            tensor<int32, [4]> var_8936_end_0 = const()[name = string("op_8936_end_0"), val = tensor<int32, [4]>([1, 6, 128, 256])];
+            tensor<bool, [4]> var_8936_end_mask_0 = const()[name = string("op_8936_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_8936_cast_fp16 = slice_by_index(begin = var_8936_begin_0, end = var_8936_end_0, end_mask = var_8936_end_mask_0, x = value_heads_89_cast_fp16)[name = string("op_8936_cast_fp16")];
+            tensor<int32, [4]> var_8948_begin_0 = const()[name = string("op_8948_begin_0"), val = tensor<int32, [4]>([0, 6, 0, 0])];
+            tensor<int32, [4]> var_8948_end_0 = const()[name = string("op_8948_end_0"), val = tensor<int32, [4]>([1, 7, 128, 256])];
+            tensor<bool, [4]> var_8948_end_mask_0 = const()[name = string("op_8948_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_8948_cast_fp16 = slice_by_index(begin = var_8948_begin_0, end = var_8948_end_0, end_mask = var_8948_end_mask_0, x = key_heads_89_cast_fp16)[name = string("op_8948_cast_fp16")];
+            tensor<int32, [4]> var_8952_begin_0 = const()[name = string("op_8952_begin_0"), val = tensor<int32, [4]>([0, 6, 0, 0])];
+            tensor<int32, [4]> var_8952_end_0 = const()[name = string("op_8952_end_0"), val = tensor<int32, [4]>([1, 7, 128, 256])];
+            tensor<bool, [4]> var_8952_end_mask_0 = const()[name = string("op_8952_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_8952_cast_fp16 = slice_by_index(begin = var_8952_begin_0, end = var_8952_end_0, end_mask = var_8952_end_mask_0, x = value_heads_89_cast_fp16)[name = string("op_8952_cast_fp16")];
+            tensor<int32, [4]> var_8964_begin_0 = const()[name = string("op_8964_begin_0"), val = tensor<int32, [4]>([0, 7, 0, 0])];
+            tensor<int32, [4]> var_8964_end_0 = const()[name = string("op_8964_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_8964_end_mask_0 = const()[name = string("op_8964_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_8964_cast_fp16 = slice_by_index(begin = var_8964_begin_0, end = var_8964_end_0, end_mask = var_8964_end_mask_0, x = key_heads_89_cast_fp16)[name = string("op_8964_cast_fp16")];
+            tensor<int32, [4]> var_8968_begin_0 = const()[name = string("op_8968_begin_0"), val = tensor<int32, [4]>([0, 7, 0, 0])];
+            tensor<int32, [4]> var_8968_end_0 = const()[name = string("op_8968_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_8968_end_mask_0 = const()[name = string("op_8968_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_8968_cast_fp16 = slice_by_index(begin = var_8968_begin_0, end = var_8968_end_0, end_mask = var_8968_end_mask_0, x = value_heads_89_cast_fp16)[name = string("op_8968_cast_fp16")];
+            bool key_heads_91_interleave_0 = const()[name = string("key_heads_91_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 256]> key_heads_91_cast_fp16 = concat(axis = var_8694, interleave = key_heads_91_interleave_0, values = (var_8852_cast_fp16, var_8852_cast_fp16, var_8868_cast_fp16, var_8868_cast_fp16, var_8884_cast_fp16, var_8884_cast_fp16, var_8900_cast_fp16, var_8900_cast_fp16, var_8916_cast_fp16, var_8916_cast_fp16, var_8932_cast_fp16, var_8932_cast_fp16, var_8948_cast_fp16, var_8948_cast_fp16, var_8964_cast_fp16, var_8964_cast_fp16))[name = string("key_heads_91_cast_fp16")];
+            bool value_heads_91_interleave_0 = const()[name = string("value_heads_91_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 256]> value_heads_91_cast_fp16 = concat(axis = var_8694, interleave = value_heads_91_interleave_0, values = (var_8856_cast_fp16, var_8856_cast_fp16, var_8872_cast_fp16, var_8872_cast_fp16, var_8888_cast_fp16, var_8888_cast_fp16, var_8904_cast_fp16, var_8904_cast_fp16, var_8920_cast_fp16, var_8920_cast_fp16, var_8936_cast_fp16, var_8936_cast_fp16, var_8952_cast_fp16, var_8952_cast_fp16, var_8968_cast_fp16, var_8968_cast_fp16))[name = string("value_heads_91_cast_fp16")];
+            fp16 var_8991_to_fp16 = const()[name = string("op_8991_to_fp16"), val = fp16(0x1.6ap-4)];
+            tensor<fp16, [1, 16, 128, 1]> var_8992_cast_fp16 = mul(x = mh_q_135_cast_fp16, y = var_8991_to_fp16)[name = string("op_8992_cast_fp16")];
+            bool mh_w_89_transpose_x_0 = const()[name = string("mh_w_89_transpose_x_0"), val = bool(true)];
+            bool mh_w_89_transpose_y_0 = const()[name = string("mh_w_89_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 1, 256]> mh_w_89_cast_fp16 = matmul(transpose_x = mh_w_89_transpose_x_0, transpose_y = mh_w_89_transpose_y_0, x = var_8992_cast_fp16, y = key_heads_91_cast_fp16)[name = string("mh_w_89_cast_fp16")];
+            tensor<fp16, [1, 16, 1, 256]> mh_w_91_cast_fp16 = add(x = mh_w_89_cast_fp16, y = var_487_cast_fp16)[name = string("mh_w_91_cast_fp16")];
+            tensor<fp16, [1, 16, 1, 256]> var_9004_cast_fp16 = softmax(axis = var_8676, x = mh_w_91_cast_fp16)[name = string("op_9004_cast_fp16")];
+            bool attn_45_transpose_x_0 = const()[name = string("attn_45_transpose_x_0"), val = bool(false)];
+            bool attn_45_transpose_y_0 = const()[name = string("attn_45_transpose_y_0"), val = bool(true)];
+            tensor<fp16, [1, 16, 128, 1]> attn_45_cast_fp16 = matmul(transpose_x = attn_45_transpose_x_0, transpose_y = attn_45_transpose_y_0, x = value_heads_91_cast_fp16, y = var_9004_cast_fp16)[name = string("attn_45_cast_fp16")];
+            tensor<int32, [4]> var_9009 = const()[name = string("op_9009"), val = tensor<int32, [4]>([1, -1, 1, 1])];
+            tensor<fp16, [1, 2048, 1, 1]> input_177_cast_fp16 = reshape(shape = var_9009, x = attn_45_cast_fp16)[name = string("input_177_cast_fp16")];
+            string obj_187_pad_type_0 = const()[name = string("obj_187_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> obj_187_strides_0 = const()[name = string("obj_187_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> obj_187_pad_0 = const()[name = string("obj_187_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> obj_187_dilations_0 = const()[name = string("obj_187_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 obj_187_groups_0 = const()[name = string("obj_187_groups_0"), val = int32(1)];
+            tensor<fp16, [2048, 2048, 1, 1]> layers_22_self_attn_o_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [2048, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(1116060800))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(1120255168))))[name = string("layers_22_self_attn_o_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1, 1]> obj_187_cast_fp16 = conv(bias = layers_0_self_attn_q_proj_bias_to_fp16, dilations = obj_187_dilations_0, groups = obj_187_groups_0, pad = obj_187_pad_0, pad_type = obj_187_pad_type_0, strides = obj_187_strides_0, weight = layers_22_self_attn_o_proj_weight_to_fp16_palettized, x = input_177_cast_fp16)[name = string("obj_187_cast_fp16")];
+            tensor<fp16, [1, 2048, 1, 1]> inputs_181_cast_fp16 = add(x = inputs_175_cast_fp16, y = obj_187_cast_fp16)[name = string("inputs_181_cast_fp16")];
+            tensor<fp16, [1, 2048, 1, 1]> inputs_sq_183_cast_fp16 = mul(x = inputs_181_cast_fp16, y = inputs_181_cast_fp16)[name = string("inputs_sq_183_cast_fp16")];
+            tensor<int32, [1]> variance_183_axes_0 = const()[name = string("variance_183_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_183_keep_dims_0 = const()[name = string("variance_183_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 1, 1, 1]> variance_183_cast_fp16 = reduce_mean(axes = variance_183_axes_0, keep_dims = variance_183_keep_dims_0, x = inputs_sq_183_cast_fp16)[name = string("variance_183_cast_fp16")];
+            fp16 var_9027_to_fp16 = const()[name = string("op_9027_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 1, 1]> var_9028_cast_fp16 = add(x = variance_183_cast_fp16, y = var_9027_to_fp16)[name = string("op_9028_cast_fp16")];
+            fp32 var_9029_epsilon_0 = const()[name = string("op_9029_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 1, 1, 1]> var_9029_cast_fp16 = rsqrt(epsilon = var_9029_epsilon_0, x = var_9028_cast_fp16)[name = string("op_9029_cast_fp16")];
+            tensor<fp16, [1, 2048, 1, 1]> hidden_states_227_cast_fp16 = mul(x = inputs_181_cast_fp16, y = var_9029_cast_fp16)[name = string("hidden_states_227_cast_fp16")];
+            tensor<fp16, [1, 2048, 1, 1]> w_183_to_fp16 = const()[name = string("w_183_to_fp16"), val = tensor<fp16, [1, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(1120255744)))];
+            tensor<fp16, [1, 2048, 1, 1]> input_179_cast_fp16 = mul(x = w_183_to_fp16, y = hidden_states_227_cast_fp16)[name = string("input_179_cast_fp16")];
+            string input_181_pad_type_0 = const()[name = string("input_181_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> input_181_strides_0 = const()[name = string("input_181_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> input_181_pad_0 = const()[name = string("input_181_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> input_181_dilations_0 = const()[name = string("input_181_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 input_181_groups_0 = const()[name = string("input_181_groups_0"), val = int32(1)];
+            tensor<fp16, [6144, 2048, 1, 1]> layers_22_mlp_gate_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [6144, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(1120259904))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(1132842880))))[name = string("layers_22_mlp_gate_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 6144, 1, 1]> input_181_cast_fp16 = conv(dilations = input_181_dilations_0, groups = input_181_groups_0, pad = input_181_pad_0, pad_type = input_181_pad_type_0, strides = input_181_strides_0, weight = layers_22_mlp_gate_proj_weight_to_fp16_palettized, x = input_179_cast_fp16)[name = string("input_181_cast_fp16")];
+            tensor<fp16, [1, 6144, 1, 1]> var_9043_cast_fp16 = silu(x = input_181_cast_fp16)[name = string("op_9043_cast_fp16")];
+            string var_9049_pad_type_0 = const()[name = string("op_9049_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_9049_strides_0 = const()[name = string("op_9049_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_9049_pad_0 = const()[name = string("op_9049_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_9049_dilations_0 = const()[name = string("op_9049_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_9049_groups_0 = const()[name = string("op_9049_groups_0"), val = int32(1)];
+            tensor<fp16, [6144, 2048, 1, 1]> layers_22_mlp_up_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [6144, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(1132843456))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(1145426432))))[name = string("layers_22_mlp_up_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 6144, 1, 1]> var_9049_cast_fp16 = conv(dilations = var_9049_dilations_0, groups = var_9049_groups_0, pad = var_9049_pad_0, pad_type = var_9049_pad_type_0, strides = var_9049_strides_0, weight = layers_22_mlp_up_proj_weight_to_fp16_palettized, x = input_179_cast_fp16)[name = string("op_9049_cast_fp16")];
+            tensor<fp16, [1, 6144, 1, 1]> input_183_cast_fp16 = mul(x = var_9043_cast_fp16, y = var_9049_cast_fp16)[name = string("input_183_cast_fp16")];
+            string hidden_states_229_pad_type_0 = const()[name = string("hidden_states_229_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> hidden_states_229_strides_0 = const()[name = string("hidden_states_229_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> hidden_states_229_pad_0 = const()[name = string("hidden_states_229_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> hidden_states_229_dilations_0 = const()[name = string("hidden_states_229_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 hidden_states_229_groups_0 = const()[name = string("hidden_states_229_groups_0"), val = int32(1)];
+            tensor<fp16, [2048, 6144, 1, 1]> layers_22_mlp_down_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [2048, 6144, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(1145427008))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(1158009984))))[name = string("layers_22_mlp_down_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1, 1]> hidden_states_229_cast_fp16 = conv(dilations = hidden_states_229_dilations_0, groups = hidden_states_229_groups_0, pad = hidden_states_229_pad_0, pad_type = hidden_states_229_pad_type_0, strides = hidden_states_229_strides_0, weight = layers_22_mlp_down_proj_weight_to_fp16_palettized, x = input_183_cast_fp16)[name = string("hidden_states_229_cast_fp16")];
+            tensor<fp16, [1, 2048, 1, 1]> inputs_183_cast_fp16 = add(x = inputs_181_cast_fp16, y = hidden_states_229_cast_fp16)[name = string("inputs_183_cast_fp16")];
+            int32 var_9063 = const()[name = string("op_9063"), val = int32(3)];
+            int32 var_9073 = const()[name = string("op_9073"), val = int32(-2)];
+            int32 var_9081 = const()[name = string("op_9081"), val = int32(1)];
+            tensor<fp16, [1, 2048, 1, 1]> inputs_sq_185_cast_fp16 = mul(x = inputs_183_cast_fp16, y = inputs_183_cast_fp16)[name = string("inputs_sq_185_cast_fp16")];
+            tensor<int32, [1]> variance_185_axes_0 = const()[name = string("variance_185_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_185_keep_dims_0 = const()[name = string("variance_185_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 1, 1, 1]> variance_185_cast_fp16 = reduce_mean(axes = variance_185_axes_0, keep_dims = variance_185_keep_dims_0, x = inputs_sq_185_cast_fp16)[name = string("variance_185_cast_fp16")];
+            fp16 var_9093_to_fp16 = const()[name = string("op_9093_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 1, 1]> var_9094_cast_fp16 = add(x = variance_185_cast_fp16, y = var_9093_to_fp16)[name = string("op_9094_cast_fp16")];
+            fp32 var_9095_epsilon_0 = const()[name = string("op_9095_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 1, 1, 1]> var_9095_cast_fp16 = rsqrt(epsilon = var_9095_epsilon_0, x = var_9094_cast_fp16)[name = string("op_9095_cast_fp16")];
+            tensor<fp16, [1, 2048, 1, 1]> hidden_states_231_cast_fp16 = mul(x = inputs_183_cast_fp16, y = var_9095_cast_fp16)[name = string("hidden_states_231_cast_fp16")];
+            tensor<fp16, [1, 2048, 1, 1]> w_185_to_fp16 = const()[name = string("w_185_to_fp16"), val = tensor<fp16, [1, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(1158010560)))];
+            tensor<fp16, [1, 2048, 1, 1]> obj_189_cast_fp16 = mul(x = w_185_to_fp16, y = hidden_states_231_cast_fp16)[name = string("obj_189_cast_fp16")];
+            string query_139_pad_type_0 = const()[name = string("query_139_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> query_139_strides_0 = const()[name = string("query_139_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> query_139_pad_0 = const()[name = string("query_139_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> query_139_dilations_0 = const()[name = string("query_139_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 query_139_groups_0 = const()[name = string("query_139_groups_0"), val = int32(1)];
+            tensor<fp16, [2048, 2048, 1, 1]> layers_23_self_attn_q_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [2048, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(1158014720))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(1162209088))))[name = string("layers_23_self_attn_q_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1, 1]> query_139_cast_fp16 = conv(bias = layers_0_self_attn_q_proj_bias_to_fp16, dilations = query_139_dilations_0, groups = query_139_groups_0, pad = query_139_pad_0, pad_type = query_139_pad_type_0, strides = query_139_strides_0, weight = layers_23_self_attn_q_proj_weight_to_fp16_palettized, x = obj_189_cast_fp16)[name = string("query_139_cast_fp16")];
+            string current_key_93_pad_type_0 = const()[name = string("current_key_93_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> current_key_93_strides_0 = const()[name = string("current_key_93_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> current_key_93_pad_0 = const()[name = string("current_key_93_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> current_key_93_dilations_0 = const()[name = string("current_key_93_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 current_key_93_groups_0 = const()[name = string("current_key_93_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 2048, 1, 1]> layers_23_self_attn_k_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(1162209664))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(1164306880))))[name = string("layers_23_self_attn_k_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> current_key_93_cast_fp16 = conv(dilations = current_key_93_dilations_0, groups = current_key_93_groups_0, pad = current_key_93_pad_0, pad_type = current_key_93_pad_type_0, strides = current_key_93_strides_0, weight = layers_23_self_attn_k_proj_weight_to_fp16_palettized, x = obj_189_cast_fp16)[name = string("current_key_93_cast_fp16")];
+            string current_value_47_pad_type_0 = const()[name = string("current_value_47_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> current_value_47_strides_0 = const()[name = string("current_value_47_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> current_value_47_pad_0 = const()[name = string("current_value_47_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> current_value_47_dilations_0 = const()[name = string("current_value_47_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 current_value_47_groups_0 = const()[name = string("current_value_47_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 2048, 1, 1]> layers_23_self_attn_v_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(1164307456))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(1166404672))))[name = string("layers_23_self_attn_v_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> current_value_47_cast_fp16 = conv(bias = layers_0_self_attn_v_proj_bias_to_fp16, dilations = current_value_47_dilations_0, groups = current_value_47_groups_0, pad = current_value_47_pad_0, pad_type = current_value_47_pad_type_0, strides = current_value_47_strides_0, weight = layers_23_self_attn_v_proj_weight_to_fp16_palettized, x = obj_189_cast_fp16)[name = string("current_value_47_cast_fp16")];
+            tensor<int32, [4]> var_9132 = const()[name = string("op_9132"), val = tensor<int32, [4]>([16, 128, 1, 1])];
+            tensor<fp16, [16, 128, 1, 1]> inputs_185_cast_fp16 = reshape(shape = var_9132, x = query_139_cast_fp16)[name = string("inputs_185_cast_fp16")];
+            tensor<fp16, [16, 128, 1, 1]> inputs_sq_187_cast_fp16 = mul(x = inputs_185_cast_fp16, y = inputs_185_cast_fp16)[name = string("inputs_sq_187_cast_fp16")];
+            tensor<int32, [1]> variance_187_axes_0 = const()[name = string("variance_187_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_187_keep_dims_0 = const()[name = string("variance_187_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [16, 1, 1, 1]> variance_187_cast_fp16 = reduce_mean(axes = variance_187_axes_0, keep_dims = variance_187_keep_dims_0, x = inputs_sq_187_cast_fp16)[name = string("variance_187_cast_fp16")];
+            fp16 var_9138_to_fp16 = const()[name = string("op_9138_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [16, 1, 1, 1]> var_9139_cast_fp16 = add(x = variance_187_cast_fp16, y = var_9138_to_fp16)[name = string("op_9139_cast_fp16")];
+            fp32 var_9140_epsilon_0 = const()[name = string("op_9140_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [16, 1, 1, 1]> var_9140_cast_fp16 = rsqrt(epsilon = var_9140_epsilon_0, x = var_9139_cast_fp16)[name = string("op_9140_cast_fp16")];
+            tensor<fp16, [16, 128, 1, 1]> hidden_states_233_cast_fp16 = mul(x = inputs_185_cast_fp16, y = var_9140_cast_fp16)[name = string("hidden_states_233_cast_fp16")];
+            tensor<fp16, [1, 128, 1, 1]> w_187_to_fp16 = const()[name = string("w_187_to_fp16"), val = tensor<fp16, [1, 128, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(1166405248)))];
+            tensor<fp16, [16, 128, 1, 1]> query_normed_47_cast_fp16 = mul(x = w_187_to_fp16, y = hidden_states_233_cast_fp16)[name = string("query_normed_47_cast_fp16")];
+            tensor<int32, [4]> var_9148 = const()[name = string("op_9148"), val = tensor<int32, [4]>([8, 128, 1, 1])];
+            tensor<fp16, [8, 128, 1, 1]> inputs_187_cast_fp16 = reshape(shape = var_9148, x = current_key_93_cast_fp16)[name = string("inputs_187_cast_fp16")];
+            tensor<fp16, [8, 128, 1, 1]> inputs_sq_189_cast_fp16 = mul(x = inputs_187_cast_fp16, y = inputs_187_cast_fp16)[name = string("inputs_sq_189_cast_fp16")];
+            tensor<int32, [1]> variance_189_axes_0 = const()[name = string("variance_189_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_189_keep_dims_0 = const()[name = string("variance_189_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [8, 1, 1, 1]> variance_189_cast_fp16 = reduce_mean(axes = variance_189_axes_0, keep_dims = variance_189_keep_dims_0, x = inputs_sq_189_cast_fp16)[name = string("variance_189_cast_fp16")];
+            fp16 var_9154_to_fp16 = const()[name = string("op_9154_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [8, 1, 1, 1]> var_9155_cast_fp16 = add(x = variance_189_cast_fp16, y = var_9154_to_fp16)[name = string("op_9155_cast_fp16")];
+            fp32 var_9156_epsilon_0 = const()[name = string("op_9156_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [8, 1, 1, 1]> var_9156_cast_fp16 = rsqrt(epsilon = var_9156_epsilon_0, x = var_9155_cast_fp16)[name = string("op_9156_cast_fp16")];
+            tensor<fp16, [8, 128, 1, 1]> hidden_states_235_cast_fp16 = mul(x = inputs_187_cast_fp16, y = var_9156_cast_fp16)[name = string("hidden_states_235_cast_fp16")];
+            tensor<fp16, [1, 128, 1, 1]> w_189_to_fp16 = const()[name = string("w_189_to_fp16"), val = tensor<fp16, [1, 128, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(1166405568)))];
+            tensor<fp16, [8, 128, 1, 1]> current_key_normed_47_cast_fp16 = mul(x = w_189_to_fp16, y = hidden_states_235_cast_fp16)[name = string("current_key_normed_47_cast_fp16")];
+            tensor<int32, [4]> var_9174 = const()[name = string("op_9174"), val = tensor<int32, [4]>([1, 16, 128, -1])];
+            tensor<fp16, [1, 16, 128, 1]> mh_q_139_cast_fp16 = reshape(shape = var_9174, x = query_normed_47_cast_fp16)[name = string("mh_q_139_cast_fp16")];
+            tensor<int32, [4]> var_9176 = const()[name = string("op_9176"), val = tensor<int32, [4]>([1, 8, 128, -1])];
+            tensor<fp16, [1, 8, 128, 1]> mh_k_93_cast_fp16 = reshape(shape = var_9176, x = current_key_normed_47_cast_fp16)[name = string("mh_k_93_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> var_9180_cast_fp16 = mul(x = mh_q_139_cast_fp16, y = cos_1_cast_fp16)[name = string("op_9180_cast_fp16")];
+            tensor<int32, [4]> var_9185_begin_0 = const()[name = string("op_9185_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_9185_end_0 = const()[name = string("op_9185_end_0"), val = tensor<int32, [4]>([1, 16, 64, 1])];
+            tensor<bool, [4]> var_9185_end_mask_0 = const()[name = string("op_9185_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 16, 64, 1]> var_9185_cast_fp16 = slice_by_index(begin = var_9185_begin_0, end = var_9185_end_0, end_mask = var_9185_end_mask_0, x = mh_q_139_cast_fp16)[name = string("op_9185_cast_fp16")];
+            tensor<int32, [4]> var_9191_begin_0 = const()[name = string("op_9191_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_9191_end_0 = const()[name = string("op_9191_end_0"), val = tensor<int32, [4]>([1, 16, 128, 1])];
+            tensor<bool, [4]> var_9191_end_mask_0 = const()[name = string("op_9191_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 16, 64, 1]> var_9191_cast_fp16 = slice_by_index(begin = var_9191_begin_0, end = var_9191_end_0, end_mask = var_9191_end_mask_0, x = mh_q_139_cast_fp16)[name = string("op_9191_cast_fp16")];
+            fp16 const_546_promoted_to_fp16 = const()[name = string("const_546_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 16, 64, 1]> var_9193_cast_fp16 = mul(x = var_9191_cast_fp16, y = const_546_promoted_to_fp16)[name = string("op_9193_cast_fp16")];
+            bool var_9195_interleave_0 = const()[name = string("op_9195_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 1]> var_9195_cast_fp16 = concat(axis = var_9073, interleave = var_9195_interleave_0, values = (var_9193_cast_fp16, var_9185_cast_fp16))[name = string("op_9195_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> var_9196_cast_fp16 = mul(x = var_9195_cast_fp16, y = sin_1_cast_fp16)[name = string("op_9196_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> mh_q_141_cast_fp16 = add(x = var_9180_cast_fp16, y = var_9196_cast_fp16)[name = string("mh_q_141_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> var_9198_cast_fp16 = mul(x = mh_k_93_cast_fp16, y = cos_1_cast_fp16)[name = string("op_9198_cast_fp16")];
+            tensor<int32, [4]> var_9203_begin_0 = const()[name = string("op_9203_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_9203_end_0 = const()[name = string("op_9203_end_0"), val = tensor<int32, [4]>([1, 8, 64, 1])];
+            tensor<bool, [4]> var_9203_end_mask_0 = const()[name = string("op_9203_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 8, 64, 1]> var_9203_cast_fp16 = slice_by_index(begin = var_9203_begin_0, end = var_9203_end_0, end_mask = var_9203_end_mask_0, x = mh_k_93_cast_fp16)[name = string("op_9203_cast_fp16")];
+            tensor<int32, [4]> var_9209_begin_0 = const()[name = string("op_9209_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_9209_end_0 = const()[name = string("op_9209_end_0"), val = tensor<int32, [4]>([1, 8, 128, 1])];
+            tensor<bool, [4]> var_9209_end_mask_0 = const()[name = string("op_9209_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 8, 64, 1]> var_9209_cast_fp16 = slice_by_index(begin = var_9209_begin_0, end = var_9209_end_0, end_mask = var_9209_end_mask_0, x = mh_k_93_cast_fp16)[name = string("op_9209_cast_fp16")];
+            fp16 const_549_promoted_to_fp16 = const()[name = string("const_549_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 64, 1]> var_9211_cast_fp16 = mul(x = var_9209_cast_fp16, y = const_549_promoted_to_fp16)[name = string("op_9211_cast_fp16")];
+            bool var_9213_interleave_0 = const()[name = string("op_9213_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 128, 1]> var_9213_cast_fp16 = concat(axis = var_9073, interleave = var_9213_interleave_0, values = (var_9211_cast_fp16, var_9203_cast_fp16))[name = string("op_9213_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> var_9214_cast_fp16 = mul(x = var_9213_cast_fp16, y = sin_1_cast_fp16)[name = string("op_9214_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> mh_k_95_cast_fp16 = add(x = var_9198_cast_fp16, y = var_9214_cast_fp16)[name = string("mh_k_95_cast_fp16")];
+            tensor<int32, [4]> var_9218 = const()[name = string("op_9218"), val = tensor<int32, [4]>([1, 1024, 1, 1])];
+            tensor<fp16, [1, 1024, 1, 1]> current_key_95_cast_fp16 = reshape(shape = var_9218, x = mh_k_95_cast_fp16)[name = string("current_key_95_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_9225_cast_fp16 = mul(x = var_101_cast_fp16_23, y = var_323_cast_fp16)[name = string("op_9225_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_9226_cast_fp16 = mul(x = current_key_95_cast_fp16, y = var_321_cast_fp16)[name = string("op_9226_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> key_141_cast_fp16 = add(x = var_9225_cast_fp16, y = var_9226_cast_fp16)[name = string("key_141_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_9229_cast_fp16 = mul(x = var_132_cast_fp16_23, y = var_323_cast_fp16)[name = string("op_9229_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_9230_cast_fp16 = mul(x = current_value_47_cast_fp16, y = var_321_cast_fp16)[name = string("op_9230_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> value_93_cast_fp16 = add(x = var_9229_cast_fp16, y = var_9230_cast_fp16)[name = string("value_93_cast_fp16")];
+            tensor<int32, [4]> var_9234 = const()[name = string("op_9234"), val = tensor<int32, [4]>([1, 8, 128, 256])];
+            tensor<fp16, [1, 8, 128, 256]> key_heads_93_cast_fp16 = reshape(shape = var_9234, x = key_141_cast_fp16)[name = string("key_heads_93_cast_fp16")];
+            tensor<int32, [4]> var_9236 = const()[name = string("op_9236"), val = tensor<int32, [4]>([1, 8, 128, 256])];
+            tensor<fp16, [1, 8, 128, 256]> value_heads_93_cast_fp16 = reshape(shape = var_9236, x = value_93_cast_fp16)[name = string("value_heads_93_cast_fp16")];
+            tensor<int32, [4]> var_9239_begin_0 = const()[name = string("op_9239_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_9239_end_0 = const()[name = string("op_9239_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_9239_end_mask_0 = const()[name = string("op_9239_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_9239_cast_fp16 = slice_by_index(begin = var_9239_begin_0, end = var_9239_end_0, end_mask = var_9239_end_mask_0, x = key_heads_93_cast_fp16)[name = string("op_9239_cast_fp16")];
+            tensor<int32, [4]> var_9243_begin_0 = const()[name = string("op_9243_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_9243_end_0 = const()[name = string("op_9243_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_9243_end_mask_0 = const()[name = string("op_9243_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_9243_cast_fp16 = slice_by_index(begin = var_9243_begin_0, end = var_9243_end_0, end_mask = var_9243_end_mask_0, x = value_heads_93_cast_fp16)[name = string("op_9243_cast_fp16")];
+            tensor<int32, [4]> var_9255_begin_0 = const()[name = string("op_9255_begin_0"), val = tensor<int32, [4]>([0, 1, 0, 0])];
+            tensor<int32, [4]> var_9255_end_0 = const()[name = string("op_9255_end_0"), val = tensor<int32, [4]>([1, 2, 128, 256])];
+            tensor<bool, [4]> var_9255_end_mask_0 = const()[name = string("op_9255_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_9255_cast_fp16 = slice_by_index(begin = var_9255_begin_0, end = var_9255_end_0, end_mask = var_9255_end_mask_0, x = key_heads_93_cast_fp16)[name = string("op_9255_cast_fp16")];
+            tensor<int32, [4]> var_9259_begin_0 = const()[name = string("op_9259_begin_0"), val = tensor<int32, [4]>([0, 1, 0, 0])];
+            tensor<int32, [4]> var_9259_end_0 = const()[name = string("op_9259_end_0"), val = tensor<int32, [4]>([1, 2, 128, 256])];
+            tensor<bool, [4]> var_9259_end_mask_0 = const()[name = string("op_9259_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_9259_cast_fp16 = slice_by_index(begin = var_9259_begin_0, end = var_9259_end_0, end_mask = var_9259_end_mask_0, x = value_heads_93_cast_fp16)[name = string("op_9259_cast_fp16")];
+            tensor<int32, [4]> var_9271_begin_0 = const()[name = string("op_9271_begin_0"), val = tensor<int32, [4]>([0, 2, 0, 0])];
+            tensor<int32, [4]> var_9271_end_0 = const()[name = string("op_9271_end_0"), val = tensor<int32, [4]>([1, 3, 128, 256])];
+            tensor<bool, [4]> var_9271_end_mask_0 = const()[name = string("op_9271_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_9271_cast_fp16 = slice_by_index(begin = var_9271_begin_0, end = var_9271_end_0, end_mask = var_9271_end_mask_0, x = key_heads_93_cast_fp16)[name = string("op_9271_cast_fp16")];
+            tensor<int32, [4]> var_9275_begin_0 = const()[name = string("op_9275_begin_0"), val = tensor<int32, [4]>([0, 2, 0, 0])];
+            tensor<int32, [4]> var_9275_end_0 = const()[name = string("op_9275_end_0"), val = tensor<int32, [4]>([1, 3, 128, 256])];
+            tensor<bool, [4]> var_9275_end_mask_0 = const()[name = string("op_9275_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_9275_cast_fp16 = slice_by_index(begin = var_9275_begin_0, end = var_9275_end_0, end_mask = var_9275_end_mask_0, x = value_heads_93_cast_fp16)[name = string("op_9275_cast_fp16")];
+            tensor<int32, [4]> var_9287_begin_0 = const()[name = string("op_9287_begin_0"), val = tensor<int32, [4]>([0, 3, 0, 0])];
+            tensor<int32, [4]> var_9287_end_0 = const()[name = string("op_9287_end_0"), val = tensor<int32, [4]>([1, 4, 128, 256])];
+            tensor<bool, [4]> var_9287_end_mask_0 = const()[name = string("op_9287_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_9287_cast_fp16 = slice_by_index(begin = var_9287_begin_0, end = var_9287_end_0, end_mask = var_9287_end_mask_0, x = key_heads_93_cast_fp16)[name = string("op_9287_cast_fp16")];
+            tensor<int32, [4]> var_9291_begin_0 = const()[name = string("op_9291_begin_0"), val = tensor<int32, [4]>([0, 3, 0, 0])];
+            tensor<int32, [4]> var_9291_end_0 = const()[name = string("op_9291_end_0"), val = tensor<int32, [4]>([1, 4, 128, 256])];
+            tensor<bool, [4]> var_9291_end_mask_0 = const()[name = string("op_9291_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_9291_cast_fp16 = slice_by_index(begin = var_9291_begin_0, end = var_9291_end_0, end_mask = var_9291_end_mask_0, x = value_heads_93_cast_fp16)[name = string("op_9291_cast_fp16")];
+            tensor<int32, [4]> var_9303_begin_0 = const()[name = string("op_9303_begin_0"), val = tensor<int32, [4]>([0, 4, 0, 0])];
+            tensor<int32, [4]> var_9303_end_0 = const()[name = string("op_9303_end_0"), val = tensor<int32, [4]>([1, 5, 128, 256])];
+            tensor<bool, [4]> var_9303_end_mask_0 = const()[name = string("op_9303_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_9303_cast_fp16 = slice_by_index(begin = var_9303_begin_0, end = var_9303_end_0, end_mask = var_9303_end_mask_0, x = key_heads_93_cast_fp16)[name = string("op_9303_cast_fp16")];
+            tensor<int32, [4]> var_9307_begin_0 = const()[name = string("op_9307_begin_0"), val = tensor<int32, [4]>([0, 4, 0, 0])];
+            tensor<int32, [4]> var_9307_end_0 = const()[name = string("op_9307_end_0"), val = tensor<int32, [4]>([1, 5, 128, 256])];
+            tensor<bool, [4]> var_9307_end_mask_0 = const()[name = string("op_9307_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_9307_cast_fp16 = slice_by_index(begin = var_9307_begin_0, end = var_9307_end_0, end_mask = var_9307_end_mask_0, x = value_heads_93_cast_fp16)[name = string("op_9307_cast_fp16")];
+            tensor<int32, [4]> var_9319_begin_0 = const()[name = string("op_9319_begin_0"), val = tensor<int32, [4]>([0, 5, 0, 0])];
+            tensor<int32, [4]> var_9319_end_0 = const()[name = string("op_9319_end_0"), val = tensor<int32, [4]>([1, 6, 128, 256])];
+            tensor<bool, [4]> var_9319_end_mask_0 = const()[name = string("op_9319_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_9319_cast_fp16 = slice_by_index(begin = var_9319_begin_0, end = var_9319_end_0, end_mask = var_9319_end_mask_0, x = key_heads_93_cast_fp16)[name = string("op_9319_cast_fp16")];
+            tensor<int32, [4]> var_9323_begin_0 = const()[name = string("op_9323_begin_0"), val = tensor<int32, [4]>([0, 5, 0, 0])];
+            tensor<int32, [4]> var_9323_end_0 = const()[name = string("op_9323_end_0"), val = tensor<int32, [4]>([1, 6, 128, 256])];
+            tensor<bool, [4]> var_9323_end_mask_0 = const()[name = string("op_9323_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_9323_cast_fp16 = slice_by_index(begin = var_9323_begin_0, end = var_9323_end_0, end_mask = var_9323_end_mask_0, x = value_heads_93_cast_fp16)[name = string("op_9323_cast_fp16")];
+            tensor<int32, [4]> var_9335_begin_0 = const()[name = string("op_9335_begin_0"), val = tensor<int32, [4]>([0, 6, 0, 0])];
+            tensor<int32, [4]> var_9335_end_0 = const()[name = string("op_9335_end_0"), val = tensor<int32, [4]>([1, 7, 128, 256])];
+            tensor<bool, [4]> var_9335_end_mask_0 = const()[name = string("op_9335_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_9335_cast_fp16 = slice_by_index(begin = var_9335_begin_0, end = var_9335_end_0, end_mask = var_9335_end_mask_0, x = key_heads_93_cast_fp16)[name = string("op_9335_cast_fp16")];
+            tensor<int32, [4]> var_9339_begin_0 = const()[name = string("op_9339_begin_0"), val = tensor<int32, [4]>([0, 6, 0, 0])];
+            tensor<int32, [4]> var_9339_end_0 = const()[name = string("op_9339_end_0"), val = tensor<int32, [4]>([1, 7, 128, 256])];
+            tensor<bool, [4]> var_9339_end_mask_0 = const()[name = string("op_9339_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_9339_cast_fp16 = slice_by_index(begin = var_9339_begin_0, end = var_9339_end_0, end_mask = var_9339_end_mask_0, x = value_heads_93_cast_fp16)[name = string("op_9339_cast_fp16")];
+            tensor<int32, [4]> var_9351_begin_0 = const()[name = string("op_9351_begin_0"), val = tensor<int32, [4]>([0, 7, 0, 0])];
+            tensor<int32, [4]> var_9351_end_0 = const()[name = string("op_9351_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_9351_end_mask_0 = const()[name = string("op_9351_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_9351_cast_fp16 = slice_by_index(begin = var_9351_begin_0, end = var_9351_end_0, end_mask = var_9351_end_mask_0, x = key_heads_93_cast_fp16)[name = string("op_9351_cast_fp16")];
+            tensor<int32, [4]> var_9355_begin_0 = const()[name = string("op_9355_begin_0"), val = tensor<int32, [4]>([0, 7, 0, 0])];
+            tensor<int32, [4]> var_9355_end_0 = const()[name = string("op_9355_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_9355_end_mask_0 = const()[name = string("op_9355_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_9355_cast_fp16 = slice_by_index(begin = var_9355_begin_0, end = var_9355_end_0, end_mask = var_9355_end_mask_0, x = value_heads_93_cast_fp16)[name = string("op_9355_cast_fp16")];
+            bool key_heads_95_interleave_0 = const()[name = string("key_heads_95_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 256]> key_heads_95_cast_fp16 = concat(axis = var_9081, interleave = key_heads_95_interleave_0, values = (var_9239_cast_fp16, var_9239_cast_fp16, var_9255_cast_fp16, var_9255_cast_fp16, var_9271_cast_fp16, var_9271_cast_fp16, var_9287_cast_fp16, var_9287_cast_fp16, var_9303_cast_fp16, var_9303_cast_fp16, var_9319_cast_fp16, var_9319_cast_fp16, var_9335_cast_fp16, var_9335_cast_fp16, var_9351_cast_fp16, var_9351_cast_fp16))[name = string("key_heads_95_cast_fp16")];
+            bool value_heads_95_interleave_0 = const()[name = string("value_heads_95_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 256]> value_heads_95_cast_fp16 = concat(axis = var_9081, interleave = value_heads_95_interleave_0, values = (var_9243_cast_fp16, var_9243_cast_fp16, var_9259_cast_fp16, var_9259_cast_fp16, var_9275_cast_fp16, var_9275_cast_fp16, var_9291_cast_fp16, var_9291_cast_fp16, var_9307_cast_fp16, var_9307_cast_fp16, var_9323_cast_fp16, var_9323_cast_fp16, var_9339_cast_fp16, var_9339_cast_fp16, var_9355_cast_fp16, var_9355_cast_fp16))[name = string("value_heads_95_cast_fp16")];
+            fp16 var_9378_to_fp16 = const()[name = string("op_9378_to_fp16"), val = fp16(0x1.6ap-4)];
+            tensor<fp16, [1, 16, 128, 1]> var_9379_cast_fp16 = mul(x = mh_q_141_cast_fp16, y = var_9378_to_fp16)[name = string("op_9379_cast_fp16")];
+            bool mh_w_93_transpose_x_0 = const()[name = string("mh_w_93_transpose_x_0"), val = bool(true)];
+            bool mh_w_93_transpose_y_0 = const()[name = string("mh_w_93_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 1, 256]> mh_w_93_cast_fp16 = matmul(transpose_x = mh_w_93_transpose_x_0, transpose_y = mh_w_93_transpose_y_0, x = var_9379_cast_fp16, y = key_heads_95_cast_fp16)[name = string("mh_w_93_cast_fp16")];
+            tensor<fp16, [1, 16, 1, 256]> mh_w_95_cast_fp16 = add(x = mh_w_93_cast_fp16, y = var_487_cast_fp16)[name = string("mh_w_95_cast_fp16")];
+            tensor<fp16, [1, 16, 1, 256]> var_9391_cast_fp16 = softmax(axis = var_9063, x = mh_w_95_cast_fp16)[name = string("op_9391_cast_fp16")];
+            bool attn_47_transpose_x_0 = const()[name = string("attn_47_transpose_x_0"), val = bool(false)];
+            bool attn_47_transpose_y_0 = const()[name = string("attn_47_transpose_y_0"), val = bool(true)];
+            tensor<fp16, [1, 16, 128, 1]> attn_47_cast_fp16 = matmul(transpose_x = attn_47_transpose_x_0, transpose_y = attn_47_transpose_y_0, x = value_heads_95_cast_fp16, y = var_9391_cast_fp16)[name = string("attn_47_cast_fp16")];
+            tensor<int32, [4]> var_9396 = const()[name = string("op_9396"), val = tensor<int32, [4]>([1, -1, 1, 1])];
+            tensor<fp16, [1, 2048, 1, 1]> input_185_cast_fp16 = reshape(shape = var_9396, x = attn_47_cast_fp16)[name = string("input_185_cast_fp16")];
+            string obj_195_pad_type_0 = const()[name = string("obj_195_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> obj_195_strides_0 = const()[name = string("obj_195_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> obj_195_pad_0 = const()[name = string("obj_195_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> obj_195_dilations_0 = const()[name = string("obj_195_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 obj_195_groups_0 = const()[name = string("obj_195_groups_0"), val = int32(1)];
+            tensor<fp16, [2048, 2048, 1, 1]> layers_23_self_attn_o_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [2048, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(1166405888))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(1170600256))))[name = string("layers_23_self_attn_o_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1, 1]> obj_195_cast_fp16 = conv(bias = layers_0_self_attn_q_proj_bias_to_fp16, dilations = obj_195_dilations_0, groups = obj_195_groups_0, pad = obj_195_pad_0, pad_type = obj_195_pad_type_0, strides = obj_195_strides_0, weight = layers_23_self_attn_o_proj_weight_to_fp16_palettized, x = input_185_cast_fp16)[name = string("obj_195_cast_fp16")];
+            tensor<fp16, [1, 2048, 1, 1]> inputs_189_cast_fp16 = add(x = inputs_183_cast_fp16, y = obj_195_cast_fp16)[name = string("inputs_189_cast_fp16")];
+            tensor<fp16, [1, 2048, 1, 1]> inputs_sq_191_cast_fp16 = mul(x = inputs_189_cast_fp16, y = inputs_189_cast_fp16)[name = string("inputs_sq_191_cast_fp16")];
+            tensor<int32, [1]> variance_191_axes_0 = const()[name = string("variance_191_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_191_keep_dims_0 = const()[name = string("variance_191_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 1, 1, 1]> variance_191_cast_fp16 = reduce_mean(axes = variance_191_axes_0, keep_dims = variance_191_keep_dims_0, x = inputs_sq_191_cast_fp16)[name = string("variance_191_cast_fp16")];
+            fp16 var_9414_to_fp16 = const()[name = string("op_9414_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 1, 1]> var_9415_cast_fp16 = add(x = variance_191_cast_fp16, y = var_9414_to_fp16)[name = string("op_9415_cast_fp16")];
+            fp32 var_9416_epsilon_0 = const()[name = string("op_9416_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 1, 1, 1]> var_9416_cast_fp16 = rsqrt(epsilon = var_9416_epsilon_0, x = var_9415_cast_fp16)[name = string("op_9416_cast_fp16")];
+            tensor<fp16, [1, 2048, 1, 1]> hidden_states_237_cast_fp16 = mul(x = inputs_189_cast_fp16, y = var_9416_cast_fp16)[name = string("hidden_states_237_cast_fp16")];
+            tensor<fp16, [1, 2048, 1, 1]> w_191_to_fp16 = const()[name = string("w_191_to_fp16"), val = tensor<fp16, [1, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(1170600832)))];
+            tensor<fp16, [1, 2048, 1, 1]> input_187_cast_fp16 = mul(x = w_191_to_fp16, y = hidden_states_237_cast_fp16)[name = string("input_187_cast_fp16")];
+            string input_189_pad_type_0 = const()[name = string("input_189_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> input_189_strides_0 = const()[name = string("input_189_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> input_189_pad_0 = const()[name = string("input_189_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> input_189_dilations_0 = const()[name = string("input_189_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 input_189_groups_0 = const()[name = string("input_189_groups_0"), val = int32(1)];
+            tensor<fp16, [6144, 2048, 1, 1]> layers_23_mlp_gate_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [6144, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(1170604992))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(1183187968))))[name = string("layers_23_mlp_gate_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 6144, 1, 1]> input_189_cast_fp16 = conv(dilations = input_189_dilations_0, groups = input_189_groups_0, pad = input_189_pad_0, pad_type = input_189_pad_type_0, strides = input_189_strides_0, weight = layers_23_mlp_gate_proj_weight_to_fp16_palettized, x = input_187_cast_fp16)[name = string("input_189_cast_fp16")];
+            tensor<fp16, [1, 6144, 1, 1]> var_9430_cast_fp16 = silu(x = input_189_cast_fp16)[name = string("op_9430_cast_fp16")];
+            string var_9436_pad_type_0 = const()[name = string("op_9436_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_9436_strides_0 = const()[name = string("op_9436_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_9436_pad_0 = const()[name = string("op_9436_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_9436_dilations_0 = const()[name = string("op_9436_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_9436_groups_0 = const()[name = string("op_9436_groups_0"), val = int32(1)];
+            tensor<fp16, [6144, 2048, 1, 1]> layers_23_mlp_up_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [6144, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(1183188544))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(1195771520))))[name = string("layers_23_mlp_up_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 6144, 1, 1]> var_9436_cast_fp16 = conv(dilations = var_9436_dilations_0, groups = var_9436_groups_0, pad = var_9436_pad_0, pad_type = var_9436_pad_type_0, strides = var_9436_strides_0, weight = layers_23_mlp_up_proj_weight_to_fp16_palettized, x = input_187_cast_fp16)[name = string("op_9436_cast_fp16")];
+            tensor<fp16, [1, 6144, 1, 1]> input_191_cast_fp16 = mul(x = var_9430_cast_fp16, y = var_9436_cast_fp16)[name = string("input_191_cast_fp16")];
+            string hidden_states_239_pad_type_0 = const()[name = string("hidden_states_239_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> hidden_states_239_strides_0 = const()[name = string("hidden_states_239_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> hidden_states_239_pad_0 = const()[name = string("hidden_states_239_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> hidden_states_239_dilations_0 = const()[name = string("hidden_states_239_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 hidden_states_239_groups_0 = const()[name = string("hidden_states_239_groups_0"), val = int32(1)];
+            tensor<fp16, [2048, 6144, 1, 1]> layers_23_mlp_down_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [2048, 6144, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(1195772096))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(1208355072))))[name = string("layers_23_mlp_down_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1, 1]> hidden_states_239_cast_fp16 = conv(dilations = hidden_states_239_dilations_0, groups = hidden_states_239_groups_0, pad = hidden_states_239_pad_0, pad_type = hidden_states_239_pad_type_0, strides = hidden_states_239_strides_0, weight = layers_23_mlp_down_proj_weight_to_fp16_palettized, x = input_191_cast_fp16)[name = string("hidden_states_239_cast_fp16")];
+            tensor<fp16, [1, 2048, 1, 1]> inputs_191_cast_fp16 = add(x = inputs_189_cast_fp16, y = hidden_states_239_cast_fp16)[name = string("inputs_191_cast_fp16")];
+            int32 var_9450 = const()[name = string("op_9450"), val = int32(3)];
+            int32 var_9460 = const()[name = string("op_9460"), val = int32(-2)];
+            int32 var_9468 = const()[name = string("op_9468"), val = int32(1)];
+            tensor<fp16, [1, 2048, 1, 1]> inputs_sq_193_cast_fp16 = mul(x = inputs_191_cast_fp16, y = inputs_191_cast_fp16)[name = string("inputs_sq_193_cast_fp16")];
+            tensor<int32, [1]> variance_193_axes_0 = const()[name = string("variance_193_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_193_keep_dims_0 = const()[name = string("variance_193_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 1, 1, 1]> variance_193_cast_fp16 = reduce_mean(axes = variance_193_axes_0, keep_dims = variance_193_keep_dims_0, x = inputs_sq_193_cast_fp16)[name = string("variance_193_cast_fp16")];
+            fp16 var_9480_to_fp16 = const()[name = string("op_9480_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 1, 1]> var_9481_cast_fp16 = add(x = variance_193_cast_fp16, y = var_9480_to_fp16)[name = string("op_9481_cast_fp16")];
+            fp32 var_9482_epsilon_0 = const()[name = string("op_9482_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 1, 1, 1]> var_9482_cast_fp16 = rsqrt(epsilon = var_9482_epsilon_0, x = var_9481_cast_fp16)[name = string("op_9482_cast_fp16")];
+            tensor<fp16, [1, 2048, 1, 1]> hidden_states_241_cast_fp16 = mul(x = inputs_191_cast_fp16, y = var_9482_cast_fp16)[name = string("hidden_states_241_cast_fp16")];
+            tensor<fp16, [1, 2048, 1, 1]> w_193_to_fp16 = const()[name = string("w_193_to_fp16"), val = tensor<fp16, [1, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(1208355648)))];
+            tensor<fp16, [1, 2048, 1, 1]> obj_197_cast_fp16 = mul(x = w_193_to_fp16, y = hidden_states_241_cast_fp16)[name = string("obj_197_cast_fp16")];
+            string query_145_pad_type_0 = const()[name = string("query_145_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> query_145_strides_0 = const()[name = string("query_145_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> query_145_pad_0 = const()[name = string("query_145_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> query_145_dilations_0 = const()[name = string("query_145_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 query_145_groups_0 = const()[name = string("query_145_groups_0"), val = int32(1)];
+            tensor<fp16, [2048, 2048, 1, 1]> layers_24_self_attn_q_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [2048, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(1208359808))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(1212554176))))[name = string("layers_24_self_attn_q_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1, 1]> query_145_cast_fp16 = conv(bias = layers_0_self_attn_q_proj_bias_to_fp16, dilations = query_145_dilations_0, groups = query_145_groups_0, pad = query_145_pad_0, pad_type = query_145_pad_type_0, strides = query_145_strides_0, weight = layers_24_self_attn_q_proj_weight_to_fp16_palettized, x = obj_197_cast_fp16)[name = string("query_145_cast_fp16")];
+            string current_key_97_pad_type_0 = const()[name = string("current_key_97_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> current_key_97_strides_0 = const()[name = string("current_key_97_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> current_key_97_pad_0 = const()[name = string("current_key_97_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> current_key_97_dilations_0 = const()[name = string("current_key_97_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 current_key_97_groups_0 = const()[name = string("current_key_97_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 2048, 1, 1]> layers_24_self_attn_k_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(1212554752))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(1214651968))))[name = string("layers_24_self_attn_k_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> current_key_97_cast_fp16 = conv(dilations = current_key_97_dilations_0, groups = current_key_97_groups_0, pad = current_key_97_pad_0, pad_type = current_key_97_pad_type_0, strides = current_key_97_strides_0, weight = layers_24_self_attn_k_proj_weight_to_fp16_palettized, x = obj_197_cast_fp16)[name = string("current_key_97_cast_fp16")];
+            string current_value_49_pad_type_0 = const()[name = string("current_value_49_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> current_value_49_strides_0 = const()[name = string("current_value_49_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> current_value_49_pad_0 = const()[name = string("current_value_49_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> current_value_49_dilations_0 = const()[name = string("current_value_49_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 current_value_49_groups_0 = const()[name = string("current_value_49_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 2048, 1, 1]> layers_24_self_attn_v_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(1214652544))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(1216749760))))[name = string("layers_24_self_attn_v_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> current_value_49_cast_fp16 = conv(bias = layers_0_self_attn_v_proj_bias_to_fp16, dilations = current_value_49_dilations_0, groups = current_value_49_groups_0, pad = current_value_49_pad_0, pad_type = current_value_49_pad_type_0, strides = current_value_49_strides_0, weight = layers_24_self_attn_v_proj_weight_to_fp16_palettized, x = obj_197_cast_fp16)[name = string("current_value_49_cast_fp16")];
+            tensor<int32, [4]> var_9519 = const()[name = string("op_9519"), val = tensor<int32, [4]>([16, 128, 1, 1])];
+            tensor<fp16, [16, 128, 1, 1]> inputs_193_cast_fp16 = reshape(shape = var_9519, x = query_145_cast_fp16)[name = string("inputs_193_cast_fp16")];
+            tensor<fp16, [16, 128, 1, 1]> inputs_sq_195_cast_fp16 = mul(x = inputs_193_cast_fp16, y = inputs_193_cast_fp16)[name = string("inputs_sq_195_cast_fp16")];
+            tensor<int32, [1]> variance_195_axes_0 = const()[name = string("variance_195_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_195_keep_dims_0 = const()[name = string("variance_195_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [16, 1, 1, 1]> variance_195_cast_fp16 = reduce_mean(axes = variance_195_axes_0, keep_dims = variance_195_keep_dims_0, x = inputs_sq_195_cast_fp16)[name = string("variance_195_cast_fp16")];
+            fp16 var_9525_to_fp16 = const()[name = string("op_9525_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [16, 1, 1, 1]> var_9526_cast_fp16 = add(x = variance_195_cast_fp16, y = var_9525_to_fp16)[name = string("op_9526_cast_fp16")];
+            fp32 var_9527_epsilon_0 = const()[name = string("op_9527_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [16, 1, 1, 1]> var_9527_cast_fp16 = rsqrt(epsilon = var_9527_epsilon_0, x = var_9526_cast_fp16)[name = string("op_9527_cast_fp16")];
+            tensor<fp16, [16, 128, 1, 1]> hidden_states_243_cast_fp16 = mul(x = inputs_193_cast_fp16, y = var_9527_cast_fp16)[name = string("hidden_states_243_cast_fp16")];
+            tensor<fp16, [1, 128, 1, 1]> w_195_to_fp16 = const()[name = string("w_195_to_fp16"), val = tensor<fp16, [1, 128, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(1216750336)))];
+            tensor<fp16, [16, 128, 1, 1]> query_normed_49_cast_fp16 = mul(x = w_195_to_fp16, y = hidden_states_243_cast_fp16)[name = string("query_normed_49_cast_fp16")];
+            tensor<int32, [4]> var_9535 = const()[name = string("op_9535"), val = tensor<int32, [4]>([8, 128, 1, 1])];
+            tensor<fp16, [8, 128, 1, 1]> inputs_195_cast_fp16 = reshape(shape = var_9535, x = current_key_97_cast_fp16)[name = string("inputs_195_cast_fp16")];
+            tensor<fp16, [8, 128, 1, 1]> inputs_sq_197_cast_fp16 = mul(x = inputs_195_cast_fp16, y = inputs_195_cast_fp16)[name = string("inputs_sq_197_cast_fp16")];
+            tensor<int32, [1]> variance_197_axes_0 = const()[name = string("variance_197_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_197_keep_dims_0 = const()[name = string("variance_197_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [8, 1, 1, 1]> variance_197_cast_fp16 = reduce_mean(axes = variance_197_axes_0, keep_dims = variance_197_keep_dims_0, x = inputs_sq_197_cast_fp16)[name = string("variance_197_cast_fp16")];
+            fp16 var_9541_to_fp16 = const()[name = string("op_9541_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [8, 1, 1, 1]> var_9542_cast_fp16 = add(x = variance_197_cast_fp16, y = var_9541_to_fp16)[name = string("op_9542_cast_fp16")];
+            fp32 var_9543_epsilon_0 = const()[name = string("op_9543_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [8, 1, 1, 1]> var_9543_cast_fp16 = rsqrt(epsilon = var_9543_epsilon_0, x = var_9542_cast_fp16)[name = string("op_9543_cast_fp16")];
+            tensor<fp16, [8, 128, 1, 1]> hidden_states_245_cast_fp16 = mul(x = inputs_195_cast_fp16, y = var_9543_cast_fp16)[name = string("hidden_states_245_cast_fp16")];
+            tensor<fp16, [1, 128, 1, 1]> w_197_to_fp16 = const()[name = string("w_197_to_fp16"), val = tensor<fp16, [1, 128, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(1216750656)))];
+            tensor<fp16, [8, 128, 1, 1]> current_key_normed_49_cast_fp16 = mul(x = w_197_to_fp16, y = hidden_states_245_cast_fp16)[name = string("current_key_normed_49_cast_fp16")];
+            tensor<int32, [4]> var_9561 = const()[name = string("op_9561"), val = tensor<int32, [4]>([1, 16, 128, -1])];
+            tensor<fp16, [1, 16, 128, 1]> mh_q_145_cast_fp16 = reshape(shape = var_9561, x = query_normed_49_cast_fp16)[name = string("mh_q_145_cast_fp16")];
+            tensor<int32, [4]> var_9563 = const()[name = string("op_9563"), val = tensor<int32, [4]>([1, 8, 128, -1])];
+            tensor<fp16, [1, 8, 128, 1]> mh_k_97_cast_fp16 = reshape(shape = var_9563, x = current_key_normed_49_cast_fp16)[name = string("mh_k_97_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> var_9567_cast_fp16 = mul(x = mh_q_145_cast_fp16, y = cos_1_cast_fp16)[name = string("op_9567_cast_fp16")];
+            tensor<int32, [4]> var_9572_begin_0 = const()[name = string("op_9572_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_9572_end_0 = const()[name = string("op_9572_end_0"), val = tensor<int32, [4]>([1, 16, 64, 1])];
+            tensor<bool, [4]> var_9572_end_mask_0 = const()[name = string("op_9572_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 16, 64, 1]> var_9572_cast_fp16 = slice_by_index(begin = var_9572_begin_0, end = var_9572_end_0, end_mask = var_9572_end_mask_0, x = mh_q_145_cast_fp16)[name = string("op_9572_cast_fp16")];
+            tensor<int32, [4]> var_9578_begin_0 = const()[name = string("op_9578_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_9578_end_0 = const()[name = string("op_9578_end_0"), val = tensor<int32, [4]>([1, 16, 128, 1])];
+            tensor<bool, [4]> var_9578_end_mask_0 = const()[name = string("op_9578_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 16, 64, 1]> var_9578_cast_fp16 = slice_by_index(begin = var_9578_begin_0, end = var_9578_end_0, end_mask = var_9578_end_mask_0, x = mh_q_145_cast_fp16)[name = string("op_9578_cast_fp16")];
+            fp16 const_569_promoted_to_fp16 = const()[name = string("const_569_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 16, 64, 1]> var_9580_cast_fp16 = mul(x = var_9578_cast_fp16, y = const_569_promoted_to_fp16)[name = string("op_9580_cast_fp16")];
+            bool var_9582_interleave_0 = const()[name = string("op_9582_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 1]> var_9582_cast_fp16 = concat(axis = var_9460, interleave = var_9582_interleave_0, values = (var_9580_cast_fp16, var_9572_cast_fp16))[name = string("op_9582_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> var_9583_cast_fp16 = mul(x = var_9582_cast_fp16, y = sin_1_cast_fp16)[name = string("op_9583_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> mh_q_147_cast_fp16 = add(x = var_9567_cast_fp16, y = var_9583_cast_fp16)[name = string("mh_q_147_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> var_9585_cast_fp16 = mul(x = mh_k_97_cast_fp16, y = cos_1_cast_fp16)[name = string("op_9585_cast_fp16")];
+            tensor<int32, [4]> var_9590_begin_0 = const()[name = string("op_9590_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_9590_end_0 = const()[name = string("op_9590_end_0"), val = tensor<int32, [4]>([1, 8, 64, 1])];
+            tensor<bool, [4]> var_9590_end_mask_0 = const()[name = string("op_9590_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 8, 64, 1]> var_9590_cast_fp16 = slice_by_index(begin = var_9590_begin_0, end = var_9590_end_0, end_mask = var_9590_end_mask_0, x = mh_k_97_cast_fp16)[name = string("op_9590_cast_fp16")];
+            tensor<int32, [4]> var_9596_begin_0 = const()[name = string("op_9596_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_9596_end_0 = const()[name = string("op_9596_end_0"), val = tensor<int32, [4]>([1, 8, 128, 1])];
+            tensor<bool, [4]> var_9596_end_mask_0 = const()[name = string("op_9596_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 8, 64, 1]> var_9596_cast_fp16 = slice_by_index(begin = var_9596_begin_0, end = var_9596_end_0, end_mask = var_9596_end_mask_0, x = mh_k_97_cast_fp16)[name = string("op_9596_cast_fp16")];
+            fp16 const_572_promoted_to_fp16 = const()[name = string("const_572_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 64, 1]> var_9598_cast_fp16 = mul(x = var_9596_cast_fp16, y = const_572_promoted_to_fp16)[name = string("op_9598_cast_fp16")];
+            bool var_9600_interleave_0 = const()[name = string("op_9600_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 128, 1]> var_9600_cast_fp16 = concat(axis = var_9460, interleave = var_9600_interleave_0, values = (var_9598_cast_fp16, var_9590_cast_fp16))[name = string("op_9600_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> var_9601_cast_fp16 = mul(x = var_9600_cast_fp16, y = sin_1_cast_fp16)[name = string("op_9601_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> mh_k_99_cast_fp16 = add(x = var_9585_cast_fp16, y = var_9601_cast_fp16)[name = string("mh_k_99_cast_fp16")];
+            tensor<int32, [4]> var_9605 = const()[name = string("op_9605"), val = tensor<int32, [4]>([1, 1024, 1, 1])];
+            tensor<fp16, [1, 1024, 1, 1]> current_key_99_cast_fp16 = reshape(shape = var_9605, x = mh_k_99_cast_fp16)[name = string("current_key_99_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_9612_cast_fp16 = mul(x = var_101_cast_fp16_24, y = var_323_cast_fp16)[name = string("op_9612_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_9613_cast_fp16 = mul(x = current_key_99_cast_fp16, y = var_321_cast_fp16)[name = string("op_9613_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> key_147_cast_fp16 = add(x = var_9612_cast_fp16, y = var_9613_cast_fp16)[name = string("key_147_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_9616_cast_fp16 = mul(x = var_132_cast_fp16_24, y = var_323_cast_fp16)[name = string("op_9616_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_9617_cast_fp16 = mul(x = current_value_49_cast_fp16, y = var_321_cast_fp16)[name = string("op_9617_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> value_97_cast_fp16 = add(x = var_9616_cast_fp16, y = var_9617_cast_fp16)[name = string("value_97_cast_fp16")];
+            tensor<int32, [4]> var_9621 = const()[name = string("op_9621"), val = tensor<int32, [4]>([1, 8, 128, 256])];
+            tensor<fp16, [1, 8, 128, 256]> key_heads_97_cast_fp16 = reshape(shape = var_9621, x = key_147_cast_fp16)[name = string("key_heads_97_cast_fp16")];
+            tensor<int32, [4]> var_9623 = const()[name = string("op_9623"), val = tensor<int32, [4]>([1, 8, 128, 256])];
+            tensor<fp16, [1, 8, 128, 256]> value_heads_97_cast_fp16 = reshape(shape = var_9623, x = value_97_cast_fp16)[name = string("value_heads_97_cast_fp16")];
+            tensor<int32, [4]> var_9626_begin_0 = const()[name = string("op_9626_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_9626_end_0 = const()[name = string("op_9626_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_9626_end_mask_0 = const()[name = string("op_9626_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_9626_cast_fp16 = slice_by_index(begin = var_9626_begin_0, end = var_9626_end_0, end_mask = var_9626_end_mask_0, x = key_heads_97_cast_fp16)[name = string("op_9626_cast_fp16")];
+            tensor<int32, [4]> var_9630_begin_0 = const()[name = string("op_9630_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_9630_end_0 = const()[name = string("op_9630_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_9630_end_mask_0 = const()[name = string("op_9630_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_9630_cast_fp16 = slice_by_index(begin = var_9630_begin_0, end = var_9630_end_0, end_mask = var_9630_end_mask_0, x = value_heads_97_cast_fp16)[name = string("op_9630_cast_fp16")];
+            tensor<int32, [4]> var_9642_begin_0 = const()[name = string("op_9642_begin_0"), val = tensor<int32, [4]>([0, 1, 0, 0])];
+            tensor<int32, [4]> var_9642_end_0 = const()[name = string("op_9642_end_0"), val = tensor<int32, [4]>([1, 2, 128, 256])];
+            tensor<bool, [4]> var_9642_end_mask_0 = const()[name = string("op_9642_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_9642_cast_fp16 = slice_by_index(begin = var_9642_begin_0, end = var_9642_end_0, end_mask = var_9642_end_mask_0, x = key_heads_97_cast_fp16)[name = string("op_9642_cast_fp16")];
+            tensor<int32, [4]> var_9646_begin_0 = const()[name = string("op_9646_begin_0"), val = tensor<int32, [4]>([0, 1, 0, 0])];
+            tensor<int32, [4]> var_9646_end_0 = const()[name = string("op_9646_end_0"), val = tensor<int32, [4]>([1, 2, 128, 256])];
+            tensor<bool, [4]> var_9646_end_mask_0 = const()[name = string("op_9646_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_9646_cast_fp16 = slice_by_index(begin = var_9646_begin_0, end = var_9646_end_0, end_mask = var_9646_end_mask_0, x = value_heads_97_cast_fp16)[name = string("op_9646_cast_fp16")];
+            tensor<int32, [4]> var_9658_begin_0 = const()[name = string("op_9658_begin_0"), val = tensor<int32, [4]>([0, 2, 0, 0])];
+            tensor<int32, [4]> var_9658_end_0 = const()[name = string("op_9658_end_0"), val = tensor<int32, [4]>([1, 3, 128, 256])];
+            tensor<bool, [4]> var_9658_end_mask_0 = const()[name = string("op_9658_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_9658_cast_fp16 = slice_by_index(begin = var_9658_begin_0, end = var_9658_end_0, end_mask = var_9658_end_mask_0, x = key_heads_97_cast_fp16)[name = string("op_9658_cast_fp16")];
+            tensor<int32, [4]> var_9662_begin_0 = const()[name = string("op_9662_begin_0"), val = tensor<int32, [4]>([0, 2, 0, 0])];
+            tensor<int32, [4]> var_9662_end_0 = const()[name = string("op_9662_end_0"), val = tensor<int32, [4]>([1, 3, 128, 256])];
+            tensor<bool, [4]> var_9662_end_mask_0 = const()[name = string("op_9662_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_9662_cast_fp16 = slice_by_index(begin = var_9662_begin_0, end = var_9662_end_0, end_mask = var_9662_end_mask_0, x = value_heads_97_cast_fp16)[name = string("op_9662_cast_fp16")];
+            tensor<int32, [4]> var_9674_begin_0 = const()[name = string("op_9674_begin_0"), val = tensor<int32, [4]>([0, 3, 0, 0])];
+            tensor<int32, [4]> var_9674_end_0 = const()[name = string("op_9674_end_0"), val = tensor<int32, [4]>([1, 4, 128, 256])];
+            tensor<bool, [4]> var_9674_end_mask_0 = const()[name = string("op_9674_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_9674_cast_fp16 = slice_by_index(begin = var_9674_begin_0, end = var_9674_end_0, end_mask = var_9674_end_mask_0, x = key_heads_97_cast_fp16)[name = string("op_9674_cast_fp16")];
+            tensor<int32, [4]> var_9678_begin_0 = const()[name = string("op_9678_begin_0"), val = tensor<int32, [4]>([0, 3, 0, 0])];
+            tensor<int32, [4]> var_9678_end_0 = const()[name = string("op_9678_end_0"), val = tensor<int32, [4]>([1, 4, 128, 256])];
+            tensor<bool, [4]> var_9678_end_mask_0 = const()[name = string("op_9678_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_9678_cast_fp16 = slice_by_index(begin = var_9678_begin_0, end = var_9678_end_0, end_mask = var_9678_end_mask_0, x = value_heads_97_cast_fp16)[name = string("op_9678_cast_fp16")];
+            tensor<int32, [4]> var_9690_begin_0 = const()[name = string("op_9690_begin_0"), val = tensor<int32, [4]>([0, 4, 0, 0])];
+            tensor<int32, [4]> var_9690_end_0 = const()[name = string("op_9690_end_0"), val = tensor<int32, [4]>([1, 5, 128, 256])];
+            tensor<bool, [4]> var_9690_end_mask_0 = const()[name = string("op_9690_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_9690_cast_fp16 = slice_by_index(begin = var_9690_begin_0, end = var_9690_end_0, end_mask = var_9690_end_mask_0, x = key_heads_97_cast_fp16)[name = string("op_9690_cast_fp16")];
+            tensor<int32, [4]> var_9694_begin_0 = const()[name = string("op_9694_begin_0"), val = tensor<int32, [4]>([0, 4, 0, 0])];
+            tensor<int32, [4]> var_9694_end_0 = const()[name = string("op_9694_end_0"), val = tensor<int32, [4]>([1, 5, 128, 256])];
+            tensor<bool, [4]> var_9694_end_mask_0 = const()[name = string("op_9694_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_9694_cast_fp16 = slice_by_index(begin = var_9694_begin_0, end = var_9694_end_0, end_mask = var_9694_end_mask_0, x = value_heads_97_cast_fp16)[name = string("op_9694_cast_fp16")];
+            tensor<int32, [4]> var_9706_begin_0 = const()[name = string("op_9706_begin_0"), val = tensor<int32, [4]>([0, 5, 0, 0])];
+            tensor<int32, [4]> var_9706_end_0 = const()[name = string("op_9706_end_0"), val = tensor<int32, [4]>([1, 6, 128, 256])];
+            tensor<bool, [4]> var_9706_end_mask_0 = const()[name = string("op_9706_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_9706_cast_fp16 = slice_by_index(begin = var_9706_begin_0, end = var_9706_end_0, end_mask = var_9706_end_mask_0, x = key_heads_97_cast_fp16)[name = string("op_9706_cast_fp16")];
+            tensor<int32, [4]> var_9710_begin_0 = const()[name = string("op_9710_begin_0"), val = tensor<int32, [4]>([0, 5, 0, 0])];
+            tensor<int32, [4]> var_9710_end_0 = const()[name = string("op_9710_end_0"), val = tensor<int32, [4]>([1, 6, 128, 256])];
+            tensor<bool, [4]> var_9710_end_mask_0 = const()[name = string("op_9710_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_9710_cast_fp16 = slice_by_index(begin = var_9710_begin_0, end = var_9710_end_0, end_mask = var_9710_end_mask_0, x = value_heads_97_cast_fp16)[name = string("op_9710_cast_fp16")];
+            tensor<int32, [4]> var_9722_begin_0 = const()[name = string("op_9722_begin_0"), val = tensor<int32, [4]>([0, 6, 0, 0])];
+            tensor<int32, [4]> var_9722_end_0 = const()[name = string("op_9722_end_0"), val = tensor<int32, [4]>([1, 7, 128, 256])];
+            tensor<bool, [4]> var_9722_end_mask_0 = const()[name = string("op_9722_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_9722_cast_fp16 = slice_by_index(begin = var_9722_begin_0, end = var_9722_end_0, end_mask = var_9722_end_mask_0, x = key_heads_97_cast_fp16)[name = string("op_9722_cast_fp16")];
+            tensor<int32, [4]> var_9726_begin_0 = const()[name = string("op_9726_begin_0"), val = tensor<int32, [4]>([0, 6, 0, 0])];
+            tensor<int32, [4]> var_9726_end_0 = const()[name = string("op_9726_end_0"), val = tensor<int32, [4]>([1, 7, 128, 256])];
+            tensor<bool, [4]> var_9726_end_mask_0 = const()[name = string("op_9726_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_9726_cast_fp16 = slice_by_index(begin = var_9726_begin_0, end = var_9726_end_0, end_mask = var_9726_end_mask_0, x = value_heads_97_cast_fp16)[name = string("op_9726_cast_fp16")];
+            tensor<int32, [4]> var_9738_begin_0 = const()[name = string("op_9738_begin_0"), val = tensor<int32, [4]>([0, 7, 0, 0])];
+            tensor<int32, [4]> var_9738_end_0 = const()[name = string("op_9738_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_9738_end_mask_0 = const()[name = string("op_9738_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_9738_cast_fp16 = slice_by_index(begin = var_9738_begin_0, end = var_9738_end_0, end_mask = var_9738_end_mask_0, x = key_heads_97_cast_fp16)[name = string("op_9738_cast_fp16")];
+            tensor<int32, [4]> var_9742_begin_0 = const()[name = string("op_9742_begin_0"), val = tensor<int32, [4]>([0, 7, 0, 0])];
+            tensor<int32, [4]> var_9742_end_0 = const()[name = string("op_9742_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_9742_end_mask_0 = const()[name = string("op_9742_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_9742_cast_fp16 = slice_by_index(begin = var_9742_begin_0, end = var_9742_end_0, end_mask = var_9742_end_mask_0, x = value_heads_97_cast_fp16)[name = string("op_9742_cast_fp16")];
+            bool key_heads_99_interleave_0 = const()[name = string("key_heads_99_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 256]> key_heads_99_cast_fp16 = concat(axis = var_9468, interleave = key_heads_99_interleave_0, values = (var_9626_cast_fp16, var_9626_cast_fp16, var_9642_cast_fp16, var_9642_cast_fp16, var_9658_cast_fp16, var_9658_cast_fp16, var_9674_cast_fp16, var_9674_cast_fp16, var_9690_cast_fp16, var_9690_cast_fp16, var_9706_cast_fp16, var_9706_cast_fp16, var_9722_cast_fp16, var_9722_cast_fp16, var_9738_cast_fp16, var_9738_cast_fp16))[name = string("key_heads_99_cast_fp16")];
+            bool value_heads_99_interleave_0 = const()[name = string("value_heads_99_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 256]> value_heads_99_cast_fp16 = concat(axis = var_9468, interleave = value_heads_99_interleave_0, values = (var_9630_cast_fp16, var_9630_cast_fp16, var_9646_cast_fp16, var_9646_cast_fp16, var_9662_cast_fp16, var_9662_cast_fp16, var_9678_cast_fp16, var_9678_cast_fp16, var_9694_cast_fp16, var_9694_cast_fp16, var_9710_cast_fp16, var_9710_cast_fp16, var_9726_cast_fp16, var_9726_cast_fp16, var_9742_cast_fp16, var_9742_cast_fp16))[name = string("value_heads_99_cast_fp16")];
+            fp16 var_9765_to_fp16 = const()[name = string("op_9765_to_fp16"), val = fp16(0x1.6ap-4)];
+            tensor<fp16, [1, 16, 128, 1]> var_9766_cast_fp16 = mul(x = mh_q_147_cast_fp16, y = var_9765_to_fp16)[name = string("op_9766_cast_fp16")];
+            bool mh_w_97_transpose_x_0 = const()[name = string("mh_w_97_transpose_x_0"), val = bool(true)];
+            bool mh_w_97_transpose_y_0 = const()[name = string("mh_w_97_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 1, 256]> mh_w_97_cast_fp16 = matmul(transpose_x = mh_w_97_transpose_x_0, transpose_y = mh_w_97_transpose_y_0, x = var_9766_cast_fp16, y = key_heads_99_cast_fp16)[name = string("mh_w_97_cast_fp16")];
+            tensor<fp16, [1, 16, 1, 256]> mh_w_99_cast_fp16 = add(x = mh_w_97_cast_fp16, y = var_487_cast_fp16)[name = string("mh_w_99_cast_fp16")];
+            tensor<fp16, [1, 16, 1, 256]> var_9778_cast_fp16 = softmax(axis = var_9450, x = mh_w_99_cast_fp16)[name = string("op_9778_cast_fp16")];
+            bool attn_49_transpose_x_0 = const()[name = string("attn_49_transpose_x_0"), val = bool(false)];
+            bool attn_49_transpose_y_0 = const()[name = string("attn_49_transpose_y_0"), val = bool(true)];
+            tensor<fp16, [1, 16, 128, 1]> attn_49_cast_fp16 = matmul(transpose_x = attn_49_transpose_x_0, transpose_y = attn_49_transpose_y_0, x = value_heads_99_cast_fp16, y = var_9778_cast_fp16)[name = string("attn_49_cast_fp16")];
+            tensor<int32, [4]> var_9783 = const()[name = string("op_9783"), val = tensor<int32, [4]>([1, -1, 1, 1])];
+            tensor<fp16, [1, 2048, 1, 1]> input_193_cast_fp16 = reshape(shape = var_9783, x = attn_49_cast_fp16)[name = string("input_193_cast_fp16")];
+            string obj_203_pad_type_0 = const()[name = string("obj_203_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> obj_203_strides_0 = const()[name = string("obj_203_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> obj_203_pad_0 = const()[name = string("obj_203_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> obj_203_dilations_0 = const()[name = string("obj_203_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 obj_203_groups_0 = const()[name = string("obj_203_groups_0"), val = int32(1)];
+            tensor<fp16, [2048, 2048, 1, 1]> layers_24_self_attn_o_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [2048, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(1216750976))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(1220945344))))[name = string("layers_24_self_attn_o_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1, 1]> obj_203_cast_fp16 = conv(bias = layers_0_self_attn_q_proj_bias_to_fp16, dilations = obj_203_dilations_0, groups = obj_203_groups_0, pad = obj_203_pad_0, pad_type = obj_203_pad_type_0, strides = obj_203_strides_0, weight = layers_24_self_attn_o_proj_weight_to_fp16_palettized, x = input_193_cast_fp16)[name = string("obj_203_cast_fp16")];
+            tensor<fp16, [1, 2048, 1, 1]> inputs_197_cast_fp16 = add(x = inputs_191_cast_fp16, y = obj_203_cast_fp16)[name = string("inputs_197_cast_fp16")];
+            tensor<fp16, [1, 2048, 1, 1]> inputs_sq_199_cast_fp16 = mul(x = inputs_197_cast_fp16, y = inputs_197_cast_fp16)[name = string("inputs_sq_199_cast_fp16")];
+            tensor<int32, [1]> variance_199_axes_0 = const()[name = string("variance_199_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_199_keep_dims_0 = const()[name = string("variance_199_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 1, 1, 1]> variance_199_cast_fp16 = reduce_mean(axes = variance_199_axes_0, keep_dims = variance_199_keep_dims_0, x = inputs_sq_199_cast_fp16)[name = string("variance_199_cast_fp16")];
+            fp16 var_9801_to_fp16 = const()[name = string("op_9801_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 1, 1]> var_9802_cast_fp16 = add(x = variance_199_cast_fp16, y = var_9801_to_fp16)[name = string("op_9802_cast_fp16")];
+            fp32 var_9803_epsilon_0 = const()[name = string("op_9803_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 1, 1, 1]> var_9803_cast_fp16 = rsqrt(epsilon = var_9803_epsilon_0, x = var_9802_cast_fp16)[name = string("op_9803_cast_fp16")];
+            tensor<fp16, [1, 2048, 1, 1]> hidden_states_247_cast_fp16 = mul(x = inputs_197_cast_fp16, y = var_9803_cast_fp16)[name = string("hidden_states_247_cast_fp16")];
+            tensor<fp16, [1, 2048, 1, 1]> w_199_to_fp16 = const()[name = string("w_199_to_fp16"), val = tensor<fp16, [1, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(1220945920)))];
+            tensor<fp16, [1, 2048, 1, 1]> input_195_cast_fp16 = mul(x = w_199_to_fp16, y = hidden_states_247_cast_fp16)[name = string("input_195_cast_fp16")];
+            string input_197_pad_type_0 = const()[name = string("input_197_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> input_197_strides_0 = const()[name = string("input_197_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> input_197_pad_0 = const()[name = string("input_197_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> input_197_dilations_0 = const()[name = string("input_197_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 input_197_groups_0 = const()[name = string("input_197_groups_0"), val = int32(1)];
+            tensor<fp16, [6144, 2048, 1, 1]> layers_24_mlp_gate_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [6144, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(1220950080))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(1233533056))))[name = string("layers_24_mlp_gate_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 6144, 1, 1]> input_197_cast_fp16 = conv(dilations = input_197_dilations_0, groups = input_197_groups_0, pad = input_197_pad_0, pad_type = input_197_pad_type_0, strides = input_197_strides_0, weight = layers_24_mlp_gate_proj_weight_to_fp16_palettized, x = input_195_cast_fp16)[name = string("input_197_cast_fp16")];
+            tensor<fp16, [1, 6144, 1, 1]> var_9817_cast_fp16 = silu(x = input_197_cast_fp16)[name = string("op_9817_cast_fp16")];
+            string var_9823_pad_type_0 = const()[name = string("op_9823_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_9823_strides_0 = const()[name = string("op_9823_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_9823_pad_0 = const()[name = string("op_9823_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_9823_dilations_0 = const()[name = string("op_9823_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_9823_groups_0 = const()[name = string("op_9823_groups_0"), val = int32(1)];
+            tensor<fp16, [6144, 2048, 1, 1]> layers_24_mlp_up_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [6144, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(1233533632))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(1246116608))))[name = string("layers_24_mlp_up_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 6144, 1, 1]> var_9823_cast_fp16 = conv(dilations = var_9823_dilations_0, groups = var_9823_groups_0, pad = var_9823_pad_0, pad_type = var_9823_pad_type_0, strides = var_9823_strides_0, weight = layers_24_mlp_up_proj_weight_to_fp16_palettized, x = input_195_cast_fp16)[name = string("op_9823_cast_fp16")];
+            tensor<fp16, [1, 6144, 1, 1]> input_199_cast_fp16 = mul(x = var_9817_cast_fp16, y = var_9823_cast_fp16)[name = string("input_199_cast_fp16")];
+            string hidden_states_249_pad_type_0 = const()[name = string("hidden_states_249_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> hidden_states_249_strides_0 = const()[name = string("hidden_states_249_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> hidden_states_249_pad_0 = const()[name = string("hidden_states_249_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> hidden_states_249_dilations_0 = const()[name = string("hidden_states_249_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 hidden_states_249_groups_0 = const()[name = string("hidden_states_249_groups_0"), val = int32(1)];
+            tensor<fp16, [2048, 6144, 1, 1]> layers_24_mlp_down_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [2048, 6144, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(1246117184))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(1258700160))))[name = string("layers_24_mlp_down_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1, 1]> hidden_states_249_cast_fp16 = conv(dilations = hidden_states_249_dilations_0, groups = hidden_states_249_groups_0, pad = hidden_states_249_pad_0, pad_type = hidden_states_249_pad_type_0, strides = hidden_states_249_strides_0, weight = layers_24_mlp_down_proj_weight_to_fp16_palettized, x = input_199_cast_fp16)[name = string("hidden_states_249_cast_fp16")];
+            tensor<fp16, [1, 2048, 1, 1]> inputs_199_cast_fp16 = add(x = inputs_197_cast_fp16, y = hidden_states_249_cast_fp16)[name = string("inputs_199_cast_fp16")];
+            int32 var_9837 = const()[name = string("op_9837"), val = int32(3)];
+            int32 var_9847 = const()[name = string("op_9847"), val = int32(-2)];
+            int32 var_9855 = const()[name = string("op_9855"), val = int32(1)];
+            tensor<fp16, [1, 2048, 1, 1]> inputs_sq_201_cast_fp16 = mul(x = inputs_199_cast_fp16, y = inputs_199_cast_fp16)[name = string("inputs_sq_201_cast_fp16")];
+            tensor<int32, [1]> variance_201_axes_0 = const()[name = string("variance_201_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_201_keep_dims_0 = const()[name = string("variance_201_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 1, 1, 1]> variance_201_cast_fp16 = reduce_mean(axes = variance_201_axes_0, keep_dims = variance_201_keep_dims_0, x = inputs_sq_201_cast_fp16)[name = string("variance_201_cast_fp16")];
+            fp16 var_9867_to_fp16 = const()[name = string("op_9867_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 1, 1]> var_9868_cast_fp16 = add(x = variance_201_cast_fp16, y = var_9867_to_fp16)[name = string("op_9868_cast_fp16")];
+            fp32 var_9869_epsilon_0 = const()[name = string("op_9869_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 1, 1, 1]> var_9869_cast_fp16 = rsqrt(epsilon = var_9869_epsilon_0, x = var_9868_cast_fp16)[name = string("op_9869_cast_fp16")];
+            tensor<fp16, [1, 2048, 1, 1]> hidden_states_251_cast_fp16 = mul(x = inputs_199_cast_fp16, y = var_9869_cast_fp16)[name = string("hidden_states_251_cast_fp16")];
+            tensor<fp16, [1, 2048, 1, 1]> w_201_to_fp16 = const()[name = string("w_201_to_fp16"), val = tensor<fp16, [1, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(1258700736)))];
+            tensor<fp16, [1, 2048, 1, 1]> obj_205_cast_fp16 = mul(x = w_201_to_fp16, y = hidden_states_251_cast_fp16)[name = string("obj_205_cast_fp16")];
+            string query_151_pad_type_0 = const()[name = string("query_151_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> query_151_strides_0 = const()[name = string("query_151_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> query_151_pad_0 = const()[name = string("query_151_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> query_151_dilations_0 = const()[name = string("query_151_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 query_151_groups_0 = const()[name = string("query_151_groups_0"), val = int32(1)];
+            tensor<fp16, [2048, 2048, 1, 1]> layers_25_self_attn_q_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [2048, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(1258704896))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(1262899264))))[name = string("layers_25_self_attn_q_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1, 1]> query_151_cast_fp16 = conv(bias = layers_0_self_attn_q_proj_bias_to_fp16, dilations = query_151_dilations_0, groups = query_151_groups_0, pad = query_151_pad_0, pad_type = query_151_pad_type_0, strides = query_151_strides_0, weight = layers_25_self_attn_q_proj_weight_to_fp16_palettized, x = obj_205_cast_fp16)[name = string("query_151_cast_fp16")];
+            string current_key_101_pad_type_0 = const()[name = string("current_key_101_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> current_key_101_strides_0 = const()[name = string("current_key_101_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> current_key_101_pad_0 = const()[name = string("current_key_101_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> current_key_101_dilations_0 = const()[name = string("current_key_101_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 current_key_101_groups_0 = const()[name = string("current_key_101_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 2048, 1, 1]> layers_25_self_attn_k_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(1262899840))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(1264997056))))[name = string("layers_25_self_attn_k_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> current_key_101_cast_fp16 = conv(dilations = current_key_101_dilations_0, groups = current_key_101_groups_0, pad = current_key_101_pad_0, pad_type = current_key_101_pad_type_0, strides = current_key_101_strides_0, weight = layers_25_self_attn_k_proj_weight_to_fp16_palettized, x = obj_205_cast_fp16)[name = string("current_key_101_cast_fp16")];
+            string current_value_51_pad_type_0 = const()[name = string("current_value_51_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> current_value_51_strides_0 = const()[name = string("current_value_51_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> current_value_51_pad_0 = const()[name = string("current_value_51_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> current_value_51_dilations_0 = const()[name = string("current_value_51_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 current_value_51_groups_0 = const()[name = string("current_value_51_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 2048, 1, 1]> layers_25_self_attn_v_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(1264997632))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(1267094848))))[name = string("layers_25_self_attn_v_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> current_value_51_cast_fp16 = conv(bias = layers_0_self_attn_v_proj_bias_to_fp16, dilations = current_value_51_dilations_0, groups = current_value_51_groups_0, pad = current_value_51_pad_0, pad_type = current_value_51_pad_type_0, strides = current_value_51_strides_0, weight = layers_25_self_attn_v_proj_weight_to_fp16_palettized, x = obj_205_cast_fp16)[name = string("current_value_51_cast_fp16")];
+            tensor<int32, [4]> var_9906 = const()[name = string("op_9906"), val = tensor<int32, [4]>([16, 128, 1, 1])];
+            tensor<fp16, [16, 128, 1, 1]> inputs_201_cast_fp16 = reshape(shape = var_9906, x = query_151_cast_fp16)[name = string("inputs_201_cast_fp16")];
+            tensor<fp16, [16, 128, 1, 1]> inputs_sq_203_cast_fp16 = mul(x = inputs_201_cast_fp16, y = inputs_201_cast_fp16)[name = string("inputs_sq_203_cast_fp16")];
+            tensor<int32, [1]> variance_203_axes_0 = const()[name = string("variance_203_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_203_keep_dims_0 = const()[name = string("variance_203_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [16, 1, 1, 1]> variance_203_cast_fp16 = reduce_mean(axes = variance_203_axes_0, keep_dims = variance_203_keep_dims_0, x = inputs_sq_203_cast_fp16)[name = string("variance_203_cast_fp16")];
+            fp16 var_9912_to_fp16 = const()[name = string("op_9912_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [16, 1, 1, 1]> var_9913_cast_fp16 = add(x = variance_203_cast_fp16, y = var_9912_to_fp16)[name = string("op_9913_cast_fp16")];
+            fp32 var_9914_epsilon_0 = const()[name = string("op_9914_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [16, 1, 1, 1]> var_9914_cast_fp16 = rsqrt(epsilon = var_9914_epsilon_0, x = var_9913_cast_fp16)[name = string("op_9914_cast_fp16")];
+            tensor<fp16, [16, 128, 1, 1]> hidden_states_253_cast_fp16 = mul(x = inputs_201_cast_fp16, y = var_9914_cast_fp16)[name = string("hidden_states_253_cast_fp16")];
+            tensor<fp16, [1, 128, 1, 1]> w_203_to_fp16 = const()[name = string("w_203_to_fp16"), val = tensor<fp16, [1, 128, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(1267095424)))];
+            tensor<fp16, [16, 128, 1, 1]> query_normed_51_cast_fp16 = mul(x = w_203_to_fp16, y = hidden_states_253_cast_fp16)[name = string("query_normed_51_cast_fp16")];
+            tensor<int32, [4]> var_9922 = const()[name = string("op_9922"), val = tensor<int32, [4]>([8, 128, 1, 1])];
+            tensor<fp16, [8, 128, 1, 1]> inputs_203_cast_fp16 = reshape(shape = var_9922, x = current_key_101_cast_fp16)[name = string("inputs_203_cast_fp16")];
+            tensor<fp16, [8, 128, 1, 1]> inputs_sq_205_cast_fp16 = mul(x = inputs_203_cast_fp16, y = inputs_203_cast_fp16)[name = string("inputs_sq_205_cast_fp16")];
+            tensor<int32, [1]> variance_205_axes_0 = const()[name = string("variance_205_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_205_keep_dims_0 = const()[name = string("variance_205_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [8, 1, 1, 1]> variance_205_cast_fp16 = reduce_mean(axes = variance_205_axes_0, keep_dims = variance_205_keep_dims_0, x = inputs_sq_205_cast_fp16)[name = string("variance_205_cast_fp16")];
+            fp16 var_9928_to_fp16 = const()[name = string("op_9928_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [8, 1, 1, 1]> var_9929_cast_fp16 = add(x = variance_205_cast_fp16, y = var_9928_to_fp16)[name = string("op_9929_cast_fp16")];
+            fp32 var_9930_epsilon_0 = const()[name = string("op_9930_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [8, 1, 1, 1]> var_9930_cast_fp16 = rsqrt(epsilon = var_9930_epsilon_0, x = var_9929_cast_fp16)[name = string("op_9930_cast_fp16")];
+            tensor<fp16, [8, 128, 1, 1]> hidden_states_255_cast_fp16 = mul(x = inputs_203_cast_fp16, y = var_9930_cast_fp16)[name = string("hidden_states_255_cast_fp16")];
+            tensor<fp16, [1, 128, 1, 1]> w_205_to_fp16 = const()[name = string("w_205_to_fp16"), val = tensor<fp16, [1, 128, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(1267095744)))];
+            tensor<fp16, [8, 128, 1, 1]> current_key_normed_51_cast_fp16 = mul(x = w_205_to_fp16, y = hidden_states_255_cast_fp16)[name = string("current_key_normed_51_cast_fp16")];
+            tensor<int32, [4]> var_9948 = const()[name = string("op_9948"), val = tensor<int32, [4]>([1, 16, 128, -1])];
+            tensor<fp16, [1, 16, 128, 1]> mh_q_151_cast_fp16 = reshape(shape = var_9948, x = query_normed_51_cast_fp16)[name = string("mh_q_151_cast_fp16")];
+            tensor<int32, [4]> var_9950 = const()[name = string("op_9950"), val = tensor<int32, [4]>([1, 8, 128, -1])];
+            tensor<fp16, [1, 8, 128, 1]> mh_k_101_cast_fp16 = reshape(shape = var_9950, x = current_key_normed_51_cast_fp16)[name = string("mh_k_101_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> var_9954_cast_fp16 = mul(x = mh_q_151_cast_fp16, y = cos_1_cast_fp16)[name = string("op_9954_cast_fp16")];
+            tensor<int32, [4]> var_9959_begin_0 = const()[name = string("op_9959_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_9959_end_0 = const()[name = string("op_9959_end_0"), val = tensor<int32, [4]>([1, 16, 64, 1])];
+            tensor<bool, [4]> var_9959_end_mask_0 = const()[name = string("op_9959_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 16, 64, 1]> var_9959_cast_fp16 = slice_by_index(begin = var_9959_begin_0, end = var_9959_end_0, end_mask = var_9959_end_mask_0, x = mh_q_151_cast_fp16)[name = string("op_9959_cast_fp16")];
+            tensor<int32, [4]> var_9965_begin_0 = const()[name = string("op_9965_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_9965_end_0 = const()[name = string("op_9965_end_0"), val = tensor<int32, [4]>([1, 16, 128, 1])];
+            tensor<bool, [4]> var_9965_end_mask_0 = const()[name = string("op_9965_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 16, 64, 1]> var_9965_cast_fp16 = slice_by_index(begin = var_9965_begin_0, end = var_9965_end_0, end_mask = var_9965_end_mask_0, x = mh_q_151_cast_fp16)[name = string("op_9965_cast_fp16")];
+            fp16 const_592_promoted_to_fp16 = const()[name = string("const_592_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 16, 64, 1]> var_9967_cast_fp16 = mul(x = var_9965_cast_fp16, y = const_592_promoted_to_fp16)[name = string("op_9967_cast_fp16")];
+            bool var_9969_interleave_0 = const()[name = string("op_9969_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 1]> var_9969_cast_fp16 = concat(axis = var_9847, interleave = var_9969_interleave_0, values = (var_9967_cast_fp16, var_9959_cast_fp16))[name = string("op_9969_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> var_9970_cast_fp16 = mul(x = var_9969_cast_fp16, y = sin_1_cast_fp16)[name = string("op_9970_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> mh_q_153_cast_fp16 = add(x = var_9954_cast_fp16, y = var_9970_cast_fp16)[name = string("mh_q_153_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> var_9972_cast_fp16 = mul(x = mh_k_101_cast_fp16, y = cos_1_cast_fp16)[name = string("op_9972_cast_fp16")];
+            tensor<int32, [4]> var_9977_begin_0 = const()[name = string("op_9977_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_9977_end_0 = const()[name = string("op_9977_end_0"), val = tensor<int32, [4]>([1, 8, 64, 1])];
+            tensor<bool, [4]> var_9977_end_mask_0 = const()[name = string("op_9977_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 8, 64, 1]> var_9977_cast_fp16 = slice_by_index(begin = var_9977_begin_0, end = var_9977_end_0, end_mask = var_9977_end_mask_0, x = mh_k_101_cast_fp16)[name = string("op_9977_cast_fp16")];
+            tensor<int32, [4]> var_9983_begin_0 = const()[name = string("op_9983_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_9983_end_0 = const()[name = string("op_9983_end_0"), val = tensor<int32, [4]>([1, 8, 128, 1])];
+            tensor<bool, [4]> var_9983_end_mask_0 = const()[name = string("op_9983_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 8, 64, 1]> var_9983_cast_fp16 = slice_by_index(begin = var_9983_begin_0, end = var_9983_end_0, end_mask = var_9983_end_mask_0, x = mh_k_101_cast_fp16)[name = string("op_9983_cast_fp16")];
+            fp16 const_595_promoted_to_fp16 = const()[name = string("const_595_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 64, 1]> var_9985_cast_fp16 = mul(x = var_9983_cast_fp16, y = const_595_promoted_to_fp16)[name = string("op_9985_cast_fp16")];
+            bool var_9987_interleave_0 = const()[name = string("op_9987_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 128, 1]> var_9987_cast_fp16 = concat(axis = var_9847, interleave = var_9987_interleave_0, values = (var_9985_cast_fp16, var_9977_cast_fp16))[name = string("op_9987_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> var_9988_cast_fp16 = mul(x = var_9987_cast_fp16, y = sin_1_cast_fp16)[name = string("op_9988_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> mh_k_103_cast_fp16 = add(x = var_9972_cast_fp16, y = var_9988_cast_fp16)[name = string("mh_k_103_cast_fp16")];
+            tensor<int32, [4]> var_9992 = const()[name = string("op_9992"), val = tensor<int32, [4]>([1, 1024, 1, 1])];
+            tensor<fp16, [1, 1024, 1, 1]> current_key_103_cast_fp16 = reshape(shape = var_9992, x = mh_k_103_cast_fp16)[name = string("current_key_103_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_9999_cast_fp16 = mul(x = var_101_cast_fp16_25, y = var_323_cast_fp16)[name = string("op_9999_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_10000_cast_fp16 = mul(x = current_key_103_cast_fp16, y = var_321_cast_fp16)[name = string("op_10000_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> key_153_cast_fp16 = add(x = var_9999_cast_fp16, y = var_10000_cast_fp16)[name = string("key_153_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_10003_cast_fp16 = mul(x = var_132_cast_fp16_25, y = var_323_cast_fp16)[name = string("op_10003_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_10004_cast_fp16 = mul(x = current_value_51_cast_fp16, y = var_321_cast_fp16)[name = string("op_10004_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> value_101_cast_fp16 = add(x = var_10003_cast_fp16, y = var_10004_cast_fp16)[name = string("value_101_cast_fp16")];
+            tensor<int32, [4]> var_10008 = const()[name = string("op_10008"), val = tensor<int32, [4]>([1, 8, 128, 256])];
+            tensor<fp16, [1, 8, 128, 256]> key_heads_101_cast_fp16 = reshape(shape = var_10008, x = key_153_cast_fp16)[name = string("key_heads_101_cast_fp16")];
+            tensor<int32, [4]> var_10010 = const()[name = string("op_10010"), val = tensor<int32, [4]>([1, 8, 128, 256])];
+            tensor<fp16, [1, 8, 128, 256]> value_heads_101_cast_fp16 = reshape(shape = var_10010, x = value_101_cast_fp16)[name = string("value_heads_101_cast_fp16")];
+            tensor<int32, [4]> var_10013_begin_0 = const()[name = string("op_10013_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_10013_end_0 = const()[name = string("op_10013_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_10013_end_mask_0 = const()[name = string("op_10013_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_10013_cast_fp16 = slice_by_index(begin = var_10013_begin_0, end = var_10013_end_0, end_mask = var_10013_end_mask_0, x = key_heads_101_cast_fp16)[name = string("op_10013_cast_fp16")];
+            tensor<int32, [4]> var_10017_begin_0 = const()[name = string("op_10017_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_10017_end_0 = const()[name = string("op_10017_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_10017_end_mask_0 = const()[name = string("op_10017_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_10017_cast_fp16 = slice_by_index(begin = var_10017_begin_0, end = var_10017_end_0, end_mask = var_10017_end_mask_0, x = value_heads_101_cast_fp16)[name = string("op_10017_cast_fp16")];
+            tensor<int32, [4]> var_10029_begin_0 = const()[name = string("op_10029_begin_0"), val = tensor<int32, [4]>([0, 1, 0, 0])];
+            tensor<int32, [4]> var_10029_end_0 = const()[name = string("op_10029_end_0"), val = tensor<int32, [4]>([1, 2, 128, 256])];
+            tensor<bool, [4]> var_10029_end_mask_0 = const()[name = string("op_10029_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_10029_cast_fp16 = slice_by_index(begin = var_10029_begin_0, end = var_10029_end_0, end_mask = var_10029_end_mask_0, x = key_heads_101_cast_fp16)[name = string("op_10029_cast_fp16")];
+            tensor<int32, [4]> var_10033_begin_0 = const()[name = string("op_10033_begin_0"), val = tensor<int32, [4]>([0, 1, 0, 0])];
+            tensor<int32, [4]> var_10033_end_0 = const()[name = string("op_10033_end_0"), val = tensor<int32, [4]>([1, 2, 128, 256])];
+            tensor<bool, [4]> var_10033_end_mask_0 = const()[name = string("op_10033_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_10033_cast_fp16 = slice_by_index(begin = var_10033_begin_0, end = var_10033_end_0, end_mask = var_10033_end_mask_0, x = value_heads_101_cast_fp16)[name = string("op_10033_cast_fp16")];
+            tensor<int32, [4]> var_10045_begin_0 = const()[name = string("op_10045_begin_0"), val = tensor<int32, [4]>([0, 2, 0, 0])];
+            tensor<int32, [4]> var_10045_end_0 = const()[name = string("op_10045_end_0"), val = tensor<int32, [4]>([1, 3, 128, 256])];
+            tensor<bool, [4]> var_10045_end_mask_0 = const()[name = string("op_10045_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_10045_cast_fp16 = slice_by_index(begin = var_10045_begin_0, end = var_10045_end_0, end_mask = var_10045_end_mask_0, x = key_heads_101_cast_fp16)[name = string("op_10045_cast_fp16")];
+            tensor<int32, [4]> var_10049_begin_0 = const()[name = string("op_10049_begin_0"), val = tensor<int32, [4]>([0, 2, 0, 0])];
+            tensor<int32, [4]> var_10049_end_0 = const()[name = string("op_10049_end_0"), val = tensor<int32, [4]>([1, 3, 128, 256])];
+            tensor<bool, [4]> var_10049_end_mask_0 = const()[name = string("op_10049_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_10049_cast_fp16 = slice_by_index(begin = var_10049_begin_0, end = var_10049_end_0, end_mask = var_10049_end_mask_0, x = value_heads_101_cast_fp16)[name = string("op_10049_cast_fp16")];
+            tensor<int32, [4]> var_10061_begin_0 = const()[name = string("op_10061_begin_0"), val = tensor<int32, [4]>([0, 3, 0, 0])];
+            tensor<int32, [4]> var_10061_end_0 = const()[name = string("op_10061_end_0"), val = tensor<int32, [4]>([1, 4, 128, 256])];
+            tensor<bool, [4]> var_10061_end_mask_0 = const()[name = string("op_10061_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_10061_cast_fp16 = slice_by_index(begin = var_10061_begin_0, end = var_10061_end_0, end_mask = var_10061_end_mask_0, x = key_heads_101_cast_fp16)[name = string("op_10061_cast_fp16")];
+            tensor<int32, [4]> var_10065_begin_0 = const()[name = string("op_10065_begin_0"), val = tensor<int32, [4]>([0, 3, 0, 0])];
+            tensor<int32, [4]> var_10065_end_0 = const()[name = string("op_10065_end_0"), val = tensor<int32, [4]>([1, 4, 128, 256])];
+            tensor<bool, [4]> var_10065_end_mask_0 = const()[name = string("op_10065_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_10065_cast_fp16 = slice_by_index(begin = var_10065_begin_0, end = var_10065_end_0, end_mask = var_10065_end_mask_0, x = value_heads_101_cast_fp16)[name = string("op_10065_cast_fp16")];
+            tensor<int32, [4]> var_10077_begin_0 = const()[name = string("op_10077_begin_0"), val = tensor<int32, [4]>([0, 4, 0, 0])];
+            tensor<int32, [4]> var_10077_end_0 = const()[name = string("op_10077_end_0"), val = tensor<int32, [4]>([1, 5, 128, 256])];
+            tensor<bool, [4]> var_10077_end_mask_0 = const()[name = string("op_10077_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_10077_cast_fp16 = slice_by_index(begin = var_10077_begin_0, end = var_10077_end_0, end_mask = var_10077_end_mask_0, x = key_heads_101_cast_fp16)[name = string("op_10077_cast_fp16")];
+            tensor<int32, [4]> var_10081_begin_0 = const()[name = string("op_10081_begin_0"), val = tensor<int32, [4]>([0, 4, 0, 0])];
+            tensor<int32, [4]> var_10081_end_0 = const()[name = string("op_10081_end_0"), val = tensor<int32, [4]>([1, 5, 128, 256])];
+            tensor<bool, [4]> var_10081_end_mask_0 = const()[name = string("op_10081_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_10081_cast_fp16 = slice_by_index(begin = var_10081_begin_0, end = var_10081_end_0, end_mask = var_10081_end_mask_0, x = value_heads_101_cast_fp16)[name = string("op_10081_cast_fp16")];
+            tensor<int32, [4]> var_10093_begin_0 = const()[name = string("op_10093_begin_0"), val = tensor<int32, [4]>([0, 5, 0, 0])];
+            tensor<int32, [4]> var_10093_end_0 = const()[name = string("op_10093_end_0"), val = tensor<int32, [4]>([1, 6, 128, 256])];
+            tensor<bool, [4]> var_10093_end_mask_0 = const()[name = string("op_10093_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_10093_cast_fp16 = slice_by_index(begin = var_10093_begin_0, end = var_10093_end_0, end_mask = var_10093_end_mask_0, x = key_heads_101_cast_fp16)[name = string("op_10093_cast_fp16")];
+            tensor<int32, [4]> var_10097_begin_0 = const()[name = string("op_10097_begin_0"), val = tensor<int32, [4]>([0, 5, 0, 0])];
+            tensor<int32, [4]> var_10097_end_0 = const()[name = string("op_10097_end_0"), val = tensor<int32, [4]>([1, 6, 128, 256])];
+            tensor<bool, [4]> var_10097_end_mask_0 = const()[name = string("op_10097_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_10097_cast_fp16 = slice_by_index(begin = var_10097_begin_0, end = var_10097_end_0, end_mask = var_10097_end_mask_0, x = value_heads_101_cast_fp16)[name = string("op_10097_cast_fp16")];
+            tensor<int32, [4]> var_10109_begin_0 = const()[name = string("op_10109_begin_0"), val = tensor<int32, [4]>([0, 6, 0, 0])];
+            tensor<int32, [4]> var_10109_end_0 = const()[name = string("op_10109_end_0"), val = tensor<int32, [4]>([1, 7, 128, 256])];
+            tensor<bool, [4]> var_10109_end_mask_0 = const()[name = string("op_10109_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_10109_cast_fp16 = slice_by_index(begin = var_10109_begin_0, end = var_10109_end_0, end_mask = var_10109_end_mask_0, x = key_heads_101_cast_fp16)[name = string("op_10109_cast_fp16")];
+            tensor<int32, [4]> var_10113_begin_0 = const()[name = string("op_10113_begin_0"), val = tensor<int32, [4]>([0, 6, 0, 0])];
+            tensor<int32, [4]> var_10113_end_0 = const()[name = string("op_10113_end_0"), val = tensor<int32, [4]>([1, 7, 128, 256])];
+            tensor<bool, [4]> var_10113_end_mask_0 = const()[name = string("op_10113_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_10113_cast_fp16 = slice_by_index(begin = var_10113_begin_0, end = var_10113_end_0, end_mask = var_10113_end_mask_0, x = value_heads_101_cast_fp16)[name = string("op_10113_cast_fp16")];
+            tensor<int32, [4]> var_10125_begin_0 = const()[name = string("op_10125_begin_0"), val = tensor<int32, [4]>([0, 7, 0, 0])];
+            tensor<int32, [4]> var_10125_end_0 = const()[name = string("op_10125_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_10125_end_mask_0 = const()[name = string("op_10125_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_10125_cast_fp16 = slice_by_index(begin = var_10125_begin_0, end = var_10125_end_0, end_mask = var_10125_end_mask_0, x = key_heads_101_cast_fp16)[name = string("op_10125_cast_fp16")];
+            tensor<int32, [4]> var_10129_begin_0 = const()[name = string("op_10129_begin_0"), val = tensor<int32, [4]>([0, 7, 0, 0])];
+            tensor<int32, [4]> var_10129_end_0 = const()[name = string("op_10129_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_10129_end_mask_0 = const()[name = string("op_10129_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_10129_cast_fp16 = slice_by_index(begin = var_10129_begin_0, end = var_10129_end_0, end_mask = var_10129_end_mask_0, x = value_heads_101_cast_fp16)[name = string("op_10129_cast_fp16")];
+            bool key_heads_103_interleave_0 = const()[name = string("key_heads_103_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 256]> key_heads_103_cast_fp16 = concat(axis = var_9855, interleave = key_heads_103_interleave_0, values = (var_10013_cast_fp16, var_10013_cast_fp16, var_10029_cast_fp16, var_10029_cast_fp16, var_10045_cast_fp16, var_10045_cast_fp16, var_10061_cast_fp16, var_10061_cast_fp16, var_10077_cast_fp16, var_10077_cast_fp16, var_10093_cast_fp16, var_10093_cast_fp16, var_10109_cast_fp16, var_10109_cast_fp16, var_10125_cast_fp16, var_10125_cast_fp16))[name = string("key_heads_103_cast_fp16")];
+            bool value_heads_103_interleave_0 = const()[name = string("value_heads_103_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 256]> value_heads_103_cast_fp16 = concat(axis = var_9855, interleave = value_heads_103_interleave_0, values = (var_10017_cast_fp16, var_10017_cast_fp16, var_10033_cast_fp16, var_10033_cast_fp16, var_10049_cast_fp16, var_10049_cast_fp16, var_10065_cast_fp16, var_10065_cast_fp16, var_10081_cast_fp16, var_10081_cast_fp16, var_10097_cast_fp16, var_10097_cast_fp16, var_10113_cast_fp16, var_10113_cast_fp16, var_10129_cast_fp16, var_10129_cast_fp16))[name = string("value_heads_103_cast_fp16")];
+            fp16 var_10152_to_fp16 = const()[name = string("op_10152_to_fp16"), val = fp16(0x1.6ap-4)];
+            tensor<fp16, [1, 16, 128, 1]> var_10153_cast_fp16 = mul(x = mh_q_153_cast_fp16, y = var_10152_to_fp16)[name = string("op_10153_cast_fp16")];
+            bool mh_w_101_transpose_x_0 = const()[name = string("mh_w_101_transpose_x_0"), val = bool(true)];
+            bool mh_w_101_transpose_y_0 = const()[name = string("mh_w_101_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 1, 256]> mh_w_101_cast_fp16 = matmul(transpose_x = mh_w_101_transpose_x_0, transpose_y = mh_w_101_transpose_y_0, x = var_10153_cast_fp16, y = key_heads_103_cast_fp16)[name = string("mh_w_101_cast_fp16")];
+            tensor<fp16, [1, 16, 1, 256]> mh_w_103_cast_fp16 = add(x = mh_w_101_cast_fp16, y = var_487_cast_fp16)[name = string("mh_w_103_cast_fp16")];
+            tensor<fp16, [1, 16, 1, 256]> var_10165_cast_fp16 = softmax(axis = var_9837, x = mh_w_103_cast_fp16)[name = string("op_10165_cast_fp16")];
+            bool attn_51_transpose_x_0 = const()[name = string("attn_51_transpose_x_0"), val = bool(false)];
+            bool attn_51_transpose_y_0 = const()[name = string("attn_51_transpose_y_0"), val = bool(true)];
+            tensor<fp16, [1, 16, 128, 1]> attn_51_cast_fp16 = matmul(transpose_x = attn_51_transpose_x_0, transpose_y = attn_51_transpose_y_0, x = value_heads_103_cast_fp16, y = var_10165_cast_fp16)[name = string("attn_51_cast_fp16")];
+            tensor<int32, [4]> var_10170 = const()[name = string("op_10170"), val = tensor<int32, [4]>([1, -1, 1, 1])];
+            tensor<fp16, [1, 2048, 1, 1]> input_201_cast_fp16 = reshape(shape = var_10170, x = attn_51_cast_fp16)[name = string("input_201_cast_fp16")];
+            string obj_211_pad_type_0 = const()[name = string("obj_211_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> obj_211_strides_0 = const()[name = string("obj_211_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> obj_211_pad_0 = const()[name = string("obj_211_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> obj_211_dilations_0 = const()[name = string("obj_211_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 obj_211_groups_0 = const()[name = string("obj_211_groups_0"), val = int32(1)];
+            tensor<fp16, [2048, 2048, 1, 1]> layers_25_self_attn_o_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [2048, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(1267096064))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(1271290432))))[name = string("layers_25_self_attn_o_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1, 1]> obj_211_cast_fp16 = conv(bias = layers_0_self_attn_q_proj_bias_to_fp16, dilations = obj_211_dilations_0, groups = obj_211_groups_0, pad = obj_211_pad_0, pad_type = obj_211_pad_type_0, strides = obj_211_strides_0, weight = layers_25_self_attn_o_proj_weight_to_fp16_palettized, x = input_201_cast_fp16)[name = string("obj_211_cast_fp16")];
+            tensor<fp16, [1, 2048, 1, 1]> inputs_205_cast_fp16 = add(x = inputs_199_cast_fp16, y = obj_211_cast_fp16)[name = string("inputs_205_cast_fp16")];
+            tensor<fp16, [1, 2048, 1, 1]> inputs_sq_207_cast_fp16 = mul(x = inputs_205_cast_fp16, y = inputs_205_cast_fp16)[name = string("inputs_sq_207_cast_fp16")];
+            tensor<int32, [1]> variance_207_axes_0 = const()[name = string("variance_207_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_207_keep_dims_0 = const()[name = string("variance_207_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 1, 1, 1]> variance_207_cast_fp16 = reduce_mean(axes = variance_207_axes_0, keep_dims = variance_207_keep_dims_0, x = inputs_sq_207_cast_fp16)[name = string("variance_207_cast_fp16")];
+            fp16 var_10188_to_fp16 = const()[name = string("op_10188_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 1, 1]> var_10189_cast_fp16 = add(x = variance_207_cast_fp16, y = var_10188_to_fp16)[name = string("op_10189_cast_fp16")];
+            fp32 var_10190_epsilon_0 = const()[name = string("op_10190_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 1, 1, 1]> var_10190_cast_fp16 = rsqrt(epsilon = var_10190_epsilon_0, x = var_10189_cast_fp16)[name = string("op_10190_cast_fp16")];
+            tensor<fp16, [1, 2048, 1, 1]> hidden_states_257_cast_fp16 = mul(x = inputs_205_cast_fp16, y = var_10190_cast_fp16)[name = string("hidden_states_257_cast_fp16")];
+            tensor<fp16, [1, 2048, 1, 1]> w_207_to_fp16 = const()[name = string("w_207_to_fp16"), val = tensor<fp16, [1, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(1271291008)))];
+            tensor<fp16, [1, 2048, 1, 1]> input_203_cast_fp16 = mul(x = w_207_to_fp16, y = hidden_states_257_cast_fp16)[name = string("input_203_cast_fp16")];
+            string input_205_pad_type_0 = const()[name = string("input_205_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> input_205_strides_0 = const()[name = string("input_205_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> input_205_pad_0 = const()[name = string("input_205_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> input_205_dilations_0 = const()[name = string("input_205_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 input_205_groups_0 = const()[name = string("input_205_groups_0"), val = int32(1)];
+            tensor<fp16, [6144, 2048, 1, 1]> layers_25_mlp_gate_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [6144, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(1271295168))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(1283878144))))[name = string("layers_25_mlp_gate_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 6144, 1, 1]> input_205_cast_fp16 = conv(dilations = input_205_dilations_0, groups = input_205_groups_0, pad = input_205_pad_0, pad_type = input_205_pad_type_0, strides = input_205_strides_0, weight = layers_25_mlp_gate_proj_weight_to_fp16_palettized, x = input_203_cast_fp16)[name = string("input_205_cast_fp16")];
+            tensor<fp16, [1, 6144, 1, 1]> var_10204_cast_fp16 = silu(x = input_205_cast_fp16)[name = string("op_10204_cast_fp16")];
+            string var_10210_pad_type_0 = const()[name = string("op_10210_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_10210_strides_0 = const()[name = string("op_10210_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_10210_pad_0 = const()[name = string("op_10210_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_10210_dilations_0 = const()[name = string("op_10210_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_10210_groups_0 = const()[name = string("op_10210_groups_0"), val = int32(1)];
+            tensor<fp16, [6144, 2048, 1, 1]> layers_25_mlp_up_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [6144, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(1283878720))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(1296461696))))[name = string("layers_25_mlp_up_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 6144, 1, 1]> var_10210_cast_fp16 = conv(dilations = var_10210_dilations_0, groups = var_10210_groups_0, pad = var_10210_pad_0, pad_type = var_10210_pad_type_0, strides = var_10210_strides_0, weight = layers_25_mlp_up_proj_weight_to_fp16_palettized, x = input_203_cast_fp16)[name = string("op_10210_cast_fp16")];
+            tensor<fp16, [1, 6144, 1, 1]> input_207_cast_fp16 = mul(x = var_10204_cast_fp16, y = var_10210_cast_fp16)[name = string("input_207_cast_fp16")];
+            string hidden_states_259_pad_type_0 = const()[name = string("hidden_states_259_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> hidden_states_259_strides_0 = const()[name = string("hidden_states_259_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> hidden_states_259_pad_0 = const()[name = string("hidden_states_259_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> hidden_states_259_dilations_0 = const()[name = string("hidden_states_259_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 hidden_states_259_groups_0 = const()[name = string("hidden_states_259_groups_0"), val = int32(1)];
+            tensor<fp16, [2048, 6144, 1, 1]> layers_25_mlp_down_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [2048, 6144, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(1296462272))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(1309045248))))[name = string("layers_25_mlp_down_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1, 1]> hidden_states_259_cast_fp16 = conv(dilations = hidden_states_259_dilations_0, groups = hidden_states_259_groups_0, pad = hidden_states_259_pad_0, pad_type = hidden_states_259_pad_type_0, strides = hidden_states_259_strides_0, weight = layers_25_mlp_down_proj_weight_to_fp16_palettized, x = input_207_cast_fp16)[name = string("hidden_states_259_cast_fp16")];
+            tensor<fp16, [1, 2048, 1, 1]> inputs_207_cast_fp16 = add(x = inputs_205_cast_fp16, y = hidden_states_259_cast_fp16)[name = string("inputs_207_cast_fp16")];
+            int32 var_10224 = const()[name = string("op_10224"), val = int32(3)];
+            int32 var_10234 = const()[name = string("op_10234"), val = int32(-2)];
+            int32 var_10242 = const()[name = string("op_10242"), val = int32(1)];
+            tensor<fp16, [1, 2048, 1, 1]> inputs_sq_209_cast_fp16 = mul(x = inputs_207_cast_fp16, y = inputs_207_cast_fp16)[name = string("inputs_sq_209_cast_fp16")];
+            tensor<int32, [1]> variance_209_axes_0 = const()[name = string("variance_209_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_209_keep_dims_0 = const()[name = string("variance_209_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 1, 1, 1]> variance_209_cast_fp16 = reduce_mean(axes = variance_209_axes_0, keep_dims = variance_209_keep_dims_0, x = inputs_sq_209_cast_fp16)[name = string("variance_209_cast_fp16")];
+            fp16 var_10254_to_fp16 = const()[name = string("op_10254_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 1, 1]> var_10255_cast_fp16 = add(x = variance_209_cast_fp16, y = var_10254_to_fp16)[name = string("op_10255_cast_fp16")];
+            fp32 var_10256_epsilon_0 = const()[name = string("op_10256_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 1, 1, 1]> var_10256_cast_fp16 = rsqrt(epsilon = var_10256_epsilon_0, x = var_10255_cast_fp16)[name = string("op_10256_cast_fp16")];
+            tensor<fp16, [1, 2048, 1, 1]> hidden_states_261_cast_fp16 = mul(x = inputs_207_cast_fp16, y = var_10256_cast_fp16)[name = string("hidden_states_261_cast_fp16")];
+            tensor<fp16, [1, 2048, 1, 1]> w_209_to_fp16 = const()[name = string("w_209_to_fp16"), val = tensor<fp16, [1, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(1309045824)))];
+            tensor<fp16, [1, 2048, 1, 1]> obj_213_cast_fp16 = mul(x = w_209_to_fp16, y = hidden_states_261_cast_fp16)[name = string("obj_213_cast_fp16")];
+            string query_157_pad_type_0 = const()[name = string("query_157_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> query_157_strides_0 = const()[name = string("query_157_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> query_157_pad_0 = const()[name = string("query_157_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> query_157_dilations_0 = const()[name = string("query_157_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 query_157_groups_0 = const()[name = string("query_157_groups_0"), val = int32(1)];
+            tensor<fp16, [2048, 2048, 1, 1]> layers_26_self_attn_q_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [2048, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(1309049984))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(1313244352))))[name = string("layers_26_self_attn_q_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1, 1]> query_157_cast_fp16 = conv(bias = layers_0_self_attn_q_proj_bias_to_fp16, dilations = query_157_dilations_0, groups = query_157_groups_0, pad = query_157_pad_0, pad_type = query_157_pad_type_0, strides = query_157_strides_0, weight = layers_26_self_attn_q_proj_weight_to_fp16_palettized, x = obj_213_cast_fp16)[name = string("query_157_cast_fp16")];
+            string current_key_105_pad_type_0 = const()[name = string("current_key_105_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> current_key_105_strides_0 = const()[name = string("current_key_105_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> current_key_105_pad_0 = const()[name = string("current_key_105_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> current_key_105_dilations_0 = const()[name = string("current_key_105_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 current_key_105_groups_0 = const()[name = string("current_key_105_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 2048, 1, 1]> layers_26_self_attn_k_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(1313244928))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(1315342144))))[name = string("layers_26_self_attn_k_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> current_key_105_cast_fp16 = conv(dilations = current_key_105_dilations_0, groups = current_key_105_groups_0, pad = current_key_105_pad_0, pad_type = current_key_105_pad_type_0, strides = current_key_105_strides_0, weight = layers_26_self_attn_k_proj_weight_to_fp16_palettized, x = obj_213_cast_fp16)[name = string("current_key_105_cast_fp16")];
+            string current_value_53_pad_type_0 = const()[name = string("current_value_53_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> current_value_53_strides_0 = const()[name = string("current_value_53_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> current_value_53_pad_0 = const()[name = string("current_value_53_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> current_value_53_dilations_0 = const()[name = string("current_value_53_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 current_value_53_groups_0 = const()[name = string("current_value_53_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 2048, 1, 1]> layers_26_self_attn_v_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(1315342720))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(1317439936))))[name = string("layers_26_self_attn_v_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> current_value_53_cast_fp16 = conv(bias = layers_0_self_attn_v_proj_bias_to_fp16, dilations = current_value_53_dilations_0, groups = current_value_53_groups_0, pad = current_value_53_pad_0, pad_type = current_value_53_pad_type_0, strides = current_value_53_strides_0, weight = layers_26_self_attn_v_proj_weight_to_fp16_palettized, x = obj_213_cast_fp16)[name = string("current_value_53_cast_fp16")];
+            tensor<int32, [4]> var_10293 = const()[name = string("op_10293"), val = tensor<int32, [4]>([16, 128, 1, 1])];
+            tensor<fp16, [16, 128, 1, 1]> inputs_209_cast_fp16 = reshape(shape = var_10293, x = query_157_cast_fp16)[name = string("inputs_209_cast_fp16")];
+            tensor<fp16, [16, 128, 1, 1]> inputs_sq_211_cast_fp16 = mul(x = inputs_209_cast_fp16, y = inputs_209_cast_fp16)[name = string("inputs_sq_211_cast_fp16")];
+            tensor<int32, [1]> variance_211_axes_0 = const()[name = string("variance_211_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_211_keep_dims_0 = const()[name = string("variance_211_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [16, 1, 1, 1]> variance_211_cast_fp16 = reduce_mean(axes = variance_211_axes_0, keep_dims = variance_211_keep_dims_0, x = inputs_sq_211_cast_fp16)[name = string("variance_211_cast_fp16")];
+            fp16 var_10299_to_fp16 = const()[name = string("op_10299_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [16, 1, 1, 1]> var_10300_cast_fp16 = add(x = variance_211_cast_fp16, y = var_10299_to_fp16)[name = string("op_10300_cast_fp16")];
+            fp32 var_10301_epsilon_0 = const()[name = string("op_10301_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [16, 1, 1, 1]> var_10301_cast_fp16 = rsqrt(epsilon = var_10301_epsilon_0, x = var_10300_cast_fp16)[name = string("op_10301_cast_fp16")];
+            tensor<fp16, [16, 128, 1, 1]> hidden_states_263_cast_fp16 = mul(x = inputs_209_cast_fp16, y = var_10301_cast_fp16)[name = string("hidden_states_263_cast_fp16")];
+            tensor<fp16, [1, 128, 1, 1]> w_211_to_fp16 = const()[name = string("w_211_to_fp16"), val = tensor<fp16, [1, 128, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(1317440512)))];
+            tensor<fp16, [16, 128, 1, 1]> query_normed_53_cast_fp16 = mul(x = w_211_to_fp16, y = hidden_states_263_cast_fp16)[name = string("query_normed_53_cast_fp16")];
+            tensor<int32, [4]> var_10309 = const()[name = string("op_10309"), val = tensor<int32, [4]>([8, 128, 1, 1])];
+            tensor<fp16, [8, 128, 1, 1]> inputs_211_cast_fp16 = reshape(shape = var_10309, x = current_key_105_cast_fp16)[name = string("inputs_211_cast_fp16")];
+            tensor<fp16, [8, 128, 1, 1]> inputs_sq_213_cast_fp16 = mul(x = inputs_211_cast_fp16, y = inputs_211_cast_fp16)[name = string("inputs_sq_213_cast_fp16")];
+            tensor<int32, [1]> variance_213_axes_0 = const()[name = string("variance_213_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_213_keep_dims_0 = const()[name = string("variance_213_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [8, 1, 1, 1]> variance_213_cast_fp16 = reduce_mean(axes = variance_213_axes_0, keep_dims = variance_213_keep_dims_0, x = inputs_sq_213_cast_fp16)[name = string("variance_213_cast_fp16")];
+            fp16 var_10315_to_fp16 = const()[name = string("op_10315_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [8, 1, 1, 1]> var_10316_cast_fp16 = add(x = variance_213_cast_fp16, y = var_10315_to_fp16)[name = string("op_10316_cast_fp16")];
+            fp32 var_10317_epsilon_0 = const()[name = string("op_10317_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [8, 1, 1, 1]> var_10317_cast_fp16 = rsqrt(epsilon = var_10317_epsilon_0, x = var_10316_cast_fp16)[name = string("op_10317_cast_fp16")];
+            tensor<fp16, [8, 128, 1, 1]> hidden_states_265_cast_fp16 = mul(x = inputs_211_cast_fp16, y = var_10317_cast_fp16)[name = string("hidden_states_265_cast_fp16")];
+            tensor<fp16, [1, 128, 1, 1]> w_213_to_fp16 = const()[name = string("w_213_to_fp16"), val = tensor<fp16, [1, 128, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(1317440832)))];
+            tensor<fp16, [8, 128, 1, 1]> current_key_normed_53_cast_fp16 = mul(x = w_213_to_fp16, y = hidden_states_265_cast_fp16)[name = string("current_key_normed_53_cast_fp16")];
+            tensor<int32, [4]> var_10335 = const()[name = string("op_10335"), val = tensor<int32, [4]>([1, 16, 128, -1])];
+            tensor<fp16, [1, 16, 128, 1]> mh_q_157_cast_fp16 = reshape(shape = var_10335, x = query_normed_53_cast_fp16)[name = string("mh_q_157_cast_fp16")];
+            tensor<int32, [4]> var_10337 = const()[name = string("op_10337"), val = tensor<int32, [4]>([1, 8, 128, -1])];
+            tensor<fp16, [1, 8, 128, 1]> mh_k_105_cast_fp16 = reshape(shape = var_10337, x = current_key_normed_53_cast_fp16)[name = string("mh_k_105_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> var_10341_cast_fp16 = mul(x = mh_q_157_cast_fp16, y = cos_1_cast_fp16)[name = string("op_10341_cast_fp16")];
+            tensor<int32, [4]> var_10346_begin_0 = const()[name = string("op_10346_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_10346_end_0 = const()[name = string("op_10346_end_0"), val = tensor<int32, [4]>([1, 16, 64, 1])];
+            tensor<bool, [4]> var_10346_end_mask_0 = const()[name = string("op_10346_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 16, 64, 1]> var_10346_cast_fp16 = slice_by_index(begin = var_10346_begin_0, end = var_10346_end_0, end_mask = var_10346_end_mask_0, x = mh_q_157_cast_fp16)[name = string("op_10346_cast_fp16")];
+            tensor<int32, [4]> var_10352_begin_0 = const()[name = string("op_10352_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_10352_end_0 = const()[name = string("op_10352_end_0"), val = tensor<int32, [4]>([1, 16, 128, 1])];
+            tensor<bool, [4]> var_10352_end_mask_0 = const()[name = string("op_10352_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 16, 64, 1]> var_10352_cast_fp16 = slice_by_index(begin = var_10352_begin_0, end = var_10352_end_0, end_mask = var_10352_end_mask_0, x = mh_q_157_cast_fp16)[name = string("op_10352_cast_fp16")];
+            fp16 const_615_promoted_to_fp16 = const()[name = string("const_615_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 16, 64, 1]> var_10354_cast_fp16 = mul(x = var_10352_cast_fp16, y = const_615_promoted_to_fp16)[name = string("op_10354_cast_fp16")];
+            bool var_10356_interleave_0 = const()[name = string("op_10356_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 1]> var_10356_cast_fp16 = concat(axis = var_10234, interleave = var_10356_interleave_0, values = (var_10354_cast_fp16, var_10346_cast_fp16))[name = string("op_10356_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> var_10357_cast_fp16 = mul(x = var_10356_cast_fp16, y = sin_1_cast_fp16)[name = string("op_10357_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> mh_q_159_cast_fp16 = add(x = var_10341_cast_fp16, y = var_10357_cast_fp16)[name = string("mh_q_159_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> var_10359_cast_fp16 = mul(x = mh_k_105_cast_fp16, y = cos_1_cast_fp16)[name = string("op_10359_cast_fp16")];
+            tensor<int32, [4]> var_10364_begin_0 = const()[name = string("op_10364_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_10364_end_0 = const()[name = string("op_10364_end_0"), val = tensor<int32, [4]>([1, 8, 64, 1])];
+            tensor<bool, [4]> var_10364_end_mask_0 = const()[name = string("op_10364_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 8, 64, 1]> var_10364_cast_fp16 = slice_by_index(begin = var_10364_begin_0, end = var_10364_end_0, end_mask = var_10364_end_mask_0, x = mh_k_105_cast_fp16)[name = string("op_10364_cast_fp16")];
+            tensor<int32, [4]> var_10370_begin_0 = const()[name = string("op_10370_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_10370_end_0 = const()[name = string("op_10370_end_0"), val = tensor<int32, [4]>([1, 8, 128, 1])];
+            tensor<bool, [4]> var_10370_end_mask_0 = const()[name = string("op_10370_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 8, 64, 1]> var_10370_cast_fp16 = slice_by_index(begin = var_10370_begin_0, end = var_10370_end_0, end_mask = var_10370_end_mask_0, x = mh_k_105_cast_fp16)[name = string("op_10370_cast_fp16")];
+            fp16 const_618_promoted_to_fp16 = const()[name = string("const_618_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 64, 1]> var_10372_cast_fp16 = mul(x = var_10370_cast_fp16, y = const_618_promoted_to_fp16)[name = string("op_10372_cast_fp16")];
+            bool var_10374_interleave_0 = const()[name = string("op_10374_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 128, 1]> var_10374_cast_fp16 = concat(axis = var_10234, interleave = var_10374_interleave_0, values = (var_10372_cast_fp16, var_10364_cast_fp16))[name = string("op_10374_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> var_10375_cast_fp16 = mul(x = var_10374_cast_fp16, y = sin_1_cast_fp16)[name = string("op_10375_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> mh_k_107_cast_fp16 = add(x = var_10359_cast_fp16, y = var_10375_cast_fp16)[name = string("mh_k_107_cast_fp16")];
+            tensor<int32, [4]> var_10379 = const()[name = string("op_10379"), val = tensor<int32, [4]>([1, 1024, 1, 1])];
+            tensor<fp16, [1, 1024, 1, 1]> current_key_107_cast_fp16 = reshape(shape = var_10379, x = mh_k_107_cast_fp16)[name = string("current_key_107_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_10386_cast_fp16 = mul(x = var_101_cast_fp16_26, y = var_323_cast_fp16)[name = string("op_10386_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_10387_cast_fp16 = mul(x = current_key_107_cast_fp16, y = var_321_cast_fp16)[name = string("op_10387_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> key_159_cast_fp16 = add(x = var_10386_cast_fp16, y = var_10387_cast_fp16)[name = string("key_159_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_10390_cast_fp16 = mul(x = var_132_cast_fp16_26, y = var_323_cast_fp16)[name = string("op_10390_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_10391_cast_fp16 = mul(x = current_value_53_cast_fp16, y = var_321_cast_fp16)[name = string("op_10391_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> value_105_cast_fp16 = add(x = var_10390_cast_fp16, y = var_10391_cast_fp16)[name = string("value_105_cast_fp16")];
+            tensor<int32, [4]> var_10395 = const()[name = string("op_10395"), val = tensor<int32, [4]>([1, 8, 128, 256])];
+            tensor<fp16, [1, 8, 128, 256]> key_heads_105_cast_fp16 = reshape(shape = var_10395, x = key_159_cast_fp16)[name = string("key_heads_105_cast_fp16")];
+            tensor<int32, [4]> var_10397 = const()[name = string("op_10397"), val = tensor<int32, [4]>([1, 8, 128, 256])];
+            tensor<fp16, [1, 8, 128, 256]> value_heads_105_cast_fp16 = reshape(shape = var_10397, x = value_105_cast_fp16)[name = string("value_heads_105_cast_fp16")];
+            tensor<int32, [4]> var_10400_begin_0 = const()[name = string("op_10400_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_10400_end_0 = const()[name = string("op_10400_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_10400_end_mask_0 = const()[name = string("op_10400_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_10400_cast_fp16 = slice_by_index(begin = var_10400_begin_0, end = var_10400_end_0, end_mask = var_10400_end_mask_0, x = key_heads_105_cast_fp16)[name = string("op_10400_cast_fp16")];
+            tensor<int32, [4]> var_10404_begin_0 = const()[name = string("op_10404_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_10404_end_0 = const()[name = string("op_10404_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_10404_end_mask_0 = const()[name = string("op_10404_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_10404_cast_fp16 = slice_by_index(begin = var_10404_begin_0, end = var_10404_end_0, end_mask = var_10404_end_mask_0, x = value_heads_105_cast_fp16)[name = string("op_10404_cast_fp16")];
+            tensor<int32, [4]> var_10416_begin_0 = const()[name = string("op_10416_begin_0"), val = tensor<int32, [4]>([0, 1, 0, 0])];
+            tensor<int32, [4]> var_10416_end_0 = const()[name = string("op_10416_end_0"), val = tensor<int32, [4]>([1, 2, 128, 256])];
+            tensor<bool, [4]> var_10416_end_mask_0 = const()[name = string("op_10416_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_10416_cast_fp16 = slice_by_index(begin = var_10416_begin_0, end = var_10416_end_0, end_mask = var_10416_end_mask_0, x = key_heads_105_cast_fp16)[name = string("op_10416_cast_fp16")];
+            tensor<int32, [4]> var_10420_begin_0 = const()[name = string("op_10420_begin_0"), val = tensor<int32, [4]>([0, 1, 0, 0])];
+            tensor<int32, [4]> var_10420_end_0 = const()[name = string("op_10420_end_0"), val = tensor<int32, [4]>([1, 2, 128, 256])];
+            tensor<bool, [4]> var_10420_end_mask_0 = const()[name = string("op_10420_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_10420_cast_fp16 = slice_by_index(begin = var_10420_begin_0, end = var_10420_end_0, end_mask = var_10420_end_mask_0, x = value_heads_105_cast_fp16)[name = string("op_10420_cast_fp16")];
+            tensor<int32, [4]> var_10432_begin_0 = const()[name = string("op_10432_begin_0"), val = tensor<int32, [4]>([0, 2, 0, 0])];
+            tensor<int32, [4]> var_10432_end_0 = const()[name = string("op_10432_end_0"), val = tensor<int32, [4]>([1, 3, 128, 256])];
+            tensor<bool, [4]> var_10432_end_mask_0 = const()[name = string("op_10432_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_10432_cast_fp16 = slice_by_index(begin = var_10432_begin_0, end = var_10432_end_0, end_mask = var_10432_end_mask_0, x = key_heads_105_cast_fp16)[name = string("op_10432_cast_fp16")];
+            tensor<int32, [4]> var_10436_begin_0 = const()[name = string("op_10436_begin_0"), val = tensor<int32, [4]>([0, 2, 0, 0])];
+            tensor<int32, [4]> var_10436_end_0 = const()[name = string("op_10436_end_0"), val = tensor<int32, [4]>([1, 3, 128, 256])];
+            tensor<bool, [4]> var_10436_end_mask_0 = const()[name = string("op_10436_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_10436_cast_fp16 = slice_by_index(begin = var_10436_begin_0, end = var_10436_end_0, end_mask = var_10436_end_mask_0, x = value_heads_105_cast_fp16)[name = string("op_10436_cast_fp16")];
+            tensor<int32, [4]> var_10448_begin_0 = const()[name = string("op_10448_begin_0"), val = tensor<int32, [4]>([0, 3, 0, 0])];
+            tensor<int32, [4]> var_10448_end_0 = const()[name = string("op_10448_end_0"), val = tensor<int32, [4]>([1, 4, 128, 256])];
+            tensor<bool, [4]> var_10448_end_mask_0 = const()[name = string("op_10448_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_10448_cast_fp16 = slice_by_index(begin = var_10448_begin_0, end = var_10448_end_0, end_mask = var_10448_end_mask_0, x = key_heads_105_cast_fp16)[name = string("op_10448_cast_fp16")];
+            tensor<int32, [4]> var_10452_begin_0 = const()[name = string("op_10452_begin_0"), val = tensor<int32, [4]>([0, 3, 0, 0])];
+            tensor<int32, [4]> var_10452_end_0 = const()[name = string("op_10452_end_0"), val = tensor<int32, [4]>([1, 4, 128, 256])];
+            tensor<bool, [4]> var_10452_end_mask_0 = const()[name = string("op_10452_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_10452_cast_fp16 = slice_by_index(begin = var_10452_begin_0, end = var_10452_end_0, end_mask = var_10452_end_mask_0, x = value_heads_105_cast_fp16)[name = string("op_10452_cast_fp16")];
+            tensor<int32, [4]> var_10464_begin_0 = const()[name = string("op_10464_begin_0"), val = tensor<int32, [4]>([0, 4, 0, 0])];
+            tensor<int32, [4]> var_10464_end_0 = const()[name = string("op_10464_end_0"), val = tensor<int32, [4]>([1, 5, 128, 256])];
+            tensor<bool, [4]> var_10464_end_mask_0 = const()[name = string("op_10464_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_10464_cast_fp16 = slice_by_index(begin = var_10464_begin_0, end = var_10464_end_0, end_mask = var_10464_end_mask_0, x = key_heads_105_cast_fp16)[name = string("op_10464_cast_fp16")];
+            tensor<int32, [4]> var_10468_begin_0 = const()[name = string("op_10468_begin_0"), val = tensor<int32, [4]>([0, 4, 0, 0])];
+            tensor<int32, [4]> var_10468_end_0 = const()[name = string("op_10468_end_0"), val = tensor<int32, [4]>([1, 5, 128, 256])];
+            tensor<bool, [4]> var_10468_end_mask_0 = const()[name = string("op_10468_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_10468_cast_fp16 = slice_by_index(begin = var_10468_begin_0, end = var_10468_end_0, end_mask = var_10468_end_mask_0, x = value_heads_105_cast_fp16)[name = string("op_10468_cast_fp16")];
+            tensor<int32, [4]> var_10480_begin_0 = const()[name = string("op_10480_begin_0"), val = tensor<int32, [4]>([0, 5, 0, 0])];
+            tensor<int32, [4]> var_10480_end_0 = const()[name = string("op_10480_end_0"), val = tensor<int32, [4]>([1, 6, 128, 256])];
+            tensor<bool, [4]> var_10480_end_mask_0 = const()[name = string("op_10480_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_10480_cast_fp16 = slice_by_index(begin = var_10480_begin_0, end = var_10480_end_0, end_mask = var_10480_end_mask_0, x = key_heads_105_cast_fp16)[name = string("op_10480_cast_fp16")];
+            tensor<int32, [4]> var_10484_begin_0 = const()[name = string("op_10484_begin_0"), val = tensor<int32, [4]>([0, 5, 0, 0])];
+            tensor<int32, [4]> var_10484_end_0 = const()[name = string("op_10484_end_0"), val = tensor<int32, [4]>([1, 6, 128, 256])];
+            tensor<bool, [4]> var_10484_end_mask_0 = const()[name = string("op_10484_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_10484_cast_fp16 = slice_by_index(begin = var_10484_begin_0, end = var_10484_end_0, end_mask = var_10484_end_mask_0, x = value_heads_105_cast_fp16)[name = string("op_10484_cast_fp16")];
+            tensor<int32, [4]> var_10496_begin_0 = const()[name = string("op_10496_begin_0"), val = tensor<int32, [4]>([0, 6, 0, 0])];
+            tensor<int32, [4]> var_10496_end_0 = const()[name = string("op_10496_end_0"), val = tensor<int32, [4]>([1, 7, 128, 256])];
+            tensor<bool, [4]> var_10496_end_mask_0 = const()[name = string("op_10496_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_10496_cast_fp16 = slice_by_index(begin = var_10496_begin_0, end = var_10496_end_0, end_mask = var_10496_end_mask_0, x = key_heads_105_cast_fp16)[name = string("op_10496_cast_fp16")];
+            tensor<int32, [4]> var_10500_begin_0 = const()[name = string("op_10500_begin_0"), val = tensor<int32, [4]>([0, 6, 0, 0])];
+            tensor<int32, [4]> var_10500_end_0 = const()[name = string("op_10500_end_0"), val = tensor<int32, [4]>([1, 7, 128, 256])];
+            tensor<bool, [4]> var_10500_end_mask_0 = const()[name = string("op_10500_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_10500_cast_fp16 = slice_by_index(begin = var_10500_begin_0, end = var_10500_end_0, end_mask = var_10500_end_mask_0, x = value_heads_105_cast_fp16)[name = string("op_10500_cast_fp16")];
+            tensor<int32, [4]> var_10512_begin_0 = const()[name = string("op_10512_begin_0"), val = tensor<int32, [4]>([0, 7, 0, 0])];
+            tensor<int32, [4]> var_10512_end_0 = const()[name = string("op_10512_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_10512_end_mask_0 = const()[name = string("op_10512_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_10512_cast_fp16 = slice_by_index(begin = var_10512_begin_0, end = var_10512_end_0, end_mask = var_10512_end_mask_0, x = key_heads_105_cast_fp16)[name = string("op_10512_cast_fp16")];
+            tensor<int32, [4]> var_10516_begin_0 = const()[name = string("op_10516_begin_0"), val = tensor<int32, [4]>([0, 7, 0, 0])];
+            tensor<int32, [4]> var_10516_end_0 = const()[name = string("op_10516_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_10516_end_mask_0 = const()[name = string("op_10516_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_10516_cast_fp16 = slice_by_index(begin = var_10516_begin_0, end = var_10516_end_0, end_mask = var_10516_end_mask_0, x = value_heads_105_cast_fp16)[name = string("op_10516_cast_fp16")];
+            bool key_heads_107_interleave_0 = const()[name = string("key_heads_107_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 256]> key_heads_107_cast_fp16 = concat(axis = var_10242, interleave = key_heads_107_interleave_0, values = (var_10400_cast_fp16, var_10400_cast_fp16, var_10416_cast_fp16, var_10416_cast_fp16, var_10432_cast_fp16, var_10432_cast_fp16, var_10448_cast_fp16, var_10448_cast_fp16, var_10464_cast_fp16, var_10464_cast_fp16, var_10480_cast_fp16, var_10480_cast_fp16, var_10496_cast_fp16, var_10496_cast_fp16, var_10512_cast_fp16, var_10512_cast_fp16))[name = string("key_heads_107_cast_fp16")];
+            bool value_heads_107_interleave_0 = const()[name = string("value_heads_107_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 256]> value_heads_107_cast_fp16 = concat(axis = var_10242, interleave = value_heads_107_interleave_0, values = (var_10404_cast_fp16, var_10404_cast_fp16, var_10420_cast_fp16, var_10420_cast_fp16, var_10436_cast_fp16, var_10436_cast_fp16, var_10452_cast_fp16, var_10452_cast_fp16, var_10468_cast_fp16, var_10468_cast_fp16, var_10484_cast_fp16, var_10484_cast_fp16, var_10500_cast_fp16, var_10500_cast_fp16, var_10516_cast_fp16, var_10516_cast_fp16))[name = string("value_heads_107_cast_fp16")];
+            fp16 var_10539_to_fp16 = const()[name = string("op_10539_to_fp16"), val = fp16(0x1.6ap-4)];
+            tensor<fp16, [1, 16, 128, 1]> var_10540_cast_fp16 = mul(x = mh_q_159_cast_fp16, y = var_10539_to_fp16)[name = string("op_10540_cast_fp16")];
+            bool mh_w_105_transpose_x_0 = const()[name = string("mh_w_105_transpose_x_0"), val = bool(true)];
+            bool mh_w_105_transpose_y_0 = const()[name = string("mh_w_105_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 1, 256]> mh_w_105_cast_fp16 = matmul(transpose_x = mh_w_105_transpose_x_0, transpose_y = mh_w_105_transpose_y_0, x = var_10540_cast_fp16, y = key_heads_107_cast_fp16)[name = string("mh_w_105_cast_fp16")];
+            tensor<fp16, [1, 16, 1, 256]> mh_w_107_cast_fp16 = add(x = mh_w_105_cast_fp16, y = var_487_cast_fp16)[name = string("mh_w_107_cast_fp16")];
+            tensor<fp16, [1, 16, 1, 256]> var_10552_cast_fp16 = softmax(axis = var_10224, x = mh_w_107_cast_fp16)[name = string("op_10552_cast_fp16")];
+            bool attn_53_transpose_x_0 = const()[name = string("attn_53_transpose_x_0"), val = bool(false)];
+            bool attn_53_transpose_y_0 = const()[name = string("attn_53_transpose_y_0"), val = bool(true)];
+            tensor<fp16, [1, 16, 128, 1]> attn_53_cast_fp16 = matmul(transpose_x = attn_53_transpose_x_0, transpose_y = attn_53_transpose_y_0, x = value_heads_107_cast_fp16, y = var_10552_cast_fp16)[name = string("attn_53_cast_fp16")];
+            tensor<int32, [4]> var_10557 = const()[name = string("op_10557"), val = tensor<int32, [4]>([1, -1, 1, 1])];
+            tensor<fp16, [1, 2048, 1, 1]> input_209_cast_fp16 = reshape(shape = var_10557, x = attn_53_cast_fp16)[name = string("input_209_cast_fp16")];
+            string obj_219_pad_type_0 = const()[name = string("obj_219_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> obj_219_strides_0 = const()[name = string("obj_219_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> obj_219_pad_0 = const()[name = string("obj_219_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> obj_219_dilations_0 = const()[name = string("obj_219_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 obj_219_groups_0 = const()[name = string("obj_219_groups_0"), val = int32(1)];
+            tensor<fp16, [2048, 2048, 1, 1]> layers_26_self_attn_o_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [2048, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(1317441152))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(1321635520))))[name = string("layers_26_self_attn_o_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1, 1]> obj_219_cast_fp16 = conv(bias = layers_0_self_attn_q_proj_bias_to_fp16, dilations = obj_219_dilations_0, groups = obj_219_groups_0, pad = obj_219_pad_0, pad_type = obj_219_pad_type_0, strides = obj_219_strides_0, weight = layers_26_self_attn_o_proj_weight_to_fp16_palettized, x = input_209_cast_fp16)[name = string("obj_219_cast_fp16")];
+            tensor<fp16, [1, 2048, 1, 1]> inputs_213_cast_fp16 = add(x = inputs_207_cast_fp16, y = obj_219_cast_fp16)[name = string("inputs_213_cast_fp16")];
+            tensor<fp16, [1, 2048, 1, 1]> inputs_sq_215_cast_fp16 = mul(x = inputs_213_cast_fp16, y = inputs_213_cast_fp16)[name = string("inputs_sq_215_cast_fp16")];
+            tensor<int32, [1]> variance_215_axes_0 = const()[name = string("variance_215_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_215_keep_dims_0 = const()[name = string("variance_215_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 1, 1, 1]> variance_215_cast_fp16 = reduce_mean(axes = variance_215_axes_0, keep_dims = variance_215_keep_dims_0, x = inputs_sq_215_cast_fp16)[name = string("variance_215_cast_fp16")];
+            fp16 var_10575_to_fp16 = const()[name = string("op_10575_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 1, 1]> var_10576_cast_fp16 = add(x = variance_215_cast_fp16, y = var_10575_to_fp16)[name = string("op_10576_cast_fp16")];
+            fp32 var_10577_epsilon_0 = const()[name = string("op_10577_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 1, 1, 1]> var_10577_cast_fp16 = rsqrt(epsilon = var_10577_epsilon_0, x = var_10576_cast_fp16)[name = string("op_10577_cast_fp16")];
+            tensor<fp16, [1, 2048, 1, 1]> hidden_states_267_cast_fp16 = mul(x = inputs_213_cast_fp16, y = var_10577_cast_fp16)[name = string("hidden_states_267_cast_fp16")];
+            tensor<fp16, [1, 2048, 1, 1]> w_215_to_fp16 = const()[name = string("w_215_to_fp16"), val = tensor<fp16, [1, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(1321636096)))];
+            tensor<fp16, [1, 2048, 1, 1]> input_211_cast_fp16 = mul(x = w_215_to_fp16, y = hidden_states_267_cast_fp16)[name = string("input_211_cast_fp16")];
+            string input_213_pad_type_0 = const()[name = string("input_213_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> input_213_strides_0 = const()[name = string("input_213_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> input_213_pad_0 = const()[name = string("input_213_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> input_213_dilations_0 = const()[name = string("input_213_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 input_213_groups_0 = const()[name = string("input_213_groups_0"), val = int32(1)];
+            tensor<fp16, [6144, 2048, 1, 1]> layers_26_mlp_gate_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [6144, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(1321640256))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(1334223232))))[name = string("layers_26_mlp_gate_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 6144, 1, 1]> input_213_cast_fp16 = conv(dilations = input_213_dilations_0, groups = input_213_groups_0, pad = input_213_pad_0, pad_type = input_213_pad_type_0, strides = input_213_strides_0, weight = layers_26_mlp_gate_proj_weight_to_fp16_palettized, x = input_211_cast_fp16)[name = string("input_213_cast_fp16")];
+            tensor<fp16, [1, 6144, 1, 1]> var_10591_cast_fp16 = silu(x = input_213_cast_fp16)[name = string("op_10591_cast_fp16")];
+            string var_10597_pad_type_0 = const()[name = string("op_10597_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_10597_strides_0 = const()[name = string("op_10597_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_10597_pad_0 = const()[name = string("op_10597_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_10597_dilations_0 = const()[name = string("op_10597_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_10597_groups_0 = const()[name = string("op_10597_groups_0"), val = int32(1)];
+            tensor<fp16, [6144, 2048, 1, 1]> layers_26_mlp_up_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [6144, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(1334223808))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(1346806784))))[name = string("layers_26_mlp_up_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 6144, 1, 1]> var_10597_cast_fp16 = conv(dilations = var_10597_dilations_0, groups = var_10597_groups_0, pad = var_10597_pad_0, pad_type = var_10597_pad_type_0, strides = var_10597_strides_0, weight = layers_26_mlp_up_proj_weight_to_fp16_palettized, x = input_211_cast_fp16)[name = string("op_10597_cast_fp16")];
+            tensor<fp16, [1, 6144, 1, 1]> input_215_cast_fp16 = mul(x = var_10591_cast_fp16, y = var_10597_cast_fp16)[name = string("input_215_cast_fp16")];
+            string hidden_states_269_pad_type_0 = const()[name = string("hidden_states_269_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> hidden_states_269_strides_0 = const()[name = string("hidden_states_269_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> hidden_states_269_pad_0 = const()[name = string("hidden_states_269_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> hidden_states_269_dilations_0 = const()[name = string("hidden_states_269_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 hidden_states_269_groups_0 = const()[name = string("hidden_states_269_groups_0"), val = int32(1)];
+            tensor<fp16, [2048, 6144, 1, 1]> layers_26_mlp_down_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [2048, 6144, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(1346807360))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(1359390336))))[name = string("layers_26_mlp_down_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1, 1]> hidden_states_269_cast_fp16 = conv(dilations = hidden_states_269_dilations_0, groups = hidden_states_269_groups_0, pad = hidden_states_269_pad_0, pad_type = hidden_states_269_pad_type_0, strides = hidden_states_269_strides_0, weight = layers_26_mlp_down_proj_weight_to_fp16_palettized, x = input_215_cast_fp16)[name = string("hidden_states_269_cast_fp16")];
+            tensor<fp16, [1, 2048, 1, 1]> inputs_215_cast_fp16 = add(x = inputs_213_cast_fp16, y = hidden_states_269_cast_fp16)[name = string("inputs_215_cast_fp16")];
+            int32 var_10611 = const()[name = string("op_10611"), val = int32(3)];
+            int32 var_10621 = const()[name = string("op_10621"), val = int32(-2)];
+            int32 var_10629 = const()[name = string("op_10629"), val = int32(1)];
+            tensor<fp16, [1, 2048, 1, 1]> inputs_sq_217_cast_fp16 = mul(x = inputs_215_cast_fp16, y = inputs_215_cast_fp16)[name = string("inputs_sq_217_cast_fp16")];
+            tensor<int32, [1]> variance_217_axes_0 = const()[name = string("variance_217_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_217_keep_dims_0 = const()[name = string("variance_217_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 1, 1, 1]> variance_217_cast_fp16 = reduce_mean(axes = variance_217_axes_0, keep_dims = variance_217_keep_dims_0, x = inputs_sq_217_cast_fp16)[name = string("variance_217_cast_fp16")];
+            fp16 var_10641_to_fp16 = const()[name = string("op_10641_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 1, 1]> var_10642_cast_fp16 = add(x = variance_217_cast_fp16, y = var_10641_to_fp16)[name = string("op_10642_cast_fp16")];
+            fp32 var_10643_epsilon_0 = const()[name = string("op_10643_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 1, 1, 1]> var_10643_cast_fp16 = rsqrt(epsilon = var_10643_epsilon_0, x = var_10642_cast_fp16)[name = string("op_10643_cast_fp16")];
+            tensor<fp16, [1, 2048, 1, 1]> hidden_states_271_cast_fp16 = mul(x = inputs_215_cast_fp16, y = var_10643_cast_fp16)[name = string("hidden_states_271_cast_fp16")];
+            tensor<fp16, [1, 2048, 1, 1]> w_217_to_fp16 = const()[name = string("w_217_to_fp16"), val = tensor<fp16, [1, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(1359390912)))];
+            tensor<fp16, [1, 2048, 1, 1]> obj_221_cast_fp16 = mul(x = w_217_to_fp16, y = hidden_states_271_cast_fp16)[name = string("obj_221_cast_fp16")];
+            string query_163_pad_type_0 = const()[name = string("query_163_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> query_163_strides_0 = const()[name = string("query_163_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> query_163_pad_0 = const()[name = string("query_163_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> query_163_dilations_0 = const()[name = string("query_163_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 query_163_groups_0 = const()[name = string("query_163_groups_0"), val = int32(1)];
+            tensor<fp16, [2048, 2048, 1, 1]> layers_27_self_attn_q_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [2048, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(1359395072))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(1363589440))))[name = string("layers_27_self_attn_q_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1, 1]> query_163_cast_fp16 = conv(bias = layers_0_self_attn_q_proj_bias_to_fp16, dilations = query_163_dilations_0, groups = query_163_groups_0, pad = query_163_pad_0, pad_type = query_163_pad_type_0, strides = query_163_strides_0, weight = layers_27_self_attn_q_proj_weight_to_fp16_palettized, x = obj_221_cast_fp16)[name = string("query_163_cast_fp16")];
+            string current_key_109_pad_type_0 = const()[name = string("current_key_109_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> current_key_109_strides_0 = const()[name = string("current_key_109_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> current_key_109_pad_0 = const()[name = string("current_key_109_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> current_key_109_dilations_0 = const()[name = string("current_key_109_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 current_key_109_groups_0 = const()[name = string("current_key_109_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 2048, 1, 1]> layers_27_self_attn_k_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(1363590016))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(1365687232))))[name = string("layers_27_self_attn_k_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> current_key_109_cast_fp16 = conv(dilations = current_key_109_dilations_0, groups = current_key_109_groups_0, pad = current_key_109_pad_0, pad_type = current_key_109_pad_type_0, strides = current_key_109_strides_0, weight = layers_27_self_attn_k_proj_weight_to_fp16_palettized, x = obj_221_cast_fp16)[name = string("current_key_109_cast_fp16")];
+            string current_value_pad_type_0 = const()[name = string("current_value_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> current_value_strides_0 = const()[name = string("current_value_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> current_value_pad_0 = const()[name = string("current_value_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> current_value_dilations_0 = const()[name = string("current_value_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 current_value_groups_0 = const()[name = string("current_value_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 2048, 1, 1]> layers_27_self_attn_v_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(1365687808))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(1367785024))))[name = string("layers_27_self_attn_v_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> current_value_cast_fp16 = conv(bias = layers_0_self_attn_v_proj_bias_to_fp16, dilations = current_value_dilations_0, groups = current_value_groups_0, pad = current_value_pad_0, pad_type = current_value_pad_type_0, strides = current_value_strides_0, weight = layers_27_self_attn_v_proj_weight_to_fp16_palettized, x = obj_221_cast_fp16)[name = string("current_value_cast_fp16")];
+            tensor<int32, [4]> var_10680 = const()[name = string("op_10680"), val = tensor<int32, [4]>([16, 128, 1, 1])];
+            tensor<fp16, [16, 128, 1, 1]> inputs_217_cast_fp16 = reshape(shape = var_10680, x = query_163_cast_fp16)[name = string("inputs_217_cast_fp16")];
+            tensor<fp16, [16, 128, 1, 1]> inputs_sq_219_cast_fp16 = mul(x = inputs_217_cast_fp16, y = inputs_217_cast_fp16)[name = string("inputs_sq_219_cast_fp16")];
+            tensor<int32, [1]> variance_219_axes_0 = const()[name = string("variance_219_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_219_keep_dims_0 = const()[name = string("variance_219_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [16, 1, 1, 1]> variance_219_cast_fp16 = reduce_mean(axes = variance_219_axes_0, keep_dims = variance_219_keep_dims_0, x = inputs_sq_219_cast_fp16)[name = string("variance_219_cast_fp16")];
+            fp16 var_10686_to_fp16 = const()[name = string("op_10686_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [16, 1, 1, 1]> var_10687_cast_fp16 = add(x = variance_219_cast_fp16, y = var_10686_to_fp16)[name = string("op_10687_cast_fp16")];
+            fp32 var_10688_epsilon_0 = const()[name = string("op_10688_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [16, 1, 1, 1]> var_10688_cast_fp16 = rsqrt(epsilon = var_10688_epsilon_0, x = var_10687_cast_fp16)[name = string("op_10688_cast_fp16")];
+            tensor<fp16, [16, 128, 1, 1]> hidden_states_273_cast_fp16 = mul(x = inputs_217_cast_fp16, y = var_10688_cast_fp16)[name = string("hidden_states_273_cast_fp16")];
+            tensor<fp16, [1, 128, 1, 1]> w_219_to_fp16 = const()[name = string("w_219_to_fp16"), val = tensor<fp16, [1, 128, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(1367785600)))];
+            tensor<fp16, [16, 128, 1, 1]> query_normed_cast_fp16 = mul(x = w_219_to_fp16, y = hidden_states_273_cast_fp16)[name = string("query_normed_cast_fp16")];
+            tensor<int32, [4]> var_10696 = const()[name = string("op_10696"), val = tensor<int32, [4]>([8, 128, 1, 1])];
+            tensor<fp16, [8, 128, 1, 1]> inputs_219_cast_fp16 = reshape(shape = var_10696, x = current_key_109_cast_fp16)[name = string("inputs_219_cast_fp16")];
+            tensor<fp16, [8, 128, 1, 1]> inputs_sq_221_cast_fp16 = mul(x = inputs_219_cast_fp16, y = inputs_219_cast_fp16)[name = string("inputs_sq_221_cast_fp16")];
+            tensor<int32, [1]> variance_221_axes_0 = const()[name = string("variance_221_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_221_keep_dims_0 = const()[name = string("variance_221_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [8, 1, 1, 1]> variance_221_cast_fp16 = reduce_mean(axes = variance_221_axes_0, keep_dims = variance_221_keep_dims_0, x = inputs_sq_221_cast_fp16)[name = string("variance_221_cast_fp16")];
+            fp16 var_10702_to_fp16 = const()[name = string("op_10702_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [8, 1, 1, 1]> var_10703_cast_fp16 = add(x = variance_221_cast_fp16, y = var_10702_to_fp16)[name = string("op_10703_cast_fp16")];
+            fp32 var_10704_epsilon_0 = const()[name = string("op_10704_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [8, 1, 1, 1]> var_10704_cast_fp16 = rsqrt(epsilon = var_10704_epsilon_0, x = var_10703_cast_fp16)[name = string("op_10704_cast_fp16")];
+            tensor<fp16, [8, 128, 1, 1]> hidden_states_275_cast_fp16 = mul(x = inputs_219_cast_fp16, y = var_10704_cast_fp16)[name = string("hidden_states_275_cast_fp16")];
+            tensor<fp16, [1, 128, 1, 1]> w_221_to_fp16 = const()[name = string("w_221_to_fp16"), val = tensor<fp16, [1, 128, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(1367785920)))];
+            tensor<fp16, [8, 128, 1, 1]> current_key_normed_cast_fp16 = mul(x = w_221_to_fp16, y = hidden_states_275_cast_fp16)[name = string("current_key_normed_cast_fp16")];
+            tensor<int32, [4]> var_10722 = const()[name = string("op_10722"), val = tensor<int32, [4]>([1, 16, 128, -1])];
+            tensor<fp16, [1, 16, 128, 1]> mh_q_163_cast_fp16 = reshape(shape = var_10722, x = query_normed_cast_fp16)[name = string("mh_q_163_cast_fp16")];
+            tensor<int32, [4]> var_10724 = const()[name = string("op_10724"), val = tensor<int32, [4]>([1, 8, 128, -1])];
+            tensor<fp16, [1, 8, 128, 1]> mh_k_109_cast_fp16 = reshape(shape = var_10724, x = current_key_normed_cast_fp16)[name = string("mh_k_109_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> var_10728_cast_fp16 = mul(x = mh_q_163_cast_fp16, y = cos_1_cast_fp16)[name = string("op_10728_cast_fp16")];
+            tensor<int32, [4]> var_10733_begin_0 = const()[name = string("op_10733_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_10733_end_0 = const()[name = string("op_10733_end_0"), val = tensor<int32, [4]>([1, 16, 64, 1])];
+            tensor<bool, [4]> var_10733_end_mask_0 = const()[name = string("op_10733_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 16, 64, 1]> var_10733_cast_fp16 = slice_by_index(begin = var_10733_begin_0, end = var_10733_end_0, end_mask = var_10733_end_mask_0, x = mh_q_163_cast_fp16)[name = string("op_10733_cast_fp16")];
+            tensor<int32, [4]> var_10739_begin_0 = const()[name = string("op_10739_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_10739_end_0 = const()[name = string("op_10739_end_0"), val = tensor<int32, [4]>([1, 16, 128, 1])];
+            tensor<bool, [4]> var_10739_end_mask_0 = const()[name = string("op_10739_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 16, 64, 1]> var_10739_cast_fp16 = slice_by_index(begin = var_10739_begin_0, end = var_10739_end_0, end_mask = var_10739_end_mask_0, x = mh_q_163_cast_fp16)[name = string("op_10739_cast_fp16")];
+            fp16 const_638_promoted_to_fp16 = const()[name = string("const_638_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 16, 64, 1]> var_10741_cast_fp16 = mul(x = var_10739_cast_fp16, y = const_638_promoted_to_fp16)[name = string("op_10741_cast_fp16")];
+            bool var_10743_interleave_0 = const()[name = string("op_10743_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 1]> var_10743_cast_fp16 = concat(axis = var_10621, interleave = var_10743_interleave_0, values = (var_10741_cast_fp16, var_10733_cast_fp16))[name = string("op_10743_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> var_10744_cast_fp16 = mul(x = var_10743_cast_fp16, y = sin_1_cast_fp16)[name = string("op_10744_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> mh_q_165_cast_fp16 = add(x = var_10728_cast_fp16, y = var_10744_cast_fp16)[name = string("mh_q_165_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> var_10746_cast_fp16 = mul(x = mh_k_109_cast_fp16, y = cos_1_cast_fp16)[name = string("op_10746_cast_fp16")];
+            tensor<int32, [4]> var_10751_begin_0 = const()[name = string("op_10751_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_10751_end_0 = const()[name = string("op_10751_end_0"), val = tensor<int32, [4]>([1, 8, 64, 1])];
+            tensor<bool, [4]> var_10751_end_mask_0 = const()[name = string("op_10751_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 8, 64, 1]> var_10751_cast_fp16 = slice_by_index(begin = var_10751_begin_0, end = var_10751_end_0, end_mask = var_10751_end_mask_0, x = mh_k_109_cast_fp16)[name = string("op_10751_cast_fp16")];
+            tensor<int32, [4]> var_10757_begin_0 = const()[name = string("op_10757_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_10757_end_0 = const()[name = string("op_10757_end_0"), val = tensor<int32, [4]>([1, 8, 128, 1])];
+            tensor<bool, [4]> var_10757_end_mask_0 = const()[name = string("op_10757_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 8, 64, 1]> var_10757_cast_fp16 = slice_by_index(begin = var_10757_begin_0, end = var_10757_end_0, end_mask = var_10757_end_mask_0, x = mh_k_109_cast_fp16)[name = string("op_10757_cast_fp16")];
+            fp16 const_641_promoted_to_fp16 = const()[name = string("const_641_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 64, 1]> var_10759_cast_fp16 = mul(x = var_10757_cast_fp16, y = const_641_promoted_to_fp16)[name = string("op_10759_cast_fp16")];
+            bool var_10761_interleave_0 = const()[name = string("op_10761_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 128, 1]> var_10761_cast_fp16 = concat(axis = var_10621, interleave = var_10761_interleave_0, values = (var_10759_cast_fp16, var_10751_cast_fp16))[name = string("op_10761_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> var_10762_cast_fp16 = mul(x = var_10761_cast_fp16, y = sin_1_cast_fp16)[name = string("op_10762_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> mh_k_cast_fp16 = add(x = var_10746_cast_fp16, y = var_10762_cast_fp16)[name = string("mh_k_cast_fp16")];
+            tensor<int32, [4]> var_10766 = const()[name = string("op_10766"), val = tensor<int32, [4]>([1, 1024, 1, 1])];
+            tensor<fp16, [1, 1024, 1, 1]> current_key_cast_fp16 = reshape(shape = var_10766, x = mh_k_cast_fp16)[name = string("current_key_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_10773_cast_fp16 = mul(x = var_101_cast_fp16_27, y = var_323_cast_fp16)[name = string("op_10773_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_10774_cast_fp16 = mul(x = current_key_cast_fp16, y = var_321_cast_fp16)[name = string("op_10774_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> key_165_cast_fp16 = add(x = var_10773_cast_fp16, y = var_10774_cast_fp16)[name = string("key_165_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_10777_cast_fp16 = mul(x = var_132_cast_fp16_27, y = var_323_cast_fp16)[name = string("op_10777_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_10778_cast_fp16 = mul(x = current_value_cast_fp16, y = var_321_cast_fp16)[name = string("op_10778_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> value_109_cast_fp16 = add(x = var_10777_cast_fp16, y = var_10778_cast_fp16)[name = string("value_109_cast_fp16")];
+            tensor<int32, [4]> var_10782 = const()[name = string("op_10782"), val = tensor<int32, [4]>([1, 8, 128, 256])];
+            tensor<fp16, [1, 8, 128, 256]> key_heads_109_cast_fp16 = reshape(shape = var_10782, x = key_165_cast_fp16)[name = string("key_heads_109_cast_fp16")];
+            tensor<int32, [4]> var_10784 = const()[name = string("op_10784"), val = tensor<int32, [4]>([1, 8, 128, 256])];
+            tensor<fp16, [1, 8, 128, 256]> value_heads_109_cast_fp16 = reshape(shape = var_10784, x = value_109_cast_fp16)[name = string("value_heads_109_cast_fp16")];
+            tensor<int32, [4]> var_10787_begin_0 = const()[name = string("op_10787_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_10787_end_0 = const()[name = string("op_10787_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_10787_end_mask_0 = const()[name = string("op_10787_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_10787_cast_fp16 = slice_by_index(begin = var_10787_begin_0, end = var_10787_end_0, end_mask = var_10787_end_mask_0, x = key_heads_109_cast_fp16)[name = string("op_10787_cast_fp16")];
+            tensor<int32, [4]> var_10791_begin_0 = const()[name = string("op_10791_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_10791_end_0 = const()[name = string("op_10791_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_10791_end_mask_0 = const()[name = string("op_10791_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_10791_cast_fp16 = slice_by_index(begin = var_10791_begin_0, end = var_10791_end_0, end_mask = var_10791_end_mask_0, x = value_heads_109_cast_fp16)[name = string("op_10791_cast_fp16")];
+            tensor<int32, [4]> var_10803_begin_0 = const()[name = string("op_10803_begin_0"), val = tensor<int32, [4]>([0, 1, 0, 0])];
+            tensor<int32, [4]> var_10803_end_0 = const()[name = string("op_10803_end_0"), val = tensor<int32, [4]>([1, 2, 128, 256])];
+            tensor<bool, [4]> var_10803_end_mask_0 = const()[name = string("op_10803_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_10803_cast_fp16 = slice_by_index(begin = var_10803_begin_0, end = var_10803_end_0, end_mask = var_10803_end_mask_0, x = key_heads_109_cast_fp16)[name = string("op_10803_cast_fp16")];
+            tensor<int32, [4]> var_10807_begin_0 = const()[name = string("op_10807_begin_0"), val = tensor<int32, [4]>([0, 1, 0, 0])];
+            tensor<int32, [4]> var_10807_end_0 = const()[name = string("op_10807_end_0"), val = tensor<int32, [4]>([1, 2, 128, 256])];
+            tensor<bool, [4]> var_10807_end_mask_0 = const()[name = string("op_10807_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_10807_cast_fp16 = slice_by_index(begin = var_10807_begin_0, end = var_10807_end_0, end_mask = var_10807_end_mask_0, x = value_heads_109_cast_fp16)[name = string("op_10807_cast_fp16")];
+            tensor<int32, [4]> var_10819_begin_0 = const()[name = string("op_10819_begin_0"), val = tensor<int32, [4]>([0, 2, 0, 0])];
+            tensor<int32, [4]> var_10819_end_0 = const()[name = string("op_10819_end_0"), val = tensor<int32, [4]>([1, 3, 128, 256])];
+            tensor<bool, [4]> var_10819_end_mask_0 = const()[name = string("op_10819_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_10819_cast_fp16 = slice_by_index(begin = var_10819_begin_0, end = var_10819_end_0, end_mask = var_10819_end_mask_0, x = key_heads_109_cast_fp16)[name = string("op_10819_cast_fp16")];
+            tensor<int32, [4]> var_10823_begin_0 = const()[name = string("op_10823_begin_0"), val = tensor<int32, [4]>([0, 2, 0, 0])];
+            tensor<int32, [4]> var_10823_end_0 = const()[name = string("op_10823_end_0"), val = tensor<int32, [4]>([1, 3, 128, 256])];
+            tensor<bool, [4]> var_10823_end_mask_0 = const()[name = string("op_10823_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_10823_cast_fp16 = slice_by_index(begin = var_10823_begin_0, end = var_10823_end_0, end_mask = var_10823_end_mask_0, x = value_heads_109_cast_fp16)[name = string("op_10823_cast_fp16")];
+            tensor<int32, [4]> var_10835_begin_0 = const()[name = string("op_10835_begin_0"), val = tensor<int32, [4]>([0, 3, 0, 0])];
+            tensor<int32, [4]> var_10835_end_0 = const()[name = string("op_10835_end_0"), val = tensor<int32, [4]>([1, 4, 128, 256])];
+            tensor<bool, [4]> var_10835_end_mask_0 = const()[name = string("op_10835_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_10835_cast_fp16 = slice_by_index(begin = var_10835_begin_0, end = var_10835_end_0, end_mask = var_10835_end_mask_0, x = key_heads_109_cast_fp16)[name = string("op_10835_cast_fp16")];
+            tensor<int32, [4]> var_10839_begin_0 = const()[name = string("op_10839_begin_0"), val = tensor<int32, [4]>([0, 3, 0, 0])];
+            tensor<int32, [4]> var_10839_end_0 = const()[name = string("op_10839_end_0"), val = tensor<int32, [4]>([1, 4, 128, 256])];
+            tensor<bool, [4]> var_10839_end_mask_0 = const()[name = string("op_10839_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_10839_cast_fp16 = slice_by_index(begin = var_10839_begin_0, end = var_10839_end_0, end_mask = var_10839_end_mask_0, x = value_heads_109_cast_fp16)[name = string("op_10839_cast_fp16")];
+            tensor<int32, [4]> var_10851_begin_0 = const()[name = string("op_10851_begin_0"), val = tensor<int32, [4]>([0, 4, 0, 0])];
+            tensor<int32, [4]> var_10851_end_0 = const()[name = string("op_10851_end_0"), val = tensor<int32, [4]>([1, 5, 128, 256])];
+            tensor<bool, [4]> var_10851_end_mask_0 = const()[name = string("op_10851_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_10851_cast_fp16 = slice_by_index(begin = var_10851_begin_0, end = var_10851_end_0, end_mask = var_10851_end_mask_0, x = key_heads_109_cast_fp16)[name = string("op_10851_cast_fp16")];
+            tensor<int32, [4]> var_10855_begin_0 = const()[name = string("op_10855_begin_0"), val = tensor<int32, [4]>([0, 4, 0, 0])];
+            tensor<int32, [4]> var_10855_end_0 = const()[name = string("op_10855_end_0"), val = tensor<int32, [4]>([1, 5, 128, 256])];
+            tensor<bool, [4]> var_10855_end_mask_0 = const()[name = string("op_10855_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_10855_cast_fp16 = slice_by_index(begin = var_10855_begin_0, end = var_10855_end_0, end_mask = var_10855_end_mask_0, x = value_heads_109_cast_fp16)[name = string("op_10855_cast_fp16")];
+            tensor<int32, [4]> var_10867_begin_0 = const()[name = string("op_10867_begin_0"), val = tensor<int32, [4]>([0, 5, 0, 0])];
+            tensor<int32, [4]> var_10867_end_0 = const()[name = string("op_10867_end_0"), val = tensor<int32, [4]>([1, 6, 128, 256])];
+            tensor<bool, [4]> var_10867_end_mask_0 = const()[name = string("op_10867_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_10867_cast_fp16 = slice_by_index(begin = var_10867_begin_0, end = var_10867_end_0, end_mask = var_10867_end_mask_0, x = key_heads_109_cast_fp16)[name = string("op_10867_cast_fp16")];
+            tensor<int32, [4]> var_10871_begin_0 = const()[name = string("op_10871_begin_0"), val = tensor<int32, [4]>([0, 5, 0, 0])];
+            tensor<int32, [4]> var_10871_end_0 = const()[name = string("op_10871_end_0"), val = tensor<int32, [4]>([1, 6, 128, 256])];
+            tensor<bool, [4]> var_10871_end_mask_0 = const()[name = string("op_10871_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_10871_cast_fp16 = slice_by_index(begin = var_10871_begin_0, end = var_10871_end_0, end_mask = var_10871_end_mask_0, x = value_heads_109_cast_fp16)[name = string("op_10871_cast_fp16")];
+            tensor<int32, [4]> var_10883_begin_0 = const()[name = string("op_10883_begin_0"), val = tensor<int32, [4]>([0, 6, 0, 0])];
+            tensor<int32, [4]> var_10883_end_0 = const()[name = string("op_10883_end_0"), val = tensor<int32, [4]>([1, 7, 128, 256])];
+            tensor<bool, [4]> var_10883_end_mask_0 = const()[name = string("op_10883_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_10883_cast_fp16 = slice_by_index(begin = var_10883_begin_0, end = var_10883_end_0, end_mask = var_10883_end_mask_0, x = key_heads_109_cast_fp16)[name = string("op_10883_cast_fp16")];
+            tensor<int32, [4]> var_10887_begin_0 = const()[name = string("op_10887_begin_0"), val = tensor<int32, [4]>([0, 6, 0, 0])];
+            tensor<int32, [4]> var_10887_end_0 = const()[name = string("op_10887_end_0"), val = tensor<int32, [4]>([1, 7, 128, 256])];
+            tensor<bool, [4]> var_10887_end_mask_0 = const()[name = string("op_10887_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_10887_cast_fp16 = slice_by_index(begin = var_10887_begin_0, end = var_10887_end_0, end_mask = var_10887_end_mask_0, x = value_heads_109_cast_fp16)[name = string("op_10887_cast_fp16")];
+            tensor<int32, [4]> var_10899_begin_0 = const()[name = string("op_10899_begin_0"), val = tensor<int32, [4]>([0, 7, 0, 0])];
+            tensor<int32, [4]> var_10899_end_0 = const()[name = string("op_10899_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_10899_end_mask_0 = const()[name = string("op_10899_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_10899_cast_fp16 = slice_by_index(begin = var_10899_begin_0, end = var_10899_end_0, end_mask = var_10899_end_mask_0, x = key_heads_109_cast_fp16)[name = string("op_10899_cast_fp16")];
+            tensor<int32, [4]> var_10903_begin_0 = const()[name = string("op_10903_begin_0"), val = tensor<int32, [4]>([0, 7, 0, 0])];
+            tensor<int32, [4]> var_10903_end_0 = const()[name = string("op_10903_end_0"), val = tensor<int32, [4]>([1, 1, 128, 256])];
+            tensor<bool, [4]> var_10903_end_mask_0 = const()[name = string("op_10903_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 1, 128, 256]> var_10903_cast_fp16 = slice_by_index(begin = var_10903_begin_0, end = var_10903_end_0, end_mask = var_10903_end_mask_0, x = value_heads_109_cast_fp16)[name = string("op_10903_cast_fp16")];
+            bool key_heads_interleave_0 = const()[name = string("key_heads_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 256]> key_heads_cast_fp16 = concat(axis = var_10629, interleave = key_heads_interleave_0, values = (var_10787_cast_fp16, var_10787_cast_fp16, var_10803_cast_fp16, var_10803_cast_fp16, var_10819_cast_fp16, var_10819_cast_fp16, var_10835_cast_fp16, var_10835_cast_fp16, var_10851_cast_fp16, var_10851_cast_fp16, var_10867_cast_fp16, var_10867_cast_fp16, var_10883_cast_fp16, var_10883_cast_fp16, var_10899_cast_fp16, var_10899_cast_fp16))[name = string("key_heads_cast_fp16")];
+            bool value_heads_interleave_0 = const()[name = string("value_heads_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 256]> value_heads_cast_fp16 = concat(axis = var_10629, interleave = value_heads_interleave_0, values = (var_10791_cast_fp16, var_10791_cast_fp16, var_10807_cast_fp16, var_10807_cast_fp16, var_10823_cast_fp16, var_10823_cast_fp16, var_10839_cast_fp16, var_10839_cast_fp16, var_10855_cast_fp16, var_10855_cast_fp16, var_10871_cast_fp16, var_10871_cast_fp16, var_10887_cast_fp16, var_10887_cast_fp16, var_10903_cast_fp16, var_10903_cast_fp16))[name = string("value_heads_cast_fp16")];
+            fp16 var_10926_to_fp16 = const()[name = string("op_10926_to_fp16"), val = fp16(0x1.6ap-4)];
+            tensor<fp16, [1, 16, 128, 1]> var_10927_cast_fp16 = mul(x = mh_q_165_cast_fp16, y = var_10926_to_fp16)[name = string("op_10927_cast_fp16")];
+            bool mh_w_109_transpose_x_0 = const()[name = string("mh_w_109_transpose_x_0"), val = bool(true)];
+            bool mh_w_109_transpose_y_0 = const()[name = string("mh_w_109_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 1, 256]> mh_w_109_cast_fp16 = matmul(transpose_x = mh_w_109_transpose_x_0, transpose_y = mh_w_109_transpose_y_0, x = var_10927_cast_fp16, y = key_heads_cast_fp16)[name = string("mh_w_109_cast_fp16")];
+            tensor<fp16, [1, 16, 1, 256]> mh_w_cast_fp16 = add(x = mh_w_109_cast_fp16, y = var_487_cast_fp16)[name = string("mh_w_cast_fp16")];
+            tensor<fp16, [1, 16, 1, 256]> var_10939_cast_fp16 = softmax(axis = var_10611, x = mh_w_cast_fp16)[name = string("op_10939_cast_fp16")];
+            bool attn_transpose_x_0 = const()[name = string("attn_transpose_x_0"), val = bool(false)];
+            bool attn_transpose_y_0 = const()[name = string("attn_transpose_y_0"), val = bool(true)];
+            tensor<fp16, [1, 16, 128, 1]> attn_cast_fp16 = matmul(transpose_x = attn_transpose_x_0, transpose_y = attn_transpose_y_0, x = value_heads_cast_fp16, y = var_10939_cast_fp16)[name = string("attn_cast_fp16")];
+            tensor<int32, [4]> var_10944 = const()[name = string("op_10944"), val = tensor<int32, [4]>([1, -1, 1, 1])];
+            tensor<fp16, [1, 2048, 1, 1]> input_217_cast_fp16 = reshape(shape = var_10944, x = attn_cast_fp16)[name = string("input_217_cast_fp16")];
+            string obj_pad_type_0 = const()[name = string("obj_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> obj_strides_0 = const()[name = string("obj_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> obj_pad_0 = const()[name = string("obj_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> obj_dilations_0 = const()[name = string("obj_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 obj_groups_0 = const()[name = string("obj_groups_0"), val = int32(1)];
+            tensor<fp16, [2048, 2048, 1, 1]> layers_27_self_attn_o_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [2048, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(1367786240))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(1371980608))))[name = string("layers_27_self_attn_o_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1, 1]> obj_cast_fp16 = conv(bias = layers_0_self_attn_q_proj_bias_to_fp16, dilations = obj_dilations_0, groups = obj_groups_0, pad = obj_pad_0, pad_type = obj_pad_type_0, strides = obj_strides_0, weight = layers_27_self_attn_o_proj_weight_to_fp16_palettized, x = input_217_cast_fp16)[name = string("obj_cast_fp16")];
+            tensor<fp16, [1, 2048, 1, 1]> inputs_221_cast_fp16 = add(x = inputs_215_cast_fp16, y = obj_cast_fp16)[name = string("inputs_221_cast_fp16")];
+            tensor<fp16, [1, 2048, 1, 1]> inputs_sq_223_cast_fp16 = mul(x = inputs_221_cast_fp16, y = inputs_221_cast_fp16)[name = string("inputs_sq_223_cast_fp16")];
+            tensor<int32, [1]> variance_223_axes_0 = const()[name = string("variance_223_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_223_keep_dims_0 = const()[name = string("variance_223_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 1, 1, 1]> variance_223_cast_fp16 = reduce_mean(axes = variance_223_axes_0, keep_dims = variance_223_keep_dims_0, x = inputs_sq_223_cast_fp16)[name = string("variance_223_cast_fp16")];
+            fp16 var_10962_to_fp16 = const()[name = string("op_10962_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 1, 1]> var_10963_cast_fp16 = add(x = variance_223_cast_fp16, y = var_10962_to_fp16)[name = string("op_10963_cast_fp16")];
+            fp32 var_10964_epsilon_0 = const()[name = string("op_10964_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 1, 1, 1]> var_10964_cast_fp16 = rsqrt(epsilon = var_10964_epsilon_0, x = var_10963_cast_fp16)[name = string("op_10964_cast_fp16")];
+            tensor<fp16, [1, 2048, 1, 1]> hidden_states_277_cast_fp16 = mul(x = inputs_221_cast_fp16, y = var_10964_cast_fp16)[name = string("hidden_states_277_cast_fp16")];
+            tensor<fp16, [1, 2048, 1, 1]> w_223_to_fp16 = const()[name = string("w_223_to_fp16"), val = tensor<fp16, [1, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(1371981184)))];
+            tensor<fp16, [1, 2048, 1, 1]> input_219_cast_fp16 = mul(x = w_223_to_fp16, y = hidden_states_277_cast_fp16)[name = string("input_219_cast_fp16")];
+            string input_221_pad_type_0 = const()[name = string("input_221_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> input_221_strides_0 = const()[name = string("input_221_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> input_221_pad_0 = const()[name = string("input_221_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> input_221_dilations_0 = const()[name = string("input_221_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 input_221_groups_0 = const()[name = string("input_221_groups_0"), val = int32(1)];
+            tensor<fp16, [6144, 2048, 1, 1]> layers_27_mlp_gate_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [6144, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(1371985344))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(1384568320))))[name = string("layers_27_mlp_gate_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 6144, 1, 1]> input_221_cast_fp16 = conv(dilations = input_221_dilations_0, groups = input_221_groups_0, pad = input_221_pad_0, pad_type = input_221_pad_type_0, strides = input_221_strides_0, weight = layers_27_mlp_gate_proj_weight_to_fp16_palettized, x = input_219_cast_fp16)[name = string("input_221_cast_fp16")];
+            tensor<fp16, [1, 6144, 1, 1]> var_10978_cast_fp16 = silu(x = input_221_cast_fp16)[name = string("op_10978_cast_fp16")];
+            string var_10984_pad_type_0 = const()[name = string("op_10984_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_10984_strides_0 = const()[name = string("op_10984_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_10984_pad_0 = const()[name = string("op_10984_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_10984_dilations_0 = const()[name = string("op_10984_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_10984_groups_0 = const()[name = string("op_10984_groups_0"), val = int32(1)];
+            tensor<fp16, [6144, 2048, 1, 1]> layers_27_mlp_up_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [6144, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(1384568896))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(1397151872))))[name = string("layers_27_mlp_up_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 6144, 1, 1]> var_10984_cast_fp16 = conv(dilations = var_10984_dilations_0, groups = var_10984_groups_0, pad = var_10984_pad_0, pad_type = var_10984_pad_type_0, strides = var_10984_strides_0, weight = layers_27_mlp_up_proj_weight_to_fp16_palettized, x = input_219_cast_fp16)[name = string("op_10984_cast_fp16")];
+            tensor<fp16, [1, 6144, 1, 1]> input_223_cast_fp16 = mul(x = var_10978_cast_fp16, y = var_10984_cast_fp16)[name = string("input_223_cast_fp16")];
+            string hidden_states_279_pad_type_0 = const()[name = string("hidden_states_279_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> hidden_states_279_strides_0 = const()[name = string("hidden_states_279_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> hidden_states_279_pad_0 = const()[name = string("hidden_states_279_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> hidden_states_279_dilations_0 = const()[name = string("hidden_states_279_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 hidden_states_279_groups_0 = const()[name = string("hidden_states_279_groups_0"), val = int32(1)];
+            tensor<fp16, [2048, 6144, 1, 1]> layers_27_mlp_down_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [2048, 6144, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(1397152448))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(1409735424))))[name = string("layers_27_mlp_down_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1, 1]> hidden_states_279_cast_fp16 = conv(dilations = hidden_states_279_dilations_0, groups = hidden_states_279_groups_0, pad = hidden_states_279_pad_0, pad_type = hidden_states_279_pad_type_0, strides = hidden_states_279_strides_0, weight = layers_27_mlp_down_proj_weight_to_fp16_palettized, x = input_223_cast_fp16)[name = string("hidden_states_279_cast_fp16")];
+            tensor<fp16, [1, 2048, 1, 1]> inputs_cast_fp16 = add(x = inputs_221_cast_fp16, y = hidden_states_279_cast_fp16)[name = string("inputs_cast_fp16")];
+            tensor<fp16, [1, 2048, 1, 1]> inputs_sq_cast_fp16 = mul(x = inputs_cast_fp16, y = inputs_cast_fp16)[name = string("inputs_sq_cast_fp16")];
+            tensor<int32, [1]> variance_axes_0 = const()[name = string("variance_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_keep_dims_0 = const()[name = string("variance_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 1, 1, 1]> variance_cast_fp16 = reduce_mean(axes = variance_axes_0, keep_dims = variance_keep_dims_0, x = inputs_sq_cast_fp16)[name = string("variance_cast_fp16")];
+            fp16 var_11005_to_fp16 = const()[name = string("op_11005_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 1, 1]> var_11006_cast_fp16 = add(x = variance_cast_fp16, y = var_11005_to_fp16)[name = string("op_11006_cast_fp16")];
+            fp32 var_11007_epsilon_0 = const()[name = string("op_11007_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 1, 1, 1]> var_11007_cast_fp16 = rsqrt(epsilon = var_11007_epsilon_0, x = var_11006_cast_fp16)[name = string("op_11007_cast_fp16")];
+            tensor<fp16, [1, 2048, 1, 1]> hidden_states_cast_fp16 = mul(x = inputs_cast_fp16, y = var_11007_cast_fp16)[name = string("hidden_states_cast_fp16")];
+            tensor<fp16, [1, 2048, 1, 1]> w_to_fp16 = const()[name = string("w_to_fp16"), val = tensor<fp16, [1, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(1409736000)))];
+            tensor<fp16, [1, 2048, 1, 1]> hidden_states = mul(x = w_to_fp16, y = hidden_states_cast_fp16)[name = string("input_cast_fp16")];
+            string logits_pad_type_0 = const()[name = string("logits_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> logits_strides_0 = const()[name = string("logits_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> logits_pad_0 = const()[name = string("logits_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> logits_dilations_0 = const()[name = string("logits_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 logits_groups_0 = const()[name = string("logits_groups_0"), val = int32(1)];
+            tensor<fp16, [3072, 2048, 1, 1]> codec_head_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [3072, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(1409740160))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(1416031680))))[name = string("codec_head_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 3072, 1, 1]> logits_cast_fp16 = conv(dilations = logits_dilations_0, groups = logits_groups_0, pad = logits_pad_0, pad_type = logits_pad_type_0, strides = logits_strides_0, weight = codec_head_weight_to_fp16_palettized, x = hidden_states)[name = string("logits_cast_fp16")];
+            tensor<int32, [1]> var_11024_axes_0 = const()[name = string("op_11024_axes_0"), val = tensor<int32, [1]>([3])];
+            tensor<fp16, [1, 3072, 1]> var_11024_cast_fp16 = squeeze(axes = var_11024_axes_0, x = logits_cast_fp16)[name = string("op_11024_cast_fp16")];
+            tensor<int32, [3]> var_11027_perm_0 = const()[name = string("op_11027_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_11029 = const()[name = string("op_11029"), val = int32(1)];
+            bool var_11030_interleave_0 = const()[name = string("op_11030_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 28672, 1, 1]> key_cache_updates = concat(axis = var_11029, interleave = var_11030_interleave_0, values = (current_key_3_cast_fp16, current_key_7_cast_fp16, current_key_11_cast_fp16, current_key_15_cast_fp16, current_key_19_cast_fp16, current_key_23_cast_fp16, current_key_27_cast_fp16, current_key_31_cast_fp16, current_key_35_cast_fp16, current_key_39_cast_fp16, current_key_43_cast_fp16, current_key_47_cast_fp16, current_key_51_cast_fp16, current_key_55_cast_fp16, current_key_59_cast_fp16, current_key_63_cast_fp16, current_key_67_cast_fp16, current_key_71_cast_fp16, current_key_75_cast_fp16, current_key_79_cast_fp16, current_key_83_cast_fp16, current_key_87_cast_fp16, current_key_91_cast_fp16, current_key_95_cast_fp16, current_key_99_cast_fp16, current_key_103_cast_fp16, current_key_107_cast_fp16, current_key_cast_fp16))[name = string("op_11030_cast_fp16")];
+            int32 var_11032 = const()[name = string("op_11032"), val = int32(1)];
+            bool var_11033_interleave_0 = const()[name = string("op_11033_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 28672, 1, 1]> value_cache_updates = concat(axis = var_11032, interleave = var_11033_interleave_0, values = (current_value_1_cast_fp16, current_value_3_cast_fp16, current_value_5_cast_fp16, current_value_7_cast_fp16, current_value_9_cast_fp16, current_value_11_cast_fp16, current_value_13_cast_fp16, current_value_15_cast_fp16, current_value_17_cast_fp16, current_value_19_cast_fp16, current_value_21_cast_fp16, current_value_23_cast_fp16, current_value_25_cast_fp16, current_value_27_cast_fp16, current_value_29_cast_fp16, current_value_31_cast_fp16, current_value_33_cast_fp16, current_value_35_cast_fp16, current_value_37_cast_fp16, current_value_39_cast_fp16, current_value_41_cast_fp16, current_value_43_cast_fp16, current_value_45_cast_fp16, current_value_47_cast_fp16, current_value_49_cast_fp16, current_value_51_cast_fp16, current_value_53_cast_fp16, current_value_cast_fp16))[name = string("op_11033_cast_fp16")];
+            tensor<fp16, [1, 1, 3072]> logits = transpose(perm = var_11027_perm_0, x = var_11024_cast_fp16)[name = string("transpose_0")];
+        } -> (logits, hidden_states, key_cache_updates, value_cache_updates);
+}
\ No newline at end of file
diff --git a/qwen3_tts/code_decoder/12hz-1.7b-customvoice/W8A16-stateful/CodeDecoder.mlmodelc/weights/weight.bin b/qwen3_tts/code_decoder/12hz-1.7b-customvoice/W8A16-stateful/CodeDecoder.mlmodelc/weights/weight.bin
new file mode 100644
index 0000000000000000000000000000000000000000..d2def9008bd5f75bca6494dbec97e3618f98ace6
--- /dev/null
+++ b/qwen3_tts/code_decoder/12hz-1.7b-customvoice/W8A16-stateful/CodeDecoder.mlmodelc/weights/weight.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2dc0cb784a9df3f11cb4c12710e962ab108d84081f25d7e0f068a0eb0aa3ace7
+size 1416032256
diff --git a/qwen3_tts/code_embedder/12hz-0.6b-customvoice/W16A16/CodeEmbedder.mlmodelc/analytics/coremldata.bin b/qwen3_tts/code_embedder/12hz-0.6b-customvoice/W16A16/CodeEmbedder.mlmodelc/analytics/coremldata.bin
new file mode 100644
index 0000000000000000000000000000000000000000..392e99bff74f47dcda99b392bec2e65ec9be33b7
--- /dev/null
+++ b/qwen3_tts/code_embedder/12hz-0.6b-customvoice/W16A16/CodeEmbedder.mlmodelc/analytics/coremldata.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5880de6256184e5a3bdbbdd631e8fccf030f2cf82d8930c36bc9542cd5d31645
+size 243
diff --git a/qwen3_tts/code_embedder/12hz-0.6b-customvoice/W16A16/CodeEmbedder.mlmodelc/coremldata.bin b/qwen3_tts/code_embedder/12hz-0.6b-customvoice/W16A16/CodeEmbedder.mlmodelc/coremldata.bin
new file mode 100644
index 0000000000000000000000000000000000000000..6360cd141dca97d7efe7d6151d4e653eea535afe
--- /dev/null
+++ b/qwen3_tts/code_embedder/12hz-0.6b-customvoice/W16A16/CodeEmbedder.mlmodelc/coremldata.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2d19846f4edbd27cef1e45f4c155aad182318620c53e94897a21622774c604b8
+size 380
diff --git a/qwen3_tts/code_embedder/12hz-0.6b-customvoice/W16A16/CodeEmbedder.mlmodelc/metadata.json b/qwen3_tts/code_embedder/12hz-0.6b-customvoice/W16A16/CodeEmbedder.mlmodelc/metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..28c3029c63b09dfc449e81d74ed5902d7fcda356
--- /dev/null
+++ b/qwen3_tts/code_embedder/12hz-0.6b-customvoice/W16A16/CodeEmbedder.mlmodelc/metadata.json
@@ -0,0 +1,66 @@
+[
+  {
+    "metadataOutputVersion" : "3.0",
+    "storagePrecision" : "Float16",
+    "outputSchema" : [
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 1024 × 1 × 1)",
+        "shortDescription" : "",
+        "shape" : "[1, 1024, 1, 1]",
+        "name" : "input_embeds",
+        "type" : "MultiArray"
+      }
+    ],
+    "modelParameters" : [
+
+    ],
+    "specificationVersion" : 9,
+    "mlProgramOperationTypeHistogram" : {
+      "Ios18.greaterEqual" : 1,
+      "Ios18.add" : 1,
+      "Ios18.cast" : 3,
+      "Select" : 1,
+      "Ios18.gather" : 1,
+      "Ios18.expandDims" : 2
+    },
+    "computePrecision" : "Mixed (Float16, Int16, Int32)",
+    "isUpdatable" : "0",
+    "stateSchema" : [
+
+    ],
+    "availability" : {
+      "macOS" : "15.0",
+      "tvOS" : "18.0",
+      "visionOS" : "2.0",
+      "watchOS" : "11.0",
+      "iOS" : "18.0",
+      "macCatalyst" : "18.0"
+    },
+    "modelType" : {
+      "name" : "MLModelType_mlProgram"
+    },
+    "userDefinedMetadata" : {
+      "com.github.apple.coremltools.conversion_date" : "2026-02-06",
+      "com.github.apple.coremltools.source" : "torch==2.8.0",
+      "com.github.apple.coremltools.version" : "9.0",
+      "com.github.apple.coremltools.source_dialect" : "TorchScript"
+    },
+    "inputSchema" : [
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Int32",
+        "formattedType" : "MultiArray (Int32 1)",
+        "shortDescription" : "",
+        "shape" : "[1]",
+        "name" : "input_ids",
+        "type" : "MultiArray"
+      }
+    ],
+    "generatedClassName" : "CodeEmbedder",
+    "method" : "predict"
+  }
+]
\ No newline at end of file
diff --git a/qwen3_tts/code_embedder/12hz-0.6b-customvoice/W16A16/CodeEmbedder.mlmodelc/model.mil b/qwen3_tts/code_embedder/12hz-0.6b-customvoice/W16A16/CodeEmbedder.mlmodelc/model.mil
new file mode 100644
index 0000000000000000000000000000000000000000..ebedb75fbcda18db00cc8d87d60d6b05ddb2d4ad
--- /dev/null
+++ b/qwen3_tts/code_embedder/12hz-0.6b-customvoice/W16A16/CodeEmbedder.mlmodelc/model.mil
@@ -0,0 +1,26 @@
+program(1.3)
+[buildInfo = dict<string, string>({{"coremlc-component-MIL", "3510.2.1"}, {"coremlc-version", "3500.32.1"}, {"coremltools-component-torch", "2.8.0"}, {"coremltools-source-dialect", "TorchScript"}, {"coremltools-version", "9.0"}})]
+{
+    func main<ios18>(tensor<int32, [1]> input_ids) {
+            int32 embeddings_batch_dims_0 = const()[name = string("embeddings_batch_dims_0"), val = int32(0)];
+            bool embeddings_validate_indices_0 = const()[name = string("embeddings_validate_indices_0"), val = bool(false)];
+            tensor<fp16, [3072, 1024]> codec_embedding_weight_to_fp16 = const()[name = string("codec_embedding_weight_to_fp16"), val = tensor<fp16, [3072, 1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(64)))];
+            string input_ids_to_int16_dtype_0 = const()[name = string("input_ids_to_int16_dtype_0"), val = string("int16")];
+            string cast_2_dtype_0 = const()[name = string("cast_2_dtype_0"), val = string("int32")];
+            int32 greater_equal_0_y_0 = const()[name = string("greater_equal_0_y_0"), val = int32(0)];
+            tensor<int16, [1]> input_ids_to_int16 = cast(dtype = input_ids_to_int16_dtype_0, x = input_ids)[name = string("cast_5")];
+            tensor<int32, [1]> cast_2 = cast(dtype = cast_2_dtype_0, x = input_ids_to_int16)[name = string("cast_4")];
+            tensor<bool, [1]> greater_equal_0 = greater_equal(x = cast_2, y = greater_equal_0_y_0)[name = string("greater_equal_0")];
+            int32 slice_by_index_0 = const()[name = string("slice_by_index_0"), val = int32(3072)];
+            tensor<int32, [1]> add_0 = add(x = cast_2, y = slice_by_index_0)[name = string("add_0")];
+            tensor<int32, [1]> select_0 = select(a = cast_2, b = add_0, cond = greater_equal_0)[name = string("select_0")];
+            int32 embeddings_cast_fp16_cast_uint16_axis_0 = const()[name = string("embeddings_cast_fp16_cast_uint16_axis_0"), val = int32(0)];
+            string select_0_to_int16_dtype_0 = const()[name = string("select_0_to_int16_dtype_0"), val = string("int16")];
+            tensor<int16, [1]> select_0_to_int16 = cast(dtype = select_0_to_int16_dtype_0, x = select_0)[name = string("cast_3")];
+            tensor<fp16, [1, 1024]> embeddings_cast_fp16_cast_uint16_cast_uint16 = gather(axis = embeddings_cast_fp16_cast_uint16_axis_0, batch_dims = embeddings_batch_dims_0, indices = select_0_to_int16, validate_indices = embeddings_validate_indices_0, x = codec_embedding_weight_to_fp16)[name = string("embeddings_cast_fp16_cast_uint16_cast_uint16")];
+            tensor<int32, [1]> var_18_axes_0 = const()[name = string("op_18_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 1024, 1]> var_18_cast_fp16 = expand_dims(axes = var_18_axes_0, x = embeddings_cast_fp16_cast_uint16_cast_uint16)[name = string("op_18_cast_fp16")];
+            tensor<int32, [1]> var_20_axes_0 = const()[name = string("op_20_axes_0"), val = tensor<int32, [1]>([3])];
+            tensor<fp16, [1, 1024, 1, 1]> input_embeds = expand_dims(axes = var_20_axes_0, x = var_18_cast_fp16)[name = string("op_20_cast_fp16")];
+        } -> (input_embeds);
+}
\ No newline at end of file
diff --git a/qwen3_tts/code_embedder/12hz-0.6b-customvoice/W16A16/CodeEmbedder.mlmodelc/weights/weight.bin b/qwen3_tts/code_embedder/12hz-0.6b-customvoice/W16A16/CodeEmbedder.mlmodelc/weights/weight.bin
new file mode 100644
index 0000000000000000000000000000000000000000..395369ff6611d73a276a482d988ceb4e66bf23e6
--- /dev/null
+++ b/qwen3_tts/code_embedder/12hz-0.6b-customvoice/W16A16/CodeEmbedder.mlmodelc/weights/weight.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5bf4059e34abbb08bea47b47e1cc305a08c690c19e42526aa97c14e9dd66b4b4
+size 6291584
diff --git a/qwen3_tts/code_embedder/12hz-1.7b-customvoice/W16A16/CodeEmbedder.mlmodelc/analytics/coremldata.bin b/qwen3_tts/code_embedder/12hz-1.7b-customvoice/W16A16/CodeEmbedder.mlmodelc/analytics/coremldata.bin
new file mode 100644
index 0000000000000000000000000000000000000000..4fe073d9255356d1210d024c61643829c4ae4449
--- /dev/null
+++ b/qwen3_tts/code_embedder/12hz-1.7b-customvoice/W16A16/CodeEmbedder.mlmodelc/analytics/coremldata.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a67276394d403612b21f2042840c89e8b5feeb0d76162b0b10dc9f5be3273242
+size 243
diff --git a/qwen3_tts/code_embedder/12hz-1.7b-customvoice/W16A16/CodeEmbedder.mlmodelc/coremldata.bin b/qwen3_tts/code_embedder/12hz-1.7b-customvoice/W16A16/CodeEmbedder.mlmodelc/coremldata.bin
new file mode 100644
index 0000000000000000000000000000000000000000..222daeb80d3afb4a42e608c93399661e02a723b3
--- /dev/null
+++ b/qwen3_tts/code_embedder/12hz-1.7b-customvoice/W16A16/CodeEmbedder.mlmodelc/coremldata.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:52bea537a5853fa5f98b869ca6c68e052febb5257d5b5a5a14be9148e4599e6b
+size 380
diff --git a/qwen3_tts/code_embedder/12hz-1.7b-customvoice/W16A16/CodeEmbedder.mlmodelc/metadata.json b/qwen3_tts/code_embedder/12hz-1.7b-customvoice/W16A16/CodeEmbedder.mlmodelc/metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..af32edbeb2c0fc891fb49c493bf30cb473655870
--- /dev/null
+++ b/qwen3_tts/code_embedder/12hz-1.7b-customvoice/W16A16/CodeEmbedder.mlmodelc/metadata.json
@@ -0,0 +1,66 @@
+[
+  {
+    "metadataOutputVersion" : "3.0",
+    "storagePrecision" : "Float16",
+    "outputSchema" : [
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 2048 × 1 × 1)",
+        "shortDescription" : "",
+        "shape" : "[1, 2048, 1, 1]",
+        "name" : "input_embeds",
+        "type" : "MultiArray"
+      }
+    ],
+    "modelParameters" : [
+
+    ],
+    "specificationVersion" : 9,
+    "mlProgramOperationTypeHistogram" : {
+      "Ios18.greaterEqual" : 1,
+      "Ios18.add" : 1,
+      "Ios18.cast" : 3,
+      "Select" : 1,
+      "Ios18.gather" : 1,
+      "Ios18.expandDims" : 2
+    },
+    "computePrecision" : "Mixed (Float16, Int16, Int32)",
+    "isUpdatable" : "0",
+    "stateSchema" : [
+
+    ],
+    "availability" : {
+      "macOS" : "15.0",
+      "tvOS" : "18.0",
+      "visionOS" : "2.0",
+      "watchOS" : "11.0",
+      "iOS" : "18.0",
+      "macCatalyst" : "18.0"
+    },
+    "modelType" : {
+      "name" : "MLModelType_mlProgram"
+    },
+    "userDefinedMetadata" : {
+      "com.github.apple.coremltools.conversion_date" : "2026-02-12",
+      "com.github.apple.coremltools.source" : "torch==2.8.0",
+      "com.github.apple.coremltools.version" : "9.0",
+      "com.github.apple.coremltools.source_dialect" : "TorchScript"
+    },
+    "inputSchema" : [
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Int32",
+        "formattedType" : "MultiArray (Int32 1)",
+        "shortDescription" : "",
+        "shape" : "[1]",
+        "name" : "input_ids",
+        "type" : "MultiArray"
+      }
+    ],
+    "generatedClassName" : "CodeEmbedder",
+    "method" : "predict"
+  }
+]
\ No newline at end of file
diff --git a/qwen3_tts/code_embedder/12hz-1.7b-customvoice/W16A16/CodeEmbedder.mlmodelc/model.mil b/qwen3_tts/code_embedder/12hz-1.7b-customvoice/W16A16/CodeEmbedder.mlmodelc/model.mil
new file mode 100644
index 0000000000000000000000000000000000000000..ddf119b66162f4ec8743c9ac87561941c7ad6a78
--- /dev/null
+++ b/qwen3_tts/code_embedder/12hz-1.7b-customvoice/W16A16/CodeEmbedder.mlmodelc/model.mil
@@ -0,0 +1,26 @@
+program(1.3)
+[buildInfo = dict<string, string>({{"coremlc-component-MIL", "3510.2.1"}, {"coremlc-version", "3500.32.1"}, {"coremltools-component-torch", "2.8.0"}, {"coremltools-source-dialect", "TorchScript"}, {"coremltools-version", "9.0"}})]
+{
+    func main<ios18>(tensor<int32, [1]> input_ids) {
+            int32 embeddings_batch_dims_0 = const()[name = string("embeddings_batch_dims_0"), val = int32(0)];
+            bool embeddings_validate_indices_0 = const()[name = string("embeddings_validate_indices_0"), val = bool(false)];
+            tensor<fp16, [3072, 2048]> codec_embedding_weight_to_fp16 = const()[name = string("codec_embedding_weight_to_fp16"), val = tensor<fp16, [3072, 2048]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(64)))];
+            string input_ids_to_int16_dtype_0 = const()[name = string("input_ids_to_int16_dtype_0"), val = string("int16")];
+            string cast_2_dtype_0 = const()[name = string("cast_2_dtype_0"), val = string("int32")];
+            int32 greater_equal_0_y_0 = const()[name = string("greater_equal_0_y_0"), val = int32(0)];
+            tensor<int16, [1]> input_ids_to_int16 = cast(dtype = input_ids_to_int16_dtype_0, x = input_ids)[name = string("cast_5")];
+            tensor<int32, [1]> cast_2 = cast(dtype = cast_2_dtype_0, x = input_ids_to_int16)[name = string("cast_4")];
+            tensor<bool, [1]> greater_equal_0 = greater_equal(x = cast_2, y = greater_equal_0_y_0)[name = string("greater_equal_0")];
+            int32 slice_by_index_0 = const()[name = string("slice_by_index_0"), val = int32(3072)];
+            tensor<int32, [1]> add_0 = add(x = cast_2, y = slice_by_index_0)[name = string("add_0")];
+            tensor<int32, [1]> select_0 = select(a = cast_2, b = add_0, cond = greater_equal_0)[name = string("select_0")];
+            int32 embeddings_cast_fp16_cast_uint16_axis_0 = const()[name = string("embeddings_cast_fp16_cast_uint16_axis_0"), val = int32(0)];
+            string select_0_to_int16_dtype_0 = const()[name = string("select_0_to_int16_dtype_0"), val = string("int16")];
+            tensor<int16, [1]> select_0_to_int16 = cast(dtype = select_0_to_int16_dtype_0, x = select_0)[name = string("cast_3")];
+            tensor<fp16, [1, 2048]> embeddings_cast_fp16_cast_uint16_cast_uint16 = gather(axis = embeddings_cast_fp16_cast_uint16_axis_0, batch_dims = embeddings_batch_dims_0, indices = select_0_to_int16, validate_indices = embeddings_validate_indices_0, x = codec_embedding_weight_to_fp16)[name = string("embeddings_cast_fp16_cast_uint16_cast_uint16")];
+            tensor<int32, [1]> var_18_axes_0 = const()[name = string("op_18_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2048, 1]> var_18_cast_fp16 = expand_dims(axes = var_18_axes_0, x = embeddings_cast_fp16_cast_uint16_cast_uint16)[name = string("op_18_cast_fp16")];
+            tensor<int32, [1]> var_20_axes_0 = const()[name = string("op_20_axes_0"), val = tensor<int32, [1]>([3])];
+            tensor<fp16, [1, 2048, 1, 1]> input_embeds = expand_dims(axes = var_20_axes_0, x = var_18_cast_fp16)[name = string("op_20_cast_fp16")];
+        } -> (input_embeds);
+}
\ No newline at end of file
diff --git a/qwen3_tts/code_embedder/12hz-1.7b-customvoice/W16A16/CodeEmbedder.mlmodelc/weights/weight.bin b/qwen3_tts/code_embedder/12hz-1.7b-customvoice/W16A16/CodeEmbedder.mlmodelc/weights/weight.bin
new file mode 100644
index 0000000000000000000000000000000000000000..56d238add0a50554e5af2c3a76d32933d96c767a
--- /dev/null
+++ b/qwen3_tts/code_embedder/12hz-1.7b-customvoice/W16A16/CodeEmbedder.mlmodelc/weights/weight.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cb500c72e2fb86bbba8b89e6810c64485e177bb4bcfc3725373d2c756f28b624
+size 12583040
diff --git a/qwen3_tts/multi_code_decoder/12hz-0.6b-customvoice/W8A16/MultiCodeDecoder.mlmodelc/analytics/coremldata.bin b/qwen3_tts/multi_code_decoder/12hz-0.6b-customvoice/W8A16/MultiCodeDecoder.mlmodelc/analytics/coremldata.bin
new file mode 100644
index 0000000000000000000000000000000000000000..426d8adc16f363283076cef1d932424dd35a5d10
--- /dev/null
+++ b/qwen3_tts/multi_code_decoder/12hz-0.6b-customvoice/W8A16/MultiCodeDecoder.mlmodelc/analytics/coremldata.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7096ffa7daeda1488227df4f181dd3a628d53238752488eb4f64c08e547992a0
+size 243
diff --git a/qwen3_tts/multi_code_decoder/12hz-0.6b-customvoice/W8A16/MultiCodeDecoder.mlmodelc/coremldata.bin b/qwen3_tts/multi_code_decoder/12hz-0.6b-customvoice/W8A16/MultiCodeDecoder.mlmodelc/coremldata.bin
new file mode 100644
index 0000000000000000000000000000000000000000..f6b8da178e5a53b3cb4debaef161e517768e83e8
--- /dev/null
+++ b/qwen3_tts/multi_code_decoder/12hz-0.6b-customvoice/W8A16/MultiCodeDecoder.mlmodelc/coremldata.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ff1f98f3c58bd419916a56b58d94bf2a203efbbab376e7b12353a6eb7ec394a9
+size 611
diff --git a/qwen3_tts/multi_code_decoder/12hz-0.6b-customvoice/W8A16/MultiCodeDecoder.mlmodelc/metadata.json b/qwen3_tts/multi_code_decoder/12hz-0.6b-customvoice/W8A16/MultiCodeDecoder.mlmodelc/metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..cf124f44f4c572fcb8f859d1af39f00b793b9088
--- /dev/null
+++ b/qwen3_tts/multi_code_decoder/12hz-0.6b-customvoice/W8A16/MultiCodeDecoder.mlmodelc/metadata.json
@@ -0,0 +1,151 @@
+[
+  {
+    "metadataOutputVersion" : "3.0",
+    "storagePrecision" : "Mixed (Float16, Palettized (8 bits), UInt8)",
+    "outputSchema" : [
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 15 × 2048)",
+        "shortDescription" : "",
+        "shape" : "[1, 15, 2048]",
+        "name" : "all_logits",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 5120 × 1 × 1)",
+        "shortDescription" : "",
+        "shape" : "[1, 5120, 1, 1]",
+        "name" : "key_cache_updates",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 5120 × 1 × 1)",
+        "shortDescription" : "",
+        "shape" : "[1, 5120, 1, 1]",
+        "name" : "value_cache_updates",
+        "type" : "MultiArray"
+      }
+    ],
+    "modelParameters" : [
+
+    ],
+    "specificationVersion" : 9,
+    "mlProgramOperationTypeHistogram" : {
+      "Ios18.expandDims" : 8,
+      "Ios18.softmax" : 5,
+      "Ios18.mul" : 123,
+      "Ios18.matmul" : 10,
+      "Ios18.rsqrt" : 21,
+      "Ios16.reduceMean" : 21,
+      "Split" : 2,
+      "Ios18.greaterEqual" : 2,
+      "Select" : 2,
+      "Ios18.gather" : 2,
+      "Ios18.add" : 58,
+      "Ios18.reshape" : 40,
+      "Ios18.constexprLutToDense" : 50,
+      "Ios18.conv" : 50,
+      "Ios18.concat" : 23,
+      "Ios18.cast" : 5,
+      "Ios18.sub" : 1,
+      "Ios18.silu" : 5,
+      "Ios18.transpose" : 1,
+      "Ios18.sliceByIndex" : 100,
+      "Ios18.squeeze" : 15
+    },
+    "computePrecision" : "Mixed (Float16, Float32, Int16, Int32, UInt16)",
+    "isUpdatable" : "0",
+    "stateSchema" : [
+
+    ],
+    "availability" : {
+      "macOS" : "15.0",
+      "tvOS" : "18.0",
+      "visionOS" : "2.0",
+      "watchOS" : "11.0",
+      "iOS" : "18.0",
+      "macCatalyst" : "18.0"
+    },
+    "modelType" : {
+      "name" : "MLModelType_mlProgram"
+    },
+    "userDefinedMetadata" : {
+      "com.github.apple.coremltools.conversion_date" : "2026-02-12",
+      "com.github.apple.coremltools.source" : "torch==2.8.0",
+      "com.github.apple.coremltools.version" : "9.0",
+      "com.github.apple.coremltools.source_dialect" : "TorchScript"
+    },
+    "inputSchema" : [
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 1024 × 1 × 1)",
+        "shortDescription" : "",
+        "shape" : "[1, 1024, 1, 1]",
+        "name" : "input_embeds",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Int32",
+        "formattedType" : "MultiArray (Int32 1)",
+        "shortDescription" : "",
+        "shape" : "[1]",
+        "name" : "cache_length",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 5120 × 1 × 16)",
+        "shortDescription" : "",
+        "shape" : "[1, 5120, 1, 16]",
+        "name" : "key_cache",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 5120 × 1 × 16)",
+        "shortDescription" : "",
+        "shape" : "[1, 5120, 1, 16]",
+        "name" : "value_cache",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 16)",
+        "shortDescription" : "",
+        "shape" : "[1, 16]",
+        "name" : "kv_cache_update_mask",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 16)",
+        "shortDescription" : "",
+        "shape" : "[1, 16]",
+        "name" : "key_padding_mask",
+        "type" : "MultiArray"
+      }
+    ],
+    "generatedClassName" : "MultiCodeDecoder_8_bit",
+    "method" : "predict"
+  }
+]
\ No newline at end of file
diff --git a/qwen3_tts/multi_code_decoder/12hz-0.6b-customvoice/W8A16/MultiCodeDecoder.mlmodelc/model.mil b/qwen3_tts/multi_code_decoder/12hz-0.6b-customvoice/W8A16/MultiCodeDecoder.mlmodelc/model.mil
new file mode 100644
index 0000000000000000000000000000000000000000..baf7a4dd1948a0c892c8c130d9ee45d36d0ab8d7
--- /dev/null
+++ b/qwen3_tts/multi_code_decoder/12hz-0.6b-customvoice/W8A16/MultiCodeDecoder.mlmodelc/model.mil
@@ -0,0 +1,1369 @@
+program(1.3)
+[buildInfo = dict<string, string>({{"coremlc-component-MIL", "3510.2.1"}, {"coremlc-version", "3500.32.1"}})]
+{
+    func main<ios18>(tensor<int32, [1]> cache_length, tensor<fp16, [1, 1024, 1, 1]> input_embeds, tensor<fp16, [1, 5120, 1, 16]> key_cache, tensor<fp16, [1, 16]> key_padding_mask, tensor<fp16, [1, 16]> kv_cache_update_mask, tensor<fp16, [1, 5120, 1, 16]> value_cache) {
+            int32 pos_cos_batch_dims_0 = const()[name = string("pos_cos_batch_dims_0"), val = int32(0)];
+            bool pos_cos_validate_indices_0 = const()[name = string("pos_cos_validate_indices_0"), val = bool(false)];
+            tensor<fp16, [16, 128]> position_embeddings_cos_weight_to_fp16 = const()[name = string("position_embeddings_cos_weight_to_fp16"), val = tensor<fp16, [16, 128]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(64)))];
+            string cache_length_to_int16_dtype_0 = const()[name = string("cache_length_to_int16_dtype_0"), val = string("int16")];
+            string cast_111_dtype_0 = const()[name = string("cast_111_dtype_0"), val = string("int32")];
+            int32 greater_equal_0_y_0 = const()[name = string("greater_equal_0_y_0"), val = int32(0)];
+            tensor<int16, [1]> cache_length_to_int16 = cast(dtype = cache_length_to_int16_dtype_0, x = cache_length)[name = string("cast_5")];
+            tensor<int32, [1]> cast_111 = cast(dtype = cast_111_dtype_0, x = cache_length_to_int16)[name = string("cast_4")];
+            tensor<bool, [1]> greater_equal_0 = greater_equal(x = cast_111, y = greater_equal_0_y_0)[name = string("greater_equal_0")];
+            int32 slice_by_index_0 = const()[name = string("slice_by_index_0"), val = int32(16)];
+            tensor<int32, [1]> add_0 = add(x = cast_111, y = slice_by_index_0)[name = string("add_0")];
+            tensor<int32, [1]> select_0 = select(a = cast_111, b = add_0, cond = greater_equal_0)[name = string("select_0")];
+            string select_0_to_int16_dtype_0 = const()[name = string("select_0_to_int16_dtype_0"), val = string("int16")];
+            string cast_0_dtype_0 = const()[name = string("cast_0_dtype_0"), val = string("int32")];
+            int32 greater_equal_0_y_0_1 = const()[name = string("greater_equal_0_y_0_1"), val = int32(0)];
+            tensor<int16, [1]> select_0_to_int16 = cast(dtype = select_0_to_int16_dtype_0, x = select_0)[name = string("cast_3")];
+            tensor<int32, [1]> cast_0 = cast(dtype = cast_0_dtype_0, x = select_0_to_int16)[name = string("cast_2")];
+            tensor<bool, [1]> greater_equal_0_1 = greater_equal(x = cast_0, y = greater_equal_0_y_0_1)[name = string("greater_equal_0_1")];
+            int32 slice_by_index_0_1 = const()[name = string("slice_by_index_0_1"), val = int32(16)];
+            tensor<int32, [1]> add_0_1 = add(x = cast_0, y = slice_by_index_0_1)[name = string("add_0_1")];
+            tensor<int32, [1]> select_0_1 = select(a = cast_0, b = add_0_1, cond = greater_equal_0_1)[name = string("select_0_1")];
+            int32 pos_cos_cast_fp16_cast_uint16_cast_uint16_axis_0 = const()[name = string("pos_cos_cast_fp16_cast_uint16_cast_uint16_axis_0"), val = int32(0)];
+            tensor<fp16, [1, 128]> pos_cos_cast_fp16_cast_uint16_cast_uint16 = gather(axis = pos_cos_cast_fp16_cast_uint16_cast_uint16_axis_0, batch_dims = pos_cos_batch_dims_0, indices = select_0_1, validate_indices = pos_cos_validate_indices_0, x = position_embeddings_cos_weight_to_fp16)[name = string("pos_cos_cast_fp16_cast_uint16_cast_uint16")];
+            tensor<int32, [1]> obj_7_axes_0 = const()[name = string("obj_7_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 128, 1]> obj_7_cast_fp16 = expand_dims(axes = obj_7_axes_0, x = pos_cos_cast_fp16_cast_uint16_cast_uint16)[name = string("obj_7_cast_fp16")];
+            int32 pos_sin_axis_0 = const()[name = string("pos_sin_axis_0"), val = int32(0)];
+            int32 pos_sin_batch_dims_0 = const()[name = string("pos_sin_batch_dims_0"), val = int32(0)];
+            bool pos_sin_validate_indices_0 = const()[name = string("pos_sin_validate_indices_0"), val = bool(false)];
+            tensor<fp16, [16, 128]> position_embeddings_sin_weight_to_fp16 = const()[name = string("position_embeddings_sin_weight_to_fp16"), val = tensor<fp16, [16, 128]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(4224)))];
+            string cache_length_to_uint16_dtype_0 = const()[name = string("cache_length_to_uint16_dtype_0"), val = string("uint16")];
+            tensor<uint16, [1]> cache_length_to_uint16 = cast(dtype = cache_length_to_uint16_dtype_0, x = cache_length)[name = string("cast_1")];
+            tensor<fp16, [1, 128]> pos_sin_cast_fp16_cast_uint16 = gather(axis = pos_sin_axis_0, batch_dims = pos_sin_batch_dims_0, indices = cache_length_to_uint16, validate_indices = pos_sin_validate_indices_0, x = position_embeddings_sin_weight_to_fp16)[name = string("pos_sin_cast_fp16_cast_uint16")];
+            tensor<int32, [1]> obj_9_axes_0 = const()[name = string("obj_9_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 128, 1]> obj_9_cast_fp16 = expand_dims(axes = obj_9_axes_0, x = pos_sin_cast_fp16_cast_uint16)[name = string("obj_9_cast_fp16")];
+            tensor<int32, [5]> tile_0 = const()[name = string("tile_0"), val = tensor<int32, [5]>([1024, 1024, 1024, 1024, 1024])];
+            int32 var_84_axis_0 = const()[name = string("op_84_axis_0"), val = int32(1)];
+            tensor<fp16, [1, 1024, 1, 16]> var_84_cast_fp16_0, tensor<fp16, [1, 1024, 1, 16]> var_84_cast_fp16_1, tensor<fp16, [1, 1024, 1, 16]> var_84_cast_fp16_2, tensor<fp16, [1, 1024, 1, 16]> var_84_cast_fp16_3, tensor<fp16, [1, 1024, 1, 16]> var_84_cast_fp16_4 = split(axis = var_84_axis_0, split_sizes = tile_0, x = key_cache)[name = string("op_84_cast_fp16")];
+            tensor<int32, [5]> tile_1 = const()[name = string("tile_1"), val = tensor<int32, [5]>([1024, 1024, 1024, 1024, 1024])];
+            int32 var_92_axis_0 = const()[name = string("op_92_axis_0"), val = int32(1)];
+            tensor<fp16, [1, 1024, 1, 16]> var_92_cast_fp16_0, tensor<fp16, [1, 1024, 1, 16]> var_92_cast_fp16_1, tensor<fp16, [1, 1024, 1, 16]> var_92_cast_fp16_2, tensor<fp16, [1, 1024, 1, 16]> var_92_cast_fp16_3, tensor<fp16, [1, 1024, 1, 16]> var_92_cast_fp16_4 = split(axis = var_92_axis_0, split_sizes = tile_1, x = value_cache)[name = string("op_92_cast_fp16")];
+            int32 var_99 = const()[name = string("op_99"), val = int32(3)];
+            int32 var_109 = const()[name = string("op_109"), val = int32(-2)];
+            int32 var_117 = const()[name = string("op_117"), val = int32(1)];
+            tensor<fp16, [1, 1024, 1, 1]> inputs_sq_1_cast_fp16 = mul(x = input_embeds, y = input_embeds)[name = string("inputs_sq_1_cast_fp16")];
+            tensor<int32, [1]> variance_1_axes_0 = const()[name = string("variance_1_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_1_keep_dims_0 = const()[name = string("variance_1_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 1, 1, 1]> variance_1_cast_fp16 = reduce_mean(axes = variance_1_axes_0, keep_dims = variance_1_keep_dims_0, x = inputs_sq_1_cast_fp16)[name = string("variance_1_cast_fp16")];
+            fp16 var_129_to_fp16 = const()[name = string("op_129_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 1, 1]> var_130_cast_fp16 = add(x = variance_1_cast_fp16, y = var_129_to_fp16)[name = string("op_130_cast_fp16")];
+            fp32 var_131_epsilon_0 = const()[name = string("op_131_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 1, 1, 1]> var_131_cast_fp16 = rsqrt(epsilon = var_131_epsilon_0, x = var_130_cast_fp16)[name = string("op_131_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> hidden_states_1_cast_fp16 = mul(x = input_embeds, y = var_131_cast_fp16)[name = string("hidden_states_1_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> w_1_to_fp16 = const()[name = string("w_1_to_fp16"), val = tensor<fp16, [1, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(8384)))];
+            tensor<fp16, [1, 1024, 1, 1]> obj_1_cast_fp16 = mul(x = w_1_to_fp16, y = hidden_states_1_cast_fp16)[name = string("obj_1_cast_fp16")];
+            string query_1_pad_type_0 = const()[name = string("query_1_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> query_1_strides_0 = const()[name = string("query_1_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> query_1_pad_0 = const()[name = string("query_1_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> query_1_dilations_0 = const()[name = string("query_1_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 query_1_groups_0 = const()[name = string("query_1_groups_0"), val = int32(1)];
+            tensor<fp16, [2048, 1024, 1, 1]> layers_0_self_attn_q_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [2048, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(10496))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(2107712))))[name = string("layers_0_self_attn_q_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [2048]> layers_0_self_attn_q_proj_bias_to_fp16 = const()[name = string("layers_0_self_attn_q_proj_bias_to_fp16"), val = tensor<fp16, [2048]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(2108288)))];
+            tensor<fp16, [1, 2048, 1, 1]> query_1_cast_fp16 = conv(bias = layers_0_self_attn_q_proj_bias_to_fp16, dilations = query_1_dilations_0, groups = query_1_groups_0, pad = query_1_pad_0, pad_type = query_1_pad_type_0, strides = query_1_strides_0, weight = layers_0_self_attn_q_proj_weight_to_fp16_palettized, x = obj_1_cast_fp16)[name = string("query_1_cast_fp16")];
+            string current_key_1_pad_type_0 = const()[name = string("current_key_1_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> current_key_1_strides_0 = const()[name = string("current_key_1_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> current_key_1_pad_0 = const()[name = string("current_key_1_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> current_key_1_dilations_0 = const()[name = string("current_key_1_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 current_key_1_groups_0 = const()[name = string("current_key_1_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_0_self_attn_k_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(2112448))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(3161088))))[name = string("layers_0_self_attn_k_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> current_key_1_cast_fp16 = conv(dilations = current_key_1_dilations_0, groups = current_key_1_groups_0, pad = current_key_1_pad_0, pad_type = current_key_1_pad_type_0, strides = current_key_1_strides_0, weight = layers_0_self_attn_k_proj_weight_to_fp16_palettized, x = obj_1_cast_fp16)[name = string("current_key_1_cast_fp16")];
+            string current_value_1_pad_type_0 = const()[name = string("current_value_1_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> current_value_1_strides_0 = const()[name = string("current_value_1_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> current_value_1_pad_0 = const()[name = string("current_value_1_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> current_value_1_dilations_0 = const()[name = string("current_value_1_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 current_value_1_groups_0 = const()[name = string("current_value_1_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_0_self_attn_v_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(3161664))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(4210304))))[name = string("layers_0_self_attn_v_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1024]> layers_0_self_attn_v_proj_bias_to_fp16 = const()[name = string("layers_0_self_attn_v_proj_bias_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(4210880)))];
+            tensor<fp16, [1, 1024, 1, 1]> current_value_1_cast_fp16 = conv(bias = layers_0_self_attn_v_proj_bias_to_fp16, dilations = current_value_1_dilations_0, groups = current_value_1_groups_0, pad = current_value_1_pad_0, pad_type = current_value_1_pad_type_0, strides = current_value_1_strides_0, weight = layers_0_self_attn_v_proj_weight_to_fp16_palettized, x = obj_1_cast_fp16)[name = string("current_value_1_cast_fp16")];
+            tensor<int32, [4]> var_168 = const()[name = string("op_168"), val = tensor<int32, [4]>([16, 128, 1, 1])];
+            tensor<fp16, [16, 128, 1, 1]> inputs_1_cast_fp16 = reshape(shape = var_168, x = query_1_cast_fp16)[name = string("inputs_1_cast_fp16")];
+            tensor<fp16, [16, 128, 1, 1]> inputs_sq_3_cast_fp16 = mul(x = inputs_1_cast_fp16, y = inputs_1_cast_fp16)[name = string("inputs_sq_3_cast_fp16")];
+            tensor<int32, [1]> variance_3_axes_0 = const()[name = string("variance_3_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_3_keep_dims_0 = const()[name = string("variance_3_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [16, 1, 1, 1]> variance_3_cast_fp16 = reduce_mean(axes = variance_3_axes_0, keep_dims = variance_3_keep_dims_0, x = inputs_sq_3_cast_fp16)[name = string("variance_3_cast_fp16")];
+            fp16 var_174_to_fp16 = const()[name = string("op_174_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [16, 1, 1, 1]> var_175_cast_fp16 = add(x = variance_3_cast_fp16, y = var_174_to_fp16)[name = string("op_175_cast_fp16")];
+            fp32 var_176_epsilon_0 = const()[name = string("op_176_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [16, 1, 1, 1]> var_176_cast_fp16 = rsqrt(epsilon = var_176_epsilon_0, x = var_175_cast_fp16)[name = string("op_176_cast_fp16")];
+            tensor<fp16, [16, 128, 1, 1]> hidden_states_3_cast_fp16 = mul(x = inputs_1_cast_fp16, y = var_176_cast_fp16)[name = string("hidden_states_3_cast_fp16")];
+            tensor<fp16, [1, 128, 1, 1]> w_3_to_fp16 = const()[name = string("w_3_to_fp16"), val = tensor<fp16, [1, 128, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(4212992)))];
+            tensor<fp16, [16, 128, 1, 1]> query_normed_1_cast_fp16 = mul(x = w_3_to_fp16, y = hidden_states_3_cast_fp16)[name = string("query_normed_1_cast_fp16")];
+            tensor<int32, [4]> var_184 = const()[name = string("op_184"), val = tensor<int32, [4]>([8, 128, 1, 1])];
+            tensor<fp16, [8, 128, 1, 1]> inputs_3_cast_fp16 = reshape(shape = var_184, x = current_key_1_cast_fp16)[name = string("inputs_3_cast_fp16")];
+            tensor<fp16, [8, 128, 1, 1]> inputs_sq_5_cast_fp16 = mul(x = inputs_3_cast_fp16, y = inputs_3_cast_fp16)[name = string("inputs_sq_5_cast_fp16")];
+            tensor<int32, [1]> variance_5_axes_0 = const()[name = string("variance_5_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_5_keep_dims_0 = const()[name = string("variance_5_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [8, 1, 1, 1]> variance_5_cast_fp16 = reduce_mean(axes = variance_5_axes_0, keep_dims = variance_5_keep_dims_0, x = inputs_sq_5_cast_fp16)[name = string("variance_5_cast_fp16")];
+            fp16 var_190_to_fp16 = const()[name = string("op_190_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [8, 1, 1, 1]> var_191_cast_fp16 = add(x = variance_5_cast_fp16, y = var_190_to_fp16)[name = string("op_191_cast_fp16")];
+            fp32 var_192_epsilon_0 = const()[name = string("op_192_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [8, 1, 1, 1]> var_192_cast_fp16 = rsqrt(epsilon = var_192_epsilon_0, x = var_191_cast_fp16)[name = string("op_192_cast_fp16")];
+            tensor<fp16, [8, 128, 1, 1]> hidden_states_5_cast_fp16 = mul(x = inputs_3_cast_fp16, y = var_192_cast_fp16)[name = string("hidden_states_5_cast_fp16")];
+            tensor<fp16, [1, 128, 1, 1]> w_5_to_fp16 = const()[name = string("w_5_to_fp16"), val = tensor<fp16, [1, 128, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(4213312)))];
+            tensor<fp16, [8, 128, 1, 1]> current_key_normed_1_cast_fp16 = mul(x = w_5_to_fp16, y = hidden_states_5_cast_fp16)[name = string("current_key_normed_1_cast_fp16")];
+            tensor<int32, [4]> var_210 = const()[name = string("op_210"), val = tensor<int32, [4]>([1, 16, 128, -1])];
+            tensor<fp16, [1, 16, 128, 1]> mh_q_1_cast_fp16 = reshape(shape = var_210, x = query_normed_1_cast_fp16)[name = string("mh_q_1_cast_fp16")];
+            tensor<int32, [4]> var_212 = const()[name = string("op_212"), val = tensor<int32, [4]>([1, 8, 128, -1])];
+            tensor<fp16, [1, 8, 128, 1]> mh_k_1_cast_fp16 = reshape(shape = var_212, x = current_key_normed_1_cast_fp16)[name = string("mh_k_1_cast_fp16")];
+            tensor<int32, [1]> cos_1_axes_0 = const()[name = string("cos_1_axes_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [1, 1, 128, 1]> cos_1_cast_fp16 = expand_dims(axes = cos_1_axes_0, x = obj_7_cast_fp16)[name = string("cos_1_cast_fp16")];
+            tensor<int32, [1]> sin_1_axes_0 = const()[name = string("sin_1_axes_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [1, 1, 128, 1]> sin_1_cast_fp16 = expand_dims(axes = sin_1_axes_0, x = obj_9_cast_fp16)[name = string("sin_1_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> var_216_cast_fp16 = mul(x = mh_q_1_cast_fp16, y = cos_1_cast_fp16)[name = string("op_216_cast_fp16")];
+            tensor<int32, [4]> var_221_begin_0 = const()[name = string("op_221_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_221_end_0 = const()[name = string("op_221_end_0"), val = tensor<int32, [4]>([1, 16, 64, 1])];
+            tensor<bool, [4]> var_221_end_mask_0 = const()[name = string("op_221_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 16, 64, 1]> var_221_cast_fp16 = slice_by_index(begin = var_221_begin_0, end = var_221_end_0, end_mask = var_221_end_mask_0, x = mh_q_1_cast_fp16)[name = string("op_221_cast_fp16")];
+            tensor<int32, [4]> var_227_begin_0 = const()[name = string("op_227_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_227_end_0 = const()[name = string("op_227_end_0"), val = tensor<int32, [4]>([1, 16, 128, 1])];
+            tensor<bool, [4]> var_227_end_mask_0 = const()[name = string("op_227_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 16, 64, 1]> var_227_cast_fp16 = slice_by_index(begin = var_227_begin_0, end = var_227_end_0, end_mask = var_227_end_mask_0, x = mh_q_1_cast_fp16)[name = string("op_227_cast_fp16")];
+            fp16 const_17_promoted_to_fp16 = const()[name = string("const_17_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 16, 64, 1]> var_229_cast_fp16 = mul(x = var_227_cast_fp16, y = const_17_promoted_to_fp16)[name = string("op_229_cast_fp16")];
+            bool var_231_interleave_0 = const()[name = string("op_231_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 1]> var_231_cast_fp16 = concat(axis = var_109, interleave = var_231_interleave_0, values = (var_229_cast_fp16, var_221_cast_fp16))[name = string("op_231_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> var_232_cast_fp16 = mul(x = var_231_cast_fp16, y = sin_1_cast_fp16)[name = string("op_232_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> mh_q_3_cast_fp16 = add(x = var_216_cast_fp16, y = var_232_cast_fp16)[name = string("mh_q_3_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> var_234_cast_fp16 = mul(x = mh_k_1_cast_fp16, y = cos_1_cast_fp16)[name = string("op_234_cast_fp16")];
+            tensor<int32, [4]> var_239_begin_0 = const()[name = string("op_239_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_239_end_0 = const()[name = string("op_239_end_0"), val = tensor<int32, [4]>([1, 8, 64, 1])];
+            tensor<bool, [4]> var_239_end_mask_0 = const()[name = string("op_239_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 8, 64, 1]> var_239_cast_fp16 = slice_by_index(begin = var_239_begin_0, end = var_239_end_0, end_mask = var_239_end_mask_0, x = mh_k_1_cast_fp16)[name = string("op_239_cast_fp16")];
+            tensor<int32, [4]> var_245_begin_0 = const()[name = string("op_245_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_245_end_0 = const()[name = string("op_245_end_0"), val = tensor<int32, [4]>([1, 8, 128, 1])];
+            tensor<bool, [4]> var_245_end_mask_0 = const()[name = string("op_245_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 8, 64, 1]> var_245_cast_fp16 = slice_by_index(begin = var_245_begin_0, end = var_245_end_0, end_mask = var_245_end_mask_0, x = mh_k_1_cast_fp16)[name = string("op_245_cast_fp16")];
+            fp16 const_20_promoted_to_fp16 = const()[name = string("const_20_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 64, 1]> var_247_cast_fp16 = mul(x = var_245_cast_fp16, y = const_20_promoted_to_fp16)[name = string("op_247_cast_fp16")];
+            bool var_249_interleave_0 = const()[name = string("op_249_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 128, 1]> var_249_cast_fp16 = concat(axis = var_109, interleave = var_249_interleave_0, values = (var_247_cast_fp16, var_239_cast_fp16))[name = string("op_249_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> var_250_cast_fp16 = mul(x = var_249_cast_fp16, y = sin_1_cast_fp16)[name = string("op_250_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> mh_k_3_cast_fp16 = add(x = var_234_cast_fp16, y = var_250_cast_fp16)[name = string("mh_k_3_cast_fp16")];
+            tensor<int32, [4]> var_254 = const()[name = string("op_254"), val = tensor<int32, [4]>([1, 1024, 1, 1])];
+            tensor<fp16, [1, 1024, 1, 1]> current_key_3_cast_fp16 = reshape(shape = var_254, x = mh_k_3_cast_fp16)[name = string("current_key_3_cast_fp16")];
+            tensor<int32, [1]> var_257_axes_0 = const()[name = string("op_257_axes_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [1, 1, 16]> var_257_cast_fp16 = expand_dims(axes = var_257_axes_0, x = kv_cache_update_mask)[name = string("op_257_cast_fp16")];
+            tensor<int32, [1]> var_258_axes_0 = const()[name = string("op_258_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 1, 1, 16]> var_258_cast_fp16 = expand_dims(axes = var_258_axes_0, x = var_257_cast_fp16)[name = string("op_258_cast_fp16")];
+            fp16 var_110_to_fp16 = const()[name = string("op_110_to_fp16"), val = fp16(0x1p+0)];
+            tensor<fp16, [1, 1, 1, 16]> var_260_cast_fp16 = sub(x = var_110_to_fp16, y = var_258_cast_fp16)[name = string("op_260_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 16]> var_261_cast_fp16 = mul(x = var_84_cast_fp16_0, y = var_260_cast_fp16)[name = string("op_261_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 16]> var_262_cast_fp16 = mul(x = current_key_3_cast_fp16, y = var_258_cast_fp16)[name = string("op_262_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 16]> key_3_cast_fp16 = add(x = var_261_cast_fp16, y = var_262_cast_fp16)[name = string("key_3_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 16]> var_265_cast_fp16 = mul(x = var_92_cast_fp16_0, y = var_260_cast_fp16)[name = string("op_265_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 16]> var_266_cast_fp16 = mul(x = current_value_1_cast_fp16, y = var_258_cast_fp16)[name = string("op_266_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 16]> value_1_cast_fp16 = add(x = var_265_cast_fp16, y = var_266_cast_fp16)[name = string("value_1_cast_fp16")];
+            tensor<int32, [4]> var_270 = const()[name = string("op_270"), val = tensor<int32, [4]>([1, 8, 128, 16])];
+            tensor<fp16, [1, 8, 128, 16]> key_heads_1_cast_fp16 = reshape(shape = var_270, x = key_3_cast_fp16)[name = string("key_heads_1_cast_fp16")];
+            tensor<int32, [4]> var_272 = const()[name = string("op_272"), val = tensor<int32, [4]>([1, 8, 128, 16])];
+            tensor<fp16, [1, 8, 128, 16]> value_heads_1_cast_fp16 = reshape(shape = var_272, x = value_1_cast_fp16)[name = string("value_heads_1_cast_fp16")];
+            tensor<int32, [4]> var_275_begin_0 = const()[name = string("op_275_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_275_end_0 = const()[name = string("op_275_end_0"), val = tensor<int32, [4]>([1, 1, 128, 16])];
+            tensor<bool, [4]> var_275_end_mask_0 = const()[name = string("op_275_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_275_cast_fp16 = slice_by_index(begin = var_275_begin_0, end = var_275_end_0, end_mask = var_275_end_mask_0, x = key_heads_1_cast_fp16)[name = string("op_275_cast_fp16")];
+            tensor<int32, [4]> var_279_begin_0 = const()[name = string("op_279_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_279_end_0 = const()[name = string("op_279_end_0"), val = tensor<int32, [4]>([1, 1, 128, 16])];
+            tensor<bool, [4]> var_279_end_mask_0 = const()[name = string("op_279_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_279_cast_fp16 = slice_by_index(begin = var_279_begin_0, end = var_279_end_0, end_mask = var_279_end_mask_0, x = value_heads_1_cast_fp16)[name = string("op_279_cast_fp16")];
+            tensor<int32, [4]> var_291_begin_0 = const()[name = string("op_291_begin_0"), val = tensor<int32, [4]>([0, 1, 0, 0])];
+            tensor<int32, [4]> var_291_end_0 = const()[name = string("op_291_end_0"), val = tensor<int32, [4]>([1, 2, 128, 16])];
+            tensor<bool, [4]> var_291_end_mask_0 = const()[name = string("op_291_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_291_cast_fp16 = slice_by_index(begin = var_291_begin_0, end = var_291_end_0, end_mask = var_291_end_mask_0, x = key_heads_1_cast_fp16)[name = string("op_291_cast_fp16")];
+            tensor<int32, [4]> var_295_begin_0 = const()[name = string("op_295_begin_0"), val = tensor<int32, [4]>([0, 1, 0, 0])];
+            tensor<int32, [4]> var_295_end_0 = const()[name = string("op_295_end_0"), val = tensor<int32, [4]>([1, 2, 128, 16])];
+            tensor<bool, [4]> var_295_end_mask_0 = const()[name = string("op_295_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_295_cast_fp16 = slice_by_index(begin = var_295_begin_0, end = var_295_end_0, end_mask = var_295_end_mask_0, x = value_heads_1_cast_fp16)[name = string("op_295_cast_fp16")];
+            tensor<int32, [4]> var_307_begin_0 = const()[name = string("op_307_begin_0"), val = tensor<int32, [4]>([0, 2, 0, 0])];
+            tensor<int32, [4]> var_307_end_0 = const()[name = string("op_307_end_0"), val = tensor<int32, [4]>([1, 3, 128, 16])];
+            tensor<bool, [4]> var_307_end_mask_0 = const()[name = string("op_307_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_307_cast_fp16 = slice_by_index(begin = var_307_begin_0, end = var_307_end_0, end_mask = var_307_end_mask_0, x = key_heads_1_cast_fp16)[name = string("op_307_cast_fp16")];
+            tensor<int32, [4]> var_311_begin_0 = const()[name = string("op_311_begin_0"), val = tensor<int32, [4]>([0, 2, 0, 0])];
+            tensor<int32, [4]> var_311_end_0 = const()[name = string("op_311_end_0"), val = tensor<int32, [4]>([1, 3, 128, 16])];
+            tensor<bool, [4]> var_311_end_mask_0 = const()[name = string("op_311_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_311_cast_fp16 = slice_by_index(begin = var_311_begin_0, end = var_311_end_0, end_mask = var_311_end_mask_0, x = value_heads_1_cast_fp16)[name = string("op_311_cast_fp16")];
+            tensor<int32, [4]> var_323_begin_0 = const()[name = string("op_323_begin_0"), val = tensor<int32, [4]>([0, 3, 0, 0])];
+            tensor<int32, [4]> var_323_end_0 = const()[name = string("op_323_end_0"), val = tensor<int32, [4]>([1, 4, 128, 16])];
+            tensor<bool, [4]> var_323_end_mask_0 = const()[name = string("op_323_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_323_cast_fp16 = slice_by_index(begin = var_323_begin_0, end = var_323_end_0, end_mask = var_323_end_mask_0, x = key_heads_1_cast_fp16)[name = string("op_323_cast_fp16")];
+            tensor<int32, [4]> var_327_begin_0 = const()[name = string("op_327_begin_0"), val = tensor<int32, [4]>([0, 3, 0, 0])];
+            tensor<int32, [4]> var_327_end_0 = const()[name = string("op_327_end_0"), val = tensor<int32, [4]>([1, 4, 128, 16])];
+            tensor<bool, [4]> var_327_end_mask_0 = const()[name = string("op_327_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_327_cast_fp16 = slice_by_index(begin = var_327_begin_0, end = var_327_end_0, end_mask = var_327_end_mask_0, x = value_heads_1_cast_fp16)[name = string("op_327_cast_fp16")];
+            tensor<int32, [4]> var_339_begin_0 = const()[name = string("op_339_begin_0"), val = tensor<int32, [4]>([0, 4, 0, 0])];
+            tensor<int32, [4]> var_339_end_0 = const()[name = string("op_339_end_0"), val = tensor<int32, [4]>([1, 5, 128, 16])];
+            tensor<bool, [4]> var_339_end_mask_0 = const()[name = string("op_339_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_339_cast_fp16 = slice_by_index(begin = var_339_begin_0, end = var_339_end_0, end_mask = var_339_end_mask_0, x = key_heads_1_cast_fp16)[name = string("op_339_cast_fp16")];
+            tensor<int32, [4]> var_343_begin_0 = const()[name = string("op_343_begin_0"), val = tensor<int32, [4]>([0, 4, 0, 0])];
+            tensor<int32, [4]> var_343_end_0 = const()[name = string("op_343_end_0"), val = tensor<int32, [4]>([1, 5, 128, 16])];
+            tensor<bool, [4]> var_343_end_mask_0 = const()[name = string("op_343_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_343_cast_fp16 = slice_by_index(begin = var_343_begin_0, end = var_343_end_0, end_mask = var_343_end_mask_0, x = value_heads_1_cast_fp16)[name = string("op_343_cast_fp16")];
+            tensor<int32, [4]> var_355_begin_0 = const()[name = string("op_355_begin_0"), val = tensor<int32, [4]>([0, 5, 0, 0])];
+            tensor<int32, [4]> var_355_end_0 = const()[name = string("op_355_end_0"), val = tensor<int32, [4]>([1, 6, 128, 16])];
+            tensor<bool, [4]> var_355_end_mask_0 = const()[name = string("op_355_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_355_cast_fp16 = slice_by_index(begin = var_355_begin_0, end = var_355_end_0, end_mask = var_355_end_mask_0, x = key_heads_1_cast_fp16)[name = string("op_355_cast_fp16")];
+            tensor<int32, [4]> var_359_begin_0 = const()[name = string("op_359_begin_0"), val = tensor<int32, [4]>([0, 5, 0, 0])];
+            tensor<int32, [4]> var_359_end_0 = const()[name = string("op_359_end_0"), val = tensor<int32, [4]>([1, 6, 128, 16])];
+            tensor<bool, [4]> var_359_end_mask_0 = const()[name = string("op_359_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_359_cast_fp16 = slice_by_index(begin = var_359_begin_0, end = var_359_end_0, end_mask = var_359_end_mask_0, x = value_heads_1_cast_fp16)[name = string("op_359_cast_fp16")];
+            tensor<int32, [4]> var_371_begin_0 = const()[name = string("op_371_begin_0"), val = tensor<int32, [4]>([0, 6, 0, 0])];
+            tensor<int32, [4]> var_371_end_0 = const()[name = string("op_371_end_0"), val = tensor<int32, [4]>([1, 7, 128, 16])];
+            tensor<bool, [4]> var_371_end_mask_0 = const()[name = string("op_371_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_371_cast_fp16 = slice_by_index(begin = var_371_begin_0, end = var_371_end_0, end_mask = var_371_end_mask_0, x = key_heads_1_cast_fp16)[name = string("op_371_cast_fp16")];
+            tensor<int32, [4]> var_375_begin_0 = const()[name = string("op_375_begin_0"), val = tensor<int32, [4]>([0, 6, 0, 0])];
+            tensor<int32, [4]> var_375_end_0 = const()[name = string("op_375_end_0"), val = tensor<int32, [4]>([1, 7, 128, 16])];
+            tensor<bool, [4]> var_375_end_mask_0 = const()[name = string("op_375_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_375_cast_fp16 = slice_by_index(begin = var_375_begin_0, end = var_375_end_0, end_mask = var_375_end_mask_0, x = value_heads_1_cast_fp16)[name = string("op_375_cast_fp16")];
+            tensor<int32, [4]> var_387_begin_0 = const()[name = string("op_387_begin_0"), val = tensor<int32, [4]>([0, 7, 0, 0])];
+            tensor<int32, [4]> var_387_end_0 = const()[name = string("op_387_end_0"), val = tensor<int32, [4]>([1, 1, 128, 16])];
+            tensor<bool, [4]> var_387_end_mask_0 = const()[name = string("op_387_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_387_cast_fp16 = slice_by_index(begin = var_387_begin_0, end = var_387_end_0, end_mask = var_387_end_mask_0, x = key_heads_1_cast_fp16)[name = string("op_387_cast_fp16")];
+            tensor<int32, [4]> var_391_begin_0 = const()[name = string("op_391_begin_0"), val = tensor<int32, [4]>([0, 7, 0, 0])];
+            tensor<int32, [4]> var_391_end_0 = const()[name = string("op_391_end_0"), val = tensor<int32, [4]>([1, 1, 128, 16])];
+            tensor<bool, [4]> var_391_end_mask_0 = const()[name = string("op_391_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_391_cast_fp16 = slice_by_index(begin = var_391_begin_0, end = var_391_end_0, end_mask = var_391_end_mask_0, x = value_heads_1_cast_fp16)[name = string("op_391_cast_fp16")];
+            bool key_heads_3_interleave_0 = const()[name = string("key_heads_3_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 16]> key_heads_3_cast_fp16 = concat(axis = var_117, interleave = key_heads_3_interleave_0, values = (var_275_cast_fp16, var_275_cast_fp16, var_291_cast_fp16, var_291_cast_fp16, var_307_cast_fp16, var_307_cast_fp16, var_323_cast_fp16, var_323_cast_fp16, var_339_cast_fp16, var_339_cast_fp16, var_355_cast_fp16, var_355_cast_fp16, var_371_cast_fp16, var_371_cast_fp16, var_387_cast_fp16, var_387_cast_fp16))[name = string("key_heads_3_cast_fp16")];
+            bool value_heads_3_interleave_0 = const()[name = string("value_heads_3_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 16]> value_heads_3_cast_fp16 = concat(axis = var_117, interleave = value_heads_3_interleave_0, values = (var_279_cast_fp16, var_279_cast_fp16, var_295_cast_fp16, var_295_cast_fp16, var_311_cast_fp16, var_311_cast_fp16, var_327_cast_fp16, var_327_cast_fp16, var_343_cast_fp16, var_343_cast_fp16, var_359_cast_fp16, var_359_cast_fp16, var_375_cast_fp16, var_375_cast_fp16, var_391_cast_fp16, var_391_cast_fp16))[name = string("value_heads_3_cast_fp16")];
+            fp16 var_414_to_fp16 = const()[name = string("op_414_to_fp16"), val = fp16(0x1.6ap-4)];
+            tensor<fp16, [1, 16, 128, 1]> var_415_cast_fp16 = mul(x = mh_q_3_cast_fp16, y = var_414_to_fp16)[name = string("op_415_cast_fp16")];
+            bool mh_w_1_transpose_x_0 = const()[name = string("mh_w_1_transpose_x_0"), val = bool(true)];
+            bool mh_w_1_transpose_y_0 = const()[name = string("mh_w_1_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 1, 16]> mh_w_1_cast_fp16 = matmul(transpose_x = mh_w_1_transpose_x_0, transpose_y = mh_w_1_transpose_y_0, x = var_415_cast_fp16, y = key_heads_3_cast_fp16)[name = string("mh_w_1_cast_fp16")];
+            tensor<int32, [1]> var_423_axes_0 = const()[name = string("op_423_axes_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [1, 1, 16]> var_423_cast_fp16 = expand_dims(axes = var_423_axes_0, x = key_padding_mask)[name = string("op_423_cast_fp16")];
+            tensor<int32, [1]> var_424_axes_0 = const()[name = string("op_424_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 1, 1, 16]> var_424_cast_fp16 = expand_dims(axes = var_424_axes_0, x = var_423_cast_fp16)[name = string("op_424_cast_fp16")];
+            tensor<fp16, [1, 16, 1, 16]> mh_w_3_cast_fp16 = add(x = mh_w_1_cast_fp16, y = var_424_cast_fp16)[name = string("mh_w_3_cast_fp16")];
+            tensor<fp16, [1, 16, 1, 16]> var_427_cast_fp16 = softmax(axis = var_99, x = mh_w_3_cast_fp16)[name = string("op_427_cast_fp16")];
+            bool attn_1_transpose_x_0 = const()[name = string("attn_1_transpose_x_0"), val = bool(false)];
+            bool attn_1_transpose_y_0 = const()[name = string("attn_1_transpose_y_0"), val = bool(true)];
+            tensor<fp16, [1, 16, 128, 1]> attn_1_cast_fp16 = matmul(transpose_x = attn_1_transpose_x_0, transpose_y = attn_1_transpose_y_0, x = value_heads_3_cast_fp16, y = var_427_cast_fp16)[name = string("attn_1_cast_fp16")];
+            tensor<int32, [4]> var_432 = const()[name = string("op_432"), val = tensor<int32, [4]>([1, -1, 1, 1])];
+            tensor<fp16, [1, 2048, 1, 1]> input_1_cast_fp16 = reshape(shape = var_432, x = attn_1_cast_fp16)[name = string("input_1_cast_fp16")];
+            string obj_11_pad_type_0 = const()[name = string("obj_11_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> obj_11_strides_0 = const()[name = string("obj_11_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> obj_11_pad_0 = const()[name = string("obj_11_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> obj_11_dilations_0 = const()[name = string("obj_11_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 obj_11_groups_0 = const()[name = string("obj_11_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 2048, 1, 1]> layers_0_self_attn_o_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(4213632))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(6310848))))[name = string("layers_0_self_attn_o_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> obj_11_cast_fp16 = conv(bias = layers_0_self_attn_v_proj_bias_to_fp16, dilations = obj_11_dilations_0, groups = obj_11_groups_0, pad = obj_11_pad_0, pad_type = obj_11_pad_type_0, strides = obj_11_strides_0, weight = layers_0_self_attn_o_proj_weight_to_fp16_palettized, x = input_1_cast_fp16)[name = string("obj_11_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> inputs_5_cast_fp16 = add(x = input_embeds, y = obj_11_cast_fp16)[name = string("inputs_5_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> inputs_sq_7_cast_fp16 = mul(x = inputs_5_cast_fp16, y = inputs_5_cast_fp16)[name = string("inputs_sq_7_cast_fp16")];
+            tensor<int32, [1]> variance_7_axes_0 = const()[name = string("variance_7_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_7_keep_dims_0 = const()[name = string("variance_7_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 1, 1, 1]> variance_7_cast_fp16 = reduce_mean(axes = variance_7_axes_0, keep_dims = variance_7_keep_dims_0, x = inputs_sq_7_cast_fp16)[name = string("variance_7_cast_fp16")];
+            fp16 var_450_to_fp16 = const()[name = string("op_450_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 1, 1]> var_451_cast_fp16 = add(x = variance_7_cast_fp16, y = var_450_to_fp16)[name = string("op_451_cast_fp16")];
+            fp32 var_452_epsilon_0 = const()[name = string("op_452_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 1, 1, 1]> var_452_cast_fp16 = rsqrt(epsilon = var_452_epsilon_0, x = var_451_cast_fp16)[name = string("op_452_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> hidden_states_7_cast_fp16 = mul(x = inputs_5_cast_fp16, y = var_452_cast_fp16)[name = string("hidden_states_7_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> w_7_to_fp16 = const()[name = string("w_7_to_fp16"), val = tensor<fp16, [1, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(6311424)))];
+            tensor<fp16, [1, 1024, 1, 1]> input_3_cast_fp16 = mul(x = w_7_to_fp16, y = hidden_states_7_cast_fp16)[name = string("input_3_cast_fp16")];
+            string input_5_pad_type_0 = const()[name = string("input_5_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> input_5_strides_0 = const()[name = string("input_5_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> input_5_pad_0 = const()[name = string("input_5_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> input_5_dilations_0 = const()[name = string("input_5_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 input_5_groups_0 = const()[name = string("input_5_groups_0"), val = int32(1)];
+            tensor<fp16, [3072, 1024, 1, 1]> layers_0_mlp_gate_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [3072, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(6313536))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(9459328))))[name = string("layers_0_mlp_gate_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 3072, 1, 1]> input_5_cast_fp16 = conv(dilations = input_5_dilations_0, groups = input_5_groups_0, pad = input_5_pad_0, pad_type = input_5_pad_type_0, strides = input_5_strides_0, weight = layers_0_mlp_gate_proj_weight_to_fp16_palettized, x = input_3_cast_fp16)[name = string("input_5_cast_fp16")];
+            tensor<fp16, [1, 3072, 1, 1]> var_466_cast_fp16 = silu(x = input_5_cast_fp16)[name = string("op_466_cast_fp16")];
+            string var_472_pad_type_0 = const()[name = string("op_472_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_472_strides_0 = const()[name = string("op_472_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_472_pad_0 = const()[name = string("op_472_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_472_dilations_0 = const()[name = string("op_472_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_472_groups_0 = const()[name = string("op_472_groups_0"), val = int32(1)];
+            tensor<fp16, [3072, 1024, 1, 1]> layers_0_mlp_up_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [3072, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(9459904))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(12605696))))[name = string("layers_0_mlp_up_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 3072, 1, 1]> var_472_cast_fp16 = conv(dilations = var_472_dilations_0, groups = var_472_groups_0, pad = var_472_pad_0, pad_type = var_472_pad_type_0, strides = var_472_strides_0, weight = layers_0_mlp_up_proj_weight_to_fp16_palettized, x = input_3_cast_fp16)[name = string("op_472_cast_fp16")];
+            tensor<fp16, [1, 3072, 1, 1]> input_7_cast_fp16 = mul(x = var_466_cast_fp16, y = var_472_cast_fp16)[name = string("input_7_cast_fp16")];
+            string hidden_states_9_pad_type_0 = const()[name = string("hidden_states_9_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> hidden_states_9_strides_0 = const()[name = string("hidden_states_9_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> hidden_states_9_pad_0 = const()[name = string("hidden_states_9_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> hidden_states_9_dilations_0 = const()[name = string("hidden_states_9_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 hidden_states_9_groups_0 = const()[name = string("hidden_states_9_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 3072, 1, 1]> layers_0_mlp_down_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 3072, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(12606272))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(15752064))))[name = string("layers_0_mlp_down_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> hidden_states_9_cast_fp16 = conv(dilations = hidden_states_9_dilations_0, groups = hidden_states_9_groups_0, pad = hidden_states_9_pad_0, pad_type = hidden_states_9_pad_type_0, strides = hidden_states_9_strides_0, weight = layers_0_mlp_down_proj_weight_to_fp16_palettized, x = input_7_cast_fp16)[name = string("hidden_states_9_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> inputs_7_cast_fp16 = add(x = inputs_5_cast_fp16, y = hidden_states_9_cast_fp16)[name = string("inputs_7_cast_fp16")];
+            int32 var_486 = const()[name = string("op_486"), val = int32(3)];
+            int32 var_496 = const()[name = string("op_496"), val = int32(-2)];
+            int32 var_504 = const()[name = string("op_504"), val = int32(1)];
+            tensor<fp16, [1, 1024, 1, 1]> inputs_sq_9_cast_fp16 = mul(x = inputs_7_cast_fp16, y = inputs_7_cast_fp16)[name = string("inputs_sq_9_cast_fp16")];
+            tensor<int32, [1]> variance_9_axes_0 = const()[name = string("variance_9_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_9_keep_dims_0 = const()[name = string("variance_9_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 1, 1, 1]> variance_9_cast_fp16 = reduce_mean(axes = variance_9_axes_0, keep_dims = variance_9_keep_dims_0, x = inputs_sq_9_cast_fp16)[name = string("variance_9_cast_fp16")];
+            fp16 var_516_to_fp16 = const()[name = string("op_516_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 1, 1]> var_517_cast_fp16 = add(x = variance_9_cast_fp16, y = var_516_to_fp16)[name = string("op_517_cast_fp16")];
+            fp32 var_518_epsilon_0 = const()[name = string("op_518_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 1, 1, 1]> var_518_cast_fp16 = rsqrt(epsilon = var_518_epsilon_0, x = var_517_cast_fp16)[name = string("op_518_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> hidden_states_11_cast_fp16 = mul(x = inputs_7_cast_fp16, y = var_518_cast_fp16)[name = string("hidden_states_11_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> w_9_to_fp16 = const()[name = string("w_9_to_fp16"), val = tensor<fp16, [1, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(15752640)))];
+            tensor<fp16, [1, 1024, 1, 1]> obj_13_cast_fp16 = mul(x = w_9_to_fp16, y = hidden_states_11_cast_fp16)[name = string("obj_13_cast_fp16")];
+            string query_7_pad_type_0 = const()[name = string("query_7_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> query_7_strides_0 = const()[name = string("query_7_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> query_7_pad_0 = const()[name = string("query_7_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> query_7_dilations_0 = const()[name = string("query_7_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 query_7_groups_0 = const()[name = string("query_7_groups_0"), val = int32(1)];
+            tensor<fp16, [2048, 1024, 1, 1]> layers_1_self_attn_q_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [2048, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(15754752))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(17851968))))[name = string("layers_1_self_attn_q_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1, 1]> query_7_cast_fp16 = conv(bias = layers_0_self_attn_q_proj_bias_to_fp16, dilations = query_7_dilations_0, groups = query_7_groups_0, pad = query_7_pad_0, pad_type = query_7_pad_type_0, strides = query_7_strides_0, weight = layers_1_self_attn_q_proj_weight_to_fp16_palettized, x = obj_13_cast_fp16)[name = string("query_7_cast_fp16")];
+            string current_key_5_pad_type_0 = const()[name = string("current_key_5_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> current_key_5_strides_0 = const()[name = string("current_key_5_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> current_key_5_pad_0 = const()[name = string("current_key_5_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> current_key_5_dilations_0 = const()[name = string("current_key_5_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 current_key_5_groups_0 = const()[name = string("current_key_5_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_1_self_attn_k_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(17852544))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(18901184))))[name = string("layers_1_self_attn_k_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> current_key_5_cast_fp16 = conv(dilations = current_key_5_dilations_0, groups = current_key_5_groups_0, pad = current_key_5_pad_0, pad_type = current_key_5_pad_type_0, strides = current_key_5_strides_0, weight = layers_1_self_attn_k_proj_weight_to_fp16_palettized, x = obj_13_cast_fp16)[name = string("current_key_5_cast_fp16")];
+            string current_value_3_pad_type_0 = const()[name = string("current_value_3_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> current_value_3_strides_0 = const()[name = string("current_value_3_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> current_value_3_pad_0 = const()[name = string("current_value_3_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> current_value_3_dilations_0 = const()[name = string("current_value_3_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 current_value_3_groups_0 = const()[name = string("current_value_3_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_1_self_attn_v_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(18901760))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(19950400))))[name = string("layers_1_self_attn_v_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> current_value_3_cast_fp16 = conv(bias = layers_0_self_attn_v_proj_bias_to_fp16, dilations = current_value_3_dilations_0, groups = current_value_3_groups_0, pad = current_value_3_pad_0, pad_type = current_value_3_pad_type_0, strides = current_value_3_strides_0, weight = layers_1_self_attn_v_proj_weight_to_fp16_palettized, x = obj_13_cast_fp16)[name = string("current_value_3_cast_fp16")];
+            tensor<int32, [4]> var_555 = const()[name = string("op_555"), val = tensor<int32, [4]>([16, 128, 1, 1])];
+            tensor<fp16, [16, 128, 1, 1]> inputs_9_cast_fp16 = reshape(shape = var_555, x = query_7_cast_fp16)[name = string("inputs_9_cast_fp16")];
+            tensor<fp16, [16, 128, 1, 1]> inputs_sq_11_cast_fp16 = mul(x = inputs_9_cast_fp16, y = inputs_9_cast_fp16)[name = string("inputs_sq_11_cast_fp16")];
+            tensor<int32, [1]> variance_11_axes_0 = const()[name = string("variance_11_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_11_keep_dims_0 = const()[name = string("variance_11_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [16, 1, 1, 1]> variance_11_cast_fp16 = reduce_mean(axes = variance_11_axes_0, keep_dims = variance_11_keep_dims_0, x = inputs_sq_11_cast_fp16)[name = string("variance_11_cast_fp16")];
+            fp16 var_561_to_fp16 = const()[name = string("op_561_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [16, 1, 1, 1]> var_562_cast_fp16 = add(x = variance_11_cast_fp16, y = var_561_to_fp16)[name = string("op_562_cast_fp16")];
+            fp32 var_563_epsilon_0 = const()[name = string("op_563_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [16, 1, 1, 1]> var_563_cast_fp16 = rsqrt(epsilon = var_563_epsilon_0, x = var_562_cast_fp16)[name = string("op_563_cast_fp16")];
+            tensor<fp16, [16, 128, 1, 1]> hidden_states_13_cast_fp16 = mul(x = inputs_9_cast_fp16, y = var_563_cast_fp16)[name = string("hidden_states_13_cast_fp16")];
+            tensor<fp16, [1, 128, 1, 1]> w_11_to_fp16 = const()[name = string("w_11_to_fp16"), val = tensor<fp16, [1, 128, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(19950976)))];
+            tensor<fp16, [16, 128, 1, 1]> query_normed_3_cast_fp16 = mul(x = w_11_to_fp16, y = hidden_states_13_cast_fp16)[name = string("query_normed_3_cast_fp16")];
+            tensor<int32, [4]> var_571 = const()[name = string("op_571"), val = tensor<int32, [4]>([8, 128, 1, 1])];
+            tensor<fp16, [8, 128, 1, 1]> inputs_11_cast_fp16 = reshape(shape = var_571, x = current_key_5_cast_fp16)[name = string("inputs_11_cast_fp16")];
+            tensor<fp16, [8, 128, 1, 1]> inputs_sq_13_cast_fp16 = mul(x = inputs_11_cast_fp16, y = inputs_11_cast_fp16)[name = string("inputs_sq_13_cast_fp16")];
+            tensor<int32, [1]> variance_13_axes_0 = const()[name = string("variance_13_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_13_keep_dims_0 = const()[name = string("variance_13_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [8, 1, 1, 1]> variance_13_cast_fp16 = reduce_mean(axes = variance_13_axes_0, keep_dims = variance_13_keep_dims_0, x = inputs_sq_13_cast_fp16)[name = string("variance_13_cast_fp16")];
+            fp16 var_577_to_fp16 = const()[name = string("op_577_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [8, 1, 1, 1]> var_578_cast_fp16 = add(x = variance_13_cast_fp16, y = var_577_to_fp16)[name = string("op_578_cast_fp16")];
+            fp32 var_579_epsilon_0 = const()[name = string("op_579_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [8, 1, 1, 1]> var_579_cast_fp16 = rsqrt(epsilon = var_579_epsilon_0, x = var_578_cast_fp16)[name = string("op_579_cast_fp16")];
+            tensor<fp16, [8, 128, 1, 1]> hidden_states_15_cast_fp16 = mul(x = inputs_11_cast_fp16, y = var_579_cast_fp16)[name = string("hidden_states_15_cast_fp16")];
+            tensor<fp16, [1, 128, 1, 1]> w_13_to_fp16 = const()[name = string("w_13_to_fp16"), val = tensor<fp16, [1, 128, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(19951296)))];
+            tensor<fp16, [8, 128, 1, 1]> current_key_normed_3_cast_fp16 = mul(x = w_13_to_fp16, y = hidden_states_15_cast_fp16)[name = string("current_key_normed_3_cast_fp16")];
+            tensor<int32, [4]> var_597 = const()[name = string("op_597"), val = tensor<int32, [4]>([1, 16, 128, -1])];
+            tensor<fp16, [1, 16, 128, 1]> mh_q_7_cast_fp16 = reshape(shape = var_597, x = query_normed_3_cast_fp16)[name = string("mh_q_7_cast_fp16")];
+            tensor<int32, [4]> var_599 = const()[name = string("op_599"), val = tensor<int32, [4]>([1, 8, 128, -1])];
+            tensor<fp16, [1, 8, 128, 1]> mh_k_5_cast_fp16 = reshape(shape = var_599, x = current_key_normed_3_cast_fp16)[name = string("mh_k_5_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> var_603_cast_fp16 = mul(x = mh_q_7_cast_fp16, y = cos_1_cast_fp16)[name = string("op_603_cast_fp16")];
+            tensor<int32, [4]> var_608_begin_0 = const()[name = string("op_608_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_608_end_0 = const()[name = string("op_608_end_0"), val = tensor<int32, [4]>([1, 16, 64, 1])];
+            tensor<bool, [4]> var_608_end_mask_0 = const()[name = string("op_608_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 16, 64, 1]> var_608_cast_fp16 = slice_by_index(begin = var_608_begin_0, end = var_608_end_0, end_mask = var_608_end_mask_0, x = mh_q_7_cast_fp16)[name = string("op_608_cast_fp16")];
+            tensor<int32, [4]> var_614_begin_0 = const()[name = string("op_614_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_614_end_0 = const()[name = string("op_614_end_0"), val = tensor<int32, [4]>([1, 16, 128, 1])];
+            tensor<bool, [4]> var_614_end_mask_0 = const()[name = string("op_614_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 16, 64, 1]> var_614_cast_fp16 = slice_by_index(begin = var_614_begin_0, end = var_614_end_0, end_mask = var_614_end_mask_0, x = mh_q_7_cast_fp16)[name = string("op_614_cast_fp16")];
+            fp16 const_40_promoted_to_fp16 = const()[name = string("const_40_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 16, 64, 1]> var_616_cast_fp16 = mul(x = var_614_cast_fp16, y = const_40_promoted_to_fp16)[name = string("op_616_cast_fp16")];
+            bool var_618_interleave_0 = const()[name = string("op_618_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 1]> var_618_cast_fp16 = concat(axis = var_496, interleave = var_618_interleave_0, values = (var_616_cast_fp16, var_608_cast_fp16))[name = string("op_618_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> var_619_cast_fp16 = mul(x = var_618_cast_fp16, y = sin_1_cast_fp16)[name = string("op_619_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> mh_q_9_cast_fp16 = add(x = var_603_cast_fp16, y = var_619_cast_fp16)[name = string("mh_q_9_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> var_621_cast_fp16 = mul(x = mh_k_5_cast_fp16, y = cos_1_cast_fp16)[name = string("op_621_cast_fp16")];
+            tensor<int32, [4]> var_626_begin_0 = const()[name = string("op_626_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_626_end_0 = const()[name = string("op_626_end_0"), val = tensor<int32, [4]>([1, 8, 64, 1])];
+            tensor<bool, [4]> var_626_end_mask_0 = const()[name = string("op_626_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 8, 64, 1]> var_626_cast_fp16 = slice_by_index(begin = var_626_begin_0, end = var_626_end_0, end_mask = var_626_end_mask_0, x = mh_k_5_cast_fp16)[name = string("op_626_cast_fp16")];
+            tensor<int32, [4]> var_632_begin_0 = const()[name = string("op_632_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_632_end_0 = const()[name = string("op_632_end_0"), val = tensor<int32, [4]>([1, 8, 128, 1])];
+            tensor<bool, [4]> var_632_end_mask_0 = const()[name = string("op_632_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 8, 64, 1]> var_632_cast_fp16 = slice_by_index(begin = var_632_begin_0, end = var_632_end_0, end_mask = var_632_end_mask_0, x = mh_k_5_cast_fp16)[name = string("op_632_cast_fp16")];
+            fp16 const_43_promoted_to_fp16 = const()[name = string("const_43_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 64, 1]> var_634_cast_fp16 = mul(x = var_632_cast_fp16, y = const_43_promoted_to_fp16)[name = string("op_634_cast_fp16")];
+            bool var_636_interleave_0 = const()[name = string("op_636_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 128, 1]> var_636_cast_fp16 = concat(axis = var_496, interleave = var_636_interleave_0, values = (var_634_cast_fp16, var_626_cast_fp16))[name = string("op_636_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> var_637_cast_fp16 = mul(x = var_636_cast_fp16, y = sin_1_cast_fp16)[name = string("op_637_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> mh_k_7_cast_fp16 = add(x = var_621_cast_fp16, y = var_637_cast_fp16)[name = string("mh_k_7_cast_fp16")];
+            tensor<int32, [4]> var_641 = const()[name = string("op_641"), val = tensor<int32, [4]>([1, 1024, 1, 1])];
+            tensor<fp16, [1, 1024, 1, 1]> current_key_7_cast_fp16 = reshape(shape = var_641, x = mh_k_7_cast_fp16)[name = string("current_key_7_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 16]> var_648_cast_fp16 = mul(x = var_84_cast_fp16_1, y = var_260_cast_fp16)[name = string("op_648_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 16]> var_649_cast_fp16 = mul(x = current_key_7_cast_fp16, y = var_258_cast_fp16)[name = string("op_649_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 16]> key_9_cast_fp16 = add(x = var_648_cast_fp16, y = var_649_cast_fp16)[name = string("key_9_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 16]> var_652_cast_fp16 = mul(x = var_92_cast_fp16_1, y = var_260_cast_fp16)[name = string("op_652_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 16]> var_653_cast_fp16 = mul(x = current_value_3_cast_fp16, y = var_258_cast_fp16)[name = string("op_653_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 16]> value_5_cast_fp16 = add(x = var_652_cast_fp16, y = var_653_cast_fp16)[name = string("value_5_cast_fp16")];
+            tensor<int32, [4]> var_657 = const()[name = string("op_657"), val = tensor<int32, [4]>([1, 8, 128, 16])];
+            tensor<fp16, [1, 8, 128, 16]> key_heads_5_cast_fp16 = reshape(shape = var_657, x = key_9_cast_fp16)[name = string("key_heads_5_cast_fp16")];
+            tensor<int32, [4]> var_659 = const()[name = string("op_659"), val = tensor<int32, [4]>([1, 8, 128, 16])];
+            tensor<fp16, [1, 8, 128, 16]> value_heads_5_cast_fp16 = reshape(shape = var_659, x = value_5_cast_fp16)[name = string("value_heads_5_cast_fp16")];
+            tensor<int32, [4]> var_662_begin_0 = const()[name = string("op_662_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_662_end_0 = const()[name = string("op_662_end_0"), val = tensor<int32, [4]>([1, 1, 128, 16])];
+            tensor<bool, [4]> var_662_end_mask_0 = const()[name = string("op_662_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_662_cast_fp16 = slice_by_index(begin = var_662_begin_0, end = var_662_end_0, end_mask = var_662_end_mask_0, x = key_heads_5_cast_fp16)[name = string("op_662_cast_fp16")];
+            tensor<int32, [4]> var_666_begin_0 = const()[name = string("op_666_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_666_end_0 = const()[name = string("op_666_end_0"), val = tensor<int32, [4]>([1, 1, 128, 16])];
+            tensor<bool, [4]> var_666_end_mask_0 = const()[name = string("op_666_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_666_cast_fp16 = slice_by_index(begin = var_666_begin_0, end = var_666_end_0, end_mask = var_666_end_mask_0, x = value_heads_5_cast_fp16)[name = string("op_666_cast_fp16")];
+            tensor<int32, [4]> var_678_begin_0 = const()[name = string("op_678_begin_0"), val = tensor<int32, [4]>([0, 1, 0, 0])];
+            tensor<int32, [4]> var_678_end_0 = const()[name = string("op_678_end_0"), val = tensor<int32, [4]>([1, 2, 128, 16])];
+            tensor<bool, [4]> var_678_end_mask_0 = const()[name = string("op_678_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_678_cast_fp16 = slice_by_index(begin = var_678_begin_0, end = var_678_end_0, end_mask = var_678_end_mask_0, x = key_heads_5_cast_fp16)[name = string("op_678_cast_fp16")];
+            tensor<int32, [4]> var_682_begin_0 = const()[name = string("op_682_begin_0"), val = tensor<int32, [4]>([0, 1, 0, 0])];
+            tensor<int32, [4]> var_682_end_0 = const()[name = string("op_682_end_0"), val = tensor<int32, [4]>([1, 2, 128, 16])];
+            tensor<bool, [4]> var_682_end_mask_0 = const()[name = string("op_682_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_682_cast_fp16 = slice_by_index(begin = var_682_begin_0, end = var_682_end_0, end_mask = var_682_end_mask_0, x = value_heads_5_cast_fp16)[name = string("op_682_cast_fp16")];
+            tensor<int32, [4]> var_694_begin_0 = const()[name = string("op_694_begin_0"), val = tensor<int32, [4]>([0, 2, 0, 0])];
+            tensor<int32, [4]> var_694_end_0 = const()[name = string("op_694_end_0"), val = tensor<int32, [4]>([1, 3, 128, 16])];
+            tensor<bool, [4]> var_694_end_mask_0 = const()[name = string("op_694_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_694_cast_fp16 = slice_by_index(begin = var_694_begin_0, end = var_694_end_0, end_mask = var_694_end_mask_0, x = key_heads_5_cast_fp16)[name = string("op_694_cast_fp16")];
+            tensor<int32, [4]> var_698_begin_0 = const()[name = string("op_698_begin_0"), val = tensor<int32, [4]>([0, 2, 0, 0])];
+            tensor<int32, [4]> var_698_end_0 = const()[name = string("op_698_end_0"), val = tensor<int32, [4]>([1, 3, 128, 16])];
+            tensor<bool, [4]> var_698_end_mask_0 = const()[name = string("op_698_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_698_cast_fp16 = slice_by_index(begin = var_698_begin_0, end = var_698_end_0, end_mask = var_698_end_mask_0, x = value_heads_5_cast_fp16)[name = string("op_698_cast_fp16")];
+            tensor<int32, [4]> var_710_begin_0 = const()[name = string("op_710_begin_0"), val = tensor<int32, [4]>([0, 3, 0, 0])];
+            tensor<int32, [4]> var_710_end_0 = const()[name = string("op_710_end_0"), val = tensor<int32, [4]>([1, 4, 128, 16])];
+            tensor<bool, [4]> var_710_end_mask_0 = const()[name = string("op_710_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_710_cast_fp16 = slice_by_index(begin = var_710_begin_0, end = var_710_end_0, end_mask = var_710_end_mask_0, x = key_heads_5_cast_fp16)[name = string("op_710_cast_fp16")];
+            tensor<int32, [4]> var_714_begin_0 = const()[name = string("op_714_begin_0"), val = tensor<int32, [4]>([0, 3, 0, 0])];
+            tensor<int32, [4]> var_714_end_0 = const()[name = string("op_714_end_0"), val = tensor<int32, [4]>([1, 4, 128, 16])];
+            tensor<bool, [4]> var_714_end_mask_0 = const()[name = string("op_714_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_714_cast_fp16 = slice_by_index(begin = var_714_begin_0, end = var_714_end_0, end_mask = var_714_end_mask_0, x = value_heads_5_cast_fp16)[name = string("op_714_cast_fp16")];
+            tensor<int32, [4]> var_726_begin_0 = const()[name = string("op_726_begin_0"), val = tensor<int32, [4]>([0, 4, 0, 0])];
+            tensor<int32, [4]> var_726_end_0 = const()[name = string("op_726_end_0"), val = tensor<int32, [4]>([1, 5, 128, 16])];
+            tensor<bool, [4]> var_726_end_mask_0 = const()[name = string("op_726_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_726_cast_fp16 = slice_by_index(begin = var_726_begin_0, end = var_726_end_0, end_mask = var_726_end_mask_0, x = key_heads_5_cast_fp16)[name = string("op_726_cast_fp16")];
+            tensor<int32, [4]> var_730_begin_0 = const()[name = string("op_730_begin_0"), val = tensor<int32, [4]>([0, 4, 0, 0])];
+            tensor<int32, [4]> var_730_end_0 = const()[name = string("op_730_end_0"), val = tensor<int32, [4]>([1, 5, 128, 16])];
+            tensor<bool, [4]> var_730_end_mask_0 = const()[name = string("op_730_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_730_cast_fp16 = slice_by_index(begin = var_730_begin_0, end = var_730_end_0, end_mask = var_730_end_mask_0, x = value_heads_5_cast_fp16)[name = string("op_730_cast_fp16")];
+            tensor<int32, [4]> var_742_begin_0 = const()[name = string("op_742_begin_0"), val = tensor<int32, [4]>([0, 5, 0, 0])];
+            tensor<int32, [4]> var_742_end_0 = const()[name = string("op_742_end_0"), val = tensor<int32, [4]>([1, 6, 128, 16])];
+            tensor<bool, [4]> var_742_end_mask_0 = const()[name = string("op_742_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_742_cast_fp16 = slice_by_index(begin = var_742_begin_0, end = var_742_end_0, end_mask = var_742_end_mask_0, x = key_heads_5_cast_fp16)[name = string("op_742_cast_fp16")];
+            tensor<int32, [4]> var_746_begin_0 = const()[name = string("op_746_begin_0"), val = tensor<int32, [4]>([0, 5, 0, 0])];
+            tensor<int32, [4]> var_746_end_0 = const()[name = string("op_746_end_0"), val = tensor<int32, [4]>([1, 6, 128, 16])];
+            tensor<bool, [4]> var_746_end_mask_0 = const()[name = string("op_746_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_746_cast_fp16 = slice_by_index(begin = var_746_begin_0, end = var_746_end_0, end_mask = var_746_end_mask_0, x = value_heads_5_cast_fp16)[name = string("op_746_cast_fp16")];
+            tensor<int32, [4]> var_758_begin_0 = const()[name = string("op_758_begin_0"), val = tensor<int32, [4]>([0, 6, 0, 0])];
+            tensor<int32, [4]> var_758_end_0 = const()[name = string("op_758_end_0"), val = tensor<int32, [4]>([1, 7, 128, 16])];
+            tensor<bool, [4]> var_758_end_mask_0 = const()[name = string("op_758_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_758_cast_fp16 = slice_by_index(begin = var_758_begin_0, end = var_758_end_0, end_mask = var_758_end_mask_0, x = key_heads_5_cast_fp16)[name = string("op_758_cast_fp16")];
+            tensor<int32, [4]> var_762_begin_0 = const()[name = string("op_762_begin_0"), val = tensor<int32, [4]>([0, 6, 0, 0])];
+            tensor<int32, [4]> var_762_end_0 = const()[name = string("op_762_end_0"), val = tensor<int32, [4]>([1, 7, 128, 16])];
+            tensor<bool, [4]> var_762_end_mask_0 = const()[name = string("op_762_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_762_cast_fp16 = slice_by_index(begin = var_762_begin_0, end = var_762_end_0, end_mask = var_762_end_mask_0, x = value_heads_5_cast_fp16)[name = string("op_762_cast_fp16")];
+            tensor<int32, [4]> var_774_begin_0 = const()[name = string("op_774_begin_0"), val = tensor<int32, [4]>([0, 7, 0, 0])];
+            tensor<int32, [4]> var_774_end_0 = const()[name = string("op_774_end_0"), val = tensor<int32, [4]>([1, 1, 128, 16])];
+            tensor<bool, [4]> var_774_end_mask_0 = const()[name = string("op_774_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_774_cast_fp16 = slice_by_index(begin = var_774_begin_0, end = var_774_end_0, end_mask = var_774_end_mask_0, x = key_heads_5_cast_fp16)[name = string("op_774_cast_fp16")];
+            tensor<int32, [4]> var_778_begin_0 = const()[name = string("op_778_begin_0"), val = tensor<int32, [4]>([0, 7, 0, 0])];
+            tensor<int32, [4]> var_778_end_0 = const()[name = string("op_778_end_0"), val = tensor<int32, [4]>([1, 1, 128, 16])];
+            tensor<bool, [4]> var_778_end_mask_0 = const()[name = string("op_778_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_778_cast_fp16 = slice_by_index(begin = var_778_begin_0, end = var_778_end_0, end_mask = var_778_end_mask_0, x = value_heads_5_cast_fp16)[name = string("op_778_cast_fp16")];
+            bool key_heads_7_interleave_0 = const()[name = string("key_heads_7_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 16]> key_heads_7_cast_fp16 = concat(axis = var_504, interleave = key_heads_7_interleave_0, values = (var_662_cast_fp16, var_662_cast_fp16, var_678_cast_fp16, var_678_cast_fp16, var_694_cast_fp16, var_694_cast_fp16, var_710_cast_fp16, var_710_cast_fp16, var_726_cast_fp16, var_726_cast_fp16, var_742_cast_fp16, var_742_cast_fp16, var_758_cast_fp16, var_758_cast_fp16, var_774_cast_fp16, var_774_cast_fp16))[name = string("key_heads_7_cast_fp16")];
+            bool value_heads_7_interleave_0 = const()[name = string("value_heads_7_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 16]> value_heads_7_cast_fp16 = concat(axis = var_504, interleave = value_heads_7_interleave_0, values = (var_666_cast_fp16, var_666_cast_fp16, var_682_cast_fp16, var_682_cast_fp16, var_698_cast_fp16, var_698_cast_fp16, var_714_cast_fp16, var_714_cast_fp16, var_730_cast_fp16, var_730_cast_fp16, var_746_cast_fp16, var_746_cast_fp16, var_762_cast_fp16, var_762_cast_fp16, var_778_cast_fp16, var_778_cast_fp16))[name = string("value_heads_7_cast_fp16")];
+            fp16 var_801_to_fp16 = const()[name = string("op_801_to_fp16"), val = fp16(0x1.6ap-4)];
+            tensor<fp16, [1, 16, 128, 1]> var_802_cast_fp16 = mul(x = mh_q_9_cast_fp16, y = var_801_to_fp16)[name = string("op_802_cast_fp16")];
+            bool mh_w_5_transpose_x_0 = const()[name = string("mh_w_5_transpose_x_0"), val = bool(true)];
+            bool mh_w_5_transpose_y_0 = const()[name = string("mh_w_5_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 1, 16]> mh_w_5_cast_fp16 = matmul(transpose_x = mh_w_5_transpose_x_0, transpose_y = mh_w_5_transpose_y_0, x = var_802_cast_fp16, y = key_heads_7_cast_fp16)[name = string("mh_w_5_cast_fp16")];
+            tensor<fp16, [1, 16, 1, 16]> mh_w_7_cast_fp16 = add(x = mh_w_5_cast_fp16, y = var_424_cast_fp16)[name = string("mh_w_7_cast_fp16")];
+            tensor<fp16, [1, 16, 1, 16]> var_814_cast_fp16 = softmax(axis = var_486, x = mh_w_7_cast_fp16)[name = string("op_814_cast_fp16")];
+            bool attn_3_transpose_x_0 = const()[name = string("attn_3_transpose_x_0"), val = bool(false)];
+            bool attn_3_transpose_y_0 = const()[name = string("attn_3_transpose_y_0"), val = bool(true)];
+            tensor<fp16, [1, 16, 128, 1]> attn_3_cast_fp16 = matmul(transpose_x = attn_3_transpose_x_0, transpose_y = attn_3_transpose_y_0, x = value_heads_7_cast_fp16, y = var_814_cast_fp16)[name = string("attn_3_cast_fp16")];
+            tensor<int32, [4]> var_819 = const()[name = string("op_819"), val = tensor<int32, [4]>([1, -1, 1, 1])];
+            tensor<fp16, [1, 2048, 1, 1]> input_9_cast_fp16 = reshape(shape = var_819, x = attn_3_cast_fp16)[name = string("input_9_cast_fp16")];
+            string obj_19_pad_type_0 = const()[name = string("obj_19_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> obj_19_strides_0 = const()[name = string("obj_19_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> obj_19_pad_0 = const()[name = string("obj_19_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> obj_19_dilations_0 = const()[name = string("obj_19_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 obj_19_groups_0 = const()[name = string("obj_19_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 2048, 1, 1]> layers_1_self_attn_o_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(19951616))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(22048832))))[name = string("layers_1_self_attn_o_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> obj_19_cast_fp16 = conv(bias = layers_0_self_attn_v_proj_bias_to_fp16, dilations = obj_19_dilations_0, groups = obj_19_groups_0, pad = obj_19_pad_0, pad_type = obj_19_pad_type_0, strides = obj_19_strides_0, weight = layers_1_self_attn_o_proj_weight_to_fp16_palettized, x = input_9_cast_fp16)[name = string("obj_19_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> inputs_13_cast_fp16 = add(x = inputs_7_cast_fp16, y = obj_19_cast_fp16)[name = string("inputs_13_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> inputs_sq_15_cast_fp16 = mul(x = inputs_13_cast_fp16, y = inputs_13_cast_fp16)[name = string("inputs_sq_15_cast_fp16")];
+            tensor<int32, [1]> variance_15_axes_0 = const()[name = string("variance_15_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_15_keep_dims_0 = const()[name = string("variance_15_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 1, 1, 1]> variance_15_cast_fp16 = reduce_mean(axes = variance_15_axes_0, keep_dims = variance_15_keep_dims_0, x = inputs_sq_15_cast_fp16)[name = string("variance_15_cast_fp16")];
+            fp16 var_837_to_fp16 = const()[name = string("op_837_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 1, 1]> var_838_cast_fp16 = add(x = variance_15_cast_fp16, y = var_837_to_fp16)[name = string("op_838_cast_fp16")];
+            fp32 var_839_epsilon_0 = const()[name = string("op_839_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 1, 1, 1]> var_839_cast_fp16 = rsqrt(epsilon = var_839_epsilon_0, x = var_838_cast_fp16)[name = string("op_839_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> hidden_states_17_cast_fp16 = mul(x = inputs_13_cast_fp16, y = var_839_cast_fp16)[name = string("hidden_states_17_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> w_15_to_fp16 = const()[name = string("w_15_to_fp16"), val = tensor<fp16, [1, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(22049408)))];
+            tensor<fp16, [1, 1024, 1, 1]> input_11_cast_fp16 = mul(x = w_15_to_fp16, y = hidden_states_17_cast_fp16)[name = string("input_11_cast_fp16")];
+            string input_13_pad_type_0 = const()[name = string("input_13_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> input_13_strides_0 = const()[name = string("input_13_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> input_13_pad_0 = const()[name = string("input_13_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> input_13_dilations_0 = const()[name = string("input_13_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 input_13_groups_0 = const()[name = string("input_13_groups_0"), val = int32(1)];
+            tensor<fp16, [3072, 1024, 1, 1]> layers_1_mlp_gate_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [3072, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(22051520))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(25197312))))[name = string("layers_1_mlp_gate_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 3072, 1, 1]> input_13_cast_fp16 = conv(dilations = input_13_dilations_0, groups = input_13_groups_0, pad = input_13_pad_0, pad_type = input_13_pad_type_0, strides = input_13_strides_0, weight = layers_1_mlp_gate_proj_weight_to_fp16_palettized, x = input_11_cast_fp16)[name = string("input_13_cast_fp16")];
+            tensor<fp16, [1, 3072, 1, 1]> var_853_cast_fp16 = silu(x = input_13_cast_fp16)[name = string("op_853_cast_fp16")];
+            string var_859_pad_type_0 = const()[name = string("op_859_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_859_strides_0 = const()[name = string("op_859_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_859_pad_0 = const()[name = string("op_859_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_859_dilations_0 = const()[name = string("op_859_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_859_groups_0 = const()[name = string("op_859_groups_0"), val = int32(1)];
+            tensor<fp16, [3072, 1024, 1, 1]> layers_1_mlp_up_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [3072, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(25197888))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(28343680))))[name = string("layers_1_mlp_up_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 3072, 1, 1]> var_859_cast_fp16 = conv(dilations = var_859_dilations_0, groups = var_859_groups_0, pad = var_859_pad_0, pad_type = var_859_pad_type_0, strides = var_859_strides_0, weight = layers_1_mlp_up_proj_weight_to_fp16_palettized, x = input_11_cast_fp16)[name = string("op_859_cast_fp16")];
+            tensor<fp16, [1, 3072, 1, 1]> input_15_cast_fp16 = mul(x = var_853_cast_fp16, y = var_859_cast_fp16)[name = string("input_15_cast_fp16")];
+            string hidden_states_19_pad_type_0 = const()[name = string("hidden_states_19_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> hidden_states_19_strides_0 = const()[name = string("hidden_states_19_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> hidden_states_19_pad_0 = const()[name = string("hidden_states_19_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> hidden_states_19_dilations_0 = const()[name = string("hidden_states_19_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 hidden_states_19_groups_0 = const()[name = string("hidden_states_19_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 3072, 1, 1]> layers_1_mlp_down_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 3072, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(28344256))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(31490048))))[name = string("layers_1_mlp_down_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> hidden_states_19_cast_fp16 = conv(dilations = hidden_states_19_dilations_0, groups = hidden_states_19_groups_0, pad = hidden_states_19_pad_0, pad_type = hidden_states_19_pad_type_0, strides = hidden_states_19_strides_0, weight = layers_1_mlp_down_proj_weight_to_fp16_palettized, x = input_15_cast_fp16)[name = string("hidden_states_19_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> inputs_15_cast_fp16 = add(x = inputs_13_cast_fp16, y = hidden_states_19_cast_fp16)[name = string("inputs_15_cast_fp16")];
+            int32 var_873 = const()[name = string("op_873"), val = int32(3)];
+            int32 var_883 = const()[name = string("op_883"), val = int32(-2)];
+            int32 var_891 = const()[name = string("op_891"), val = int32(1)];
+            tensor<fp16, [1, 1024, 1, 1]> inputs_sq_17_cast_fp16 = mul(x = inputs_15_cast_fp16, y = inputs_15_cast_fp16)[name = string("inputs_sq_17_cast_fp16")];
+            tensor<int32, [1]> variance_17_axes_0 = const()[name = string("variance_17_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_17_keep_dims_0 = const()[name = string("variance_17_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 1, 1, 1]> variance_17_cast_fp16 = reduce_mean(axes = variance_17_axes_0, keep_dims = variance_17_keep_dims_0, x = inputs_sq_17_cast_fp16)[name = string("variance_17_cast_fp16")];
+            fp16 var_903_to_fp16 = const()[name = string("op_903_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 1, 1]> var_904_cast_fp16 = add(x = variance_17_cast_fp16, y = var_903_to_fp16)[name = string("op_904_cast_fp16")];
+            fp32 var_905_epsilon_0 = const()[name = string("op_905_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 1, 1, 1]> var_905_cast_fp16 = rsqrt(epsilon = var_905_epsilon_0, x = var_904_cast_fp16)[name = string("op_905_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> hidden_states_21_cast_fp16 = mul(x = inputs_15_cast_fp16, y = var_905_cast_fp16)[name = string("hidden_states_21_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> w_17_to_fp16 = const()[name = string("w_17_to_fp16"), val = tensor<fp16, [1, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(31490624)))];
+            tensor<fp16, [1, 1024, 1, 1]> obj_21_cast_fp16 = mul(x = w_17_to_fp16, y = hidden_states_21_cast_fp16)[name = string("obj_21_cast_fp16")];
+            string query_13_pad_type_0 = const()[name = string("query_13_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> query_13_strides_0 = const()[name = string("query_13_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> query_13_pad_0 = const()[name = string("query_13_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> query_13_dilations_0 = const()[name = string("query_13_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 query_13_groups_0 = const()[name = string("query_13_groups_0"), val = int32(1)];
+            tensor<fp16, [2048, 1024, 1, 1]> layers_2_self_attn_q_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [2048, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(31492736))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(33589952))))[name = string("layers_2_self_attn_q_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1, 1]> query_13_cast_fp16 = conv(bias = layers_0_self_attn_q_proj_bias_to_fp16, dilations = query_13_dilations_0, groups = query_13_groups_0, pad = query_13_pad_0, pad_type = query_13_pad_type_0, strides = query_13_strides_0, weight = layers_2_self_attn_q_proj_weight_to_fp16_palettized, x = obj_21_cast_fp16)[name = string("query_13_cast_fp16")];
+            string current_key_9_pad_type_0 = const()[name = string("current_key_9_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> current_key_9_strides_0 = const()[name = string("current_key_9_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> current_key_9_pad_0 = const()[name = string("current_key_9_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> current_key_9_dilations_0 = const()[name = string("current_key_9_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 current_key_9_groups_0 = const()[name = string("current_key_9_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_2_self_attn_k_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(33590528))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(34639168))))[name = string("layers_2_self_attn_k_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> current_key_9_cast_fp16 = conv(dilations = current_key_9_dilations_0, groups = current_key_9_groups_0, pad = current_key_9_pad_0, pad_type = current_key_9_pad_type_0, strides = current_key_9_strides_0, weight = layers_2_self_attn_k_proj_weight_to_fp16_palettized, x = obj_21_cast_fp16)[name = string("current_key_9_cast_fp16")];
+            string current_value_5_pad_type_0 = const()[name = string("current_value_5_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> current_value_5_strides_0 = const()[name = string("current_value_5_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> current_value_5_pad_0 = const()[name = string("current_value_5_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> current_value_5_dilations_0 = const()[name = string("current_value_5_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 current_value_5_groups_0 = const()[name = string("current_value_5_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_2_self_attn_v_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(34639744))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(35688384))))[name = string("layers_2_self_attn_v_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> current_value_5_cast_fp16 = conv(bias = layers_0_self_attn_v_proj_bias_to_fp16, dilations = current_value_5_dilations_0, groups = current_value_5_groups_0, pad = current_value_5_pad_0, pad_type = current_value_5_pad_type_0, strides = current_value_5_strides_0, weight = layers_2_self_attn_v_proj_weight_to_fp16_palettized, x = obj_21_cast_fp16)[name = string("current_value_5_cast_fp16")];
+            tensor<int32, [4]> var_942 = const()[name = string("op_942"), val = tensor<int32, [4]>([16, 128, 1, 1])];
+            tensor<fp16, [16, 128, 1, 1]> inputs_17_cast_fp16 = reshape(shape = var_942, x = query_13_cast_fp16)[name = string("inputs_17_cast_fp16")];
+            tensor<fp16, [16, 128, 1, 1]> inputs_sq_19_cast_fp16 = mul(x = inputs_17_cast_fp16, y = inputs_17_cast_fp16)[name = string("inputs_sq_19_cast_fp16")];
+            tensor<int32, [1]> variance_19_axes_0 = const()[name = string("variance_19_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_19_keep_dims_0 = const()[name = string("variance_19_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [16, 1, 1, 1]> variance_19_cast_fp16 = reduce_mean(axes = variance_19_axes_0, keep_dims = variance_19_keep_dims_0, x = inputs_sq_19_cast_fp16)[name = string("variance_19_cast_fp16")];
+            fp16 var_948_to_fp16 = const()[name = string("op_948_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [16, 1, 1, 1]> var_949_cast_fp16 = add(x = variance_19_cast_fp16, y = var_948_to_fp16)[name = string("op_949_cast_fp16")];
+            fp32 var_950_epsilon_0 = const()[name = string("op_950_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [16, 1, 1, 1]> var_950_cast_fp16 = rsqrt(epsilon = var_950_epsilon_0, x = var_949_cast_fp16)[name = string("op_950_cast_fp16")];
+            tensor<fp16, [16, 128, 1, 1]> hidden_states_23_cast_fp16 = mul(x = inputs_17_cast_fp16, y = var_950_cast_fp16)[name = string("hidden_states_23_cast_fp16")];
+            tensor<fp16, [1, 128, 1, 1]> w_19_to_fp16 = const()[name = string("w_19_to_fp16"), val = tensor<fp16, [1, 128, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(35688960)))];
+            tensor<fp16, [16, 128, 1, 1]> query_normed_5_cast_fp16 = mul(x = w_19_to_fp16, y = hidden_states_23_cast_fp16)[name = string("query_normed_5_cast_fp16")];
+            tensor<int32, [4]> var_958 = const()[name = string("op_958"), val = tensor<int32, [4]>([8, 128, 1, 1])];
+            tensor<fp16, [8, 128, 1, 1]> inputs_19_cast_fp16 = reshape(shape = var_958, x = current_key_9_cast_fp16)[name = string("inputs_19_cast_fp16")];
+            tensor<fp16, [8, 128, 1, 1]> inputs_sq_21_cast_fp16 = mul(x = inputs_19_cast_fp16, y = inputs_19_cast_fp16)[name = string("inputs_sq_21_cast_fp16")];
+            tensor<int32, [1]> variance_21_axes_0 = const()[name = string("variance_21_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_21_keep_dims_0 = const()[name = string("variance_21_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [8, 1, 1, 1]> variance_21_cast_fp16 = reduce_mean(axes = variance_21_axes_0, keep_dims = variance_21_keep_dims_0, x = inputs_sq_21_cast_fp16)[name = string("variance_21_cast_fp16")];
+            fp16 var_964_to_fp16 = const()[name = string("op_964_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [8, 1, 1, 1]> var_965_cast_fp16 = add(x = variance_21_cast_fp16, y = var_964_to_fp16)[name = string("op_965_cast_fp16")];
+            fp32 var_966_epsilon_0 = const()[name = string("op_966_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [8, 1, 1, 1]> var_966_cast_fp16 = rsqrt(epsilon = var_966_epsilon_0, x = var_965_cast_fp16)[name = string("op_966_cast_fp16")];
+            tensor<fp16, [8, 128, 1, 1]> hidden_states_25_cast_fp16 = mul(x = inputs_19_cast_fp16, y = var_966_cast_fp16)[name = string("hidden_states_25_cast_fp16")];
+            tensor<fp16, [1, 128, 1, 1]> w_21_to_fp16 = const()[name = string("w_21_to_fp16"), val = tensor<fp16, [1, 128, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(35689280)))];
+            tensor<fp16, [8, 128, 1, 1]> current_key_normed_5_cast_fp16 = mul(x = w_21_to_fp16, y = hidden_states_25_cast_fp16)[name = string("current_key_normed_5_cast_fp16")];
+            tensor<int32, [4]> var_984 = const()[name = string("op_984"), val = tensor<int32, [4]>([1, 16, 128, -1])];
+            tensor<fp16, [1, 16, 128, 1]> mh_q_13_cast_fp16 = reshape(shape = var_984, x = query_normed_5_cast_fp16)[name = string("mh_q_13_cast_fp16")];
+            tensor<int32, [4]> var_986 = const()[name = string("op_986"), val = tensor<int32, [4]>([1, 8, 128, -1])];
+            tensor<fp16, [1, 8, 128, 1]> mh_k_9_cast_fp16 = reshape(shape = var_986, x = current_key_normed_5_cast_fp16)[name = string("mh_k_9_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> var_990_cast_fp16 = mul(x = mh_q_13_cast_fp16, y = cos_1_cast_fp16)[name = string("op_990_cast_fp16")];
+            tensor<int32, [4]> var_995_begin_0 = const()[name = string("op_995_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_995_end_0 = const()[name = string("op_995_end_0"), val = tensor<int32, [4]>([1, 16, 64, 1])];
+            tensor<bool, [4]> var_995_end_mask_0 = const()[name = string("op_995_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 16, 64, 1]> var_995_cast_fp16 = slice_by_index(begin = var_995_begin_0, end = var_995_end_0, end_mask = var_995_end_mask_0, x = mh_q_13_cast_fp16)[name = string("op_995_cast_fp16")];
+            tensor<int32, [4]> var_1001_begin_0 = const()[name = string("op_1001_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_1001_end_0 = const()[name = string("op_1001_end_0"), val = tensor<int32, [4]>([1, 16, 128, 1])];
+            tensor<bool, [4]> var_1001_end_mask_0 = const()[name = string("op_1001_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 16, 64, 1]> var_1001_cast_fp16 = slice_by_index(begin = var_1001_begin_0, end = var_1001_end_0, end_mask = var_1001_end_mask_0, x = mh_q_13_cast_fp16)[name = string("op_1001_cast_fp16")];
+            fp16 const_63_promoted_to_fp16 = const()[name = string("const_63_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 16, 64, 1]> var_1003_cast_fp16 = mul(x = var_1001_cast_fp16, y = const_63_promoted_to_fp16)[name = string("op_1003_cast_fp16")];
+            bool var_1005_interleave_0 = const()[name = string("op_1005_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 1]> var_1005_cast_fp16 = concat(axis = var_883, interleave = var_1005_interleave_0, values = (var_1003_cast_fp16, var_995_cast_fp16))[name = string("op_1005_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> var_1006_cast_fp16 = mul(x = var_1005_cast_fp16, y = sin_1_cast_fp16)[name = string("op_1006_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> mh_q_15_cast_fp16 = add(x = var_990_cast_fp16, y = var_1006_cast_fp16)[name = string("mh_q_15_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> var_1008_cast_fp16 = mul(x = mh_k_9_cast_fp16, y = cos_1_cast_fp16)[name = string("op_1008_cast_fp16")];
+            tensor<int32, [4]> var_1013_begin_0 = const()[name = string("op_1013_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_1013_end_0 = const()[name = string("op_1013_end_0"), val = tensor<int32, [4]>([1, 8, 64, 1])];
+            tensor<bool, [4]> var_1013_end_mask_0 = const()[name = string("op_1013_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 8, 64, 1]> var_1013_cast_fp16 = slice_by_index(begin = var_1013_begin_0, end = var_1013_end_0, end_mask = var_1013_end_mask_0, x = mh_k_9_cast_fp16)[name = string("op_1013_cast_fp16")];
+            tensor<int32, [4]> var_1019_begin_0 = const()[name = string("op_1019_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_1019_end_0 = const()[name = string("op_1019_end_0"), val = tensor<int32, [4]>([1, 8, 128, 1])];
+            tensor<bool, [4]> var_1019_end_mask_0 = const()[name = string("op_1019_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 8, 64, 1]> var_1019_cast_fp16 = slice_by_index(begin = var_1019_begin_0, end = var_1019_end_0, end_mask = var_1019_end_mask_0, x = mh_k_9_cast_fp16)[name = string("op_1019_cast_fp16")];
+            fp16 const_66_promoted_to_fp16 = const()[name = string("const_66_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 64, 1]> var_1021_cast_fp16 = mul(x = var_1019_cast_fp16, y = const_66_promoted_to_fp16)[name = string("op_1021_cast_fp16")];
+            bool var_1023_interleave_0 = const()[name = string("op_1023_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 128, 1]> var_1023_cast_fp16 = concat(axis = var_883, interleave = var_1023_interleave_0, values = (var_1021_cast_fp16, var_1013_cast_fp16))[name = string("op_1023_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> var_1024_cast_fp16 = mul(x = var_1023_cast_fp16, y = sin_1_cast_fp16)[name = string("op_1024_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> mh_k_11_cast_fp16 = add(x = var_1008_cast_fp16, y = var_1024_cast_fp16)[name = string("mh_k_11_cast_fp16")];
+            tensor<int32, [4]> var_1028 = const()[name = string("op_1028"), val = tensor<int32, [4]>([1, 1024, 1, 1])];
+            tensor<fp16, [1, 1024, 1, 1]> current_key_11_cast_fp16 = reshape(shape = var_1028, x = mh_k_11_cast_fp16)[name = string("current_key_11_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 16]> var_1035_cast_fp16 = mul(x = var_84_cast_fp16_2, y = var_260_cast_fp16)[name = string("op_1035_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 16]> var_1036_cast_fp16 = mul(x = current_key_11_cast_fp16, y = var_258_cast_fp16)[name = string("op_1036_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 16]> key_15_cast_fp16 = add(x = var_1035_cast_fp16, y = var_1036_cast_fp16)[name = string("key_15_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 16]> var_1039_cast_fp16 = mul(x = var_92_cast_fp16_2, y = var_260_cast_fp16)[name = string("op_1039_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 16]> var_1040_cast_fp16 = mul(x = current_value_5_cast_fp16, y = var_258_cast_fp16)[name = string("op_1040_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 16]> value_9_cast_fp16 = add(x = var_1039_cast_fp16, y = var_1040_cast_fp16)[name = string("value_9_cast_fp16")];
+            tensor<int32, [4]> var_1044 = const()[name = string("op_1044"), val = tensor<int32, [4]>([1, 8, 128, 16])];
+            tensor<fp16, [1, 8, 128, 16]> key_heads_9_cast_fp16 = reshape(shape = var_1044, x = key_15_cast_fp16)[name = string("key_heads_9_cast_fp16")];
+            tensor<int32, [4]> var_1046 = const()[name = string("op_1046"), val = tensor<int32, [4]>([1, 8, 128, 16])];
+            tensor<fp16, [1, 8, 128, 16]> value_heads_9_cast_fp16 = reshape(shape = var_1046, x = value_9_cast_fp16)[name = string("value_heads_9_cast_fp16")];
+            tensor<int32, [4]> var_1049_begin_0 = const()[name = string("op_1049_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_1049_end_0 = const()[name = string("op_1049_end_0"), val = tensor<int32, [4]>([1, 1, 128, 16])];
+            tensor<bool, [4]> var_1049_end_mask_0 = const()[name = string("op_1049_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_1049_cast_fp16 = slice_by_index(begin = var_1049_begin_0, end = var_1049_end_0, end_mask = var_1049_end_mask_0, x = key_heads_9_cast_fp16)[name = string("op_1049_cast_fp16")];
+            tensor<int32, [4]> var_1053_begin_0 = const()[name = string("op_1053_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_1053_end_0 = const()[name = string("op_1053_end_0"), val = tensor<int32, [4]>([1, 1, 128, 16])];
+            tensor<bool, [4]> var_1053_end_mask_0 = const()[name = string("op_1053_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_1053_cast_fp16 = slice_by_index(begin = var_1053_begin_0, end = var_1053_end_0, end_mask = var_1053_end_mask_0, x = value_heads_9_cast_fp16)[name = string("op_1053_cast_fp16")];
+            tensor<int32, [4]> var_1065_begin_0 = const()[name = string("op_1065_begin_0"), val = tensor<int32, [4]>([0, 1, 0, 0])];
+            tensor<int32, [4]> var_1065_end_0 = const()[name = string("op_1065_end_0"), val = tensor<int32, [4]>([1, 2, 128, 16])];
+            tensor<bool, [4]> var_1065_end_mask_0 = const()[name = string("op_1065_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_1065_cast_fp16 = slice_by_index(begin = var_1065_begin_0, end = var_1065_end_0, end_mask = var_1065_end_mask_0, x = key_heads_9_cast_fp16)[name = string("op_1065_cast_fp16")];
+            tensor<int32, [4]> var_1069_begin_0 = const()[name = string("op_1069_begin_0"), val = tensor<int32, [4]>([0, 1, 0, 0])];
+            tensor<int32, [4]> var_1069_end_0 = const()[name = string("op_1069_end_0"), val = tensor<int32, [4]>([1, 2, 128, 16])];
+            tensor<bool, [4]> var_1069_end_mask_0 = const()[name = string("op_1069_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_1069_cast_fp16 = slice_by_index(begin = var_1069_begin_0, end = var_1069_end_0, end_mask = var_1069_end_mask_0, x = value_heads_9_cast_fp16)[name = string("op_1069_cast_fp16")];
+            tensor<int32, [4]> var_1081_begin_0 = const()[name = string("op_1081_begin_0"), val = tensor<int32, [4]>([0, 2, 0, 0])];
+            tensor<int32, [4]> var_1081_end_0 = const()[name = string("op_1081_end_0"), val = tensor<int32, [4]>([1, 3, 128, 16])];
+            tensor<bool, [4]> var_1081_end_mask_0 = const()[name = string("op_1081_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_1081_cast_fp16 = slice_by_index(begin = var_1081_begin_0, end = var_1081_end_0, end_mask = var_1081_end_mask_0, x = key_heads_9_cast_fp16)[name = string("op_1081_cast_fp16")];
+            tensor<int32, [4]> var_1085_begin_0 = const()[name = string("op_1085_begin_0"), val = tensor<int32, [4]>([0, 2, 0, 0])];
+            tensor<int32, [4]> var_1085_end_0 = const()[name = string("op_1085_end_0"), val = tensor<int32, [4]>([1, 3, 128, 16])];
+            tensor<bool, [4]> var_1085_end_mask_0 = const()[name = string("op_1085_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_1085_cast_fp16 = slice_by_index(begin = var_1085_begin_0, end = var_1085_end_0, end_mask = var_1085_end_mask_0, x = value_heads_9_cast_fp16)[name = string("op_1085_cast_fp16")];
+            tensor<int32, [4]> var_1097_begin_0 = const()[name = string("op_1097_begin_0"), val = tensor<int32, [4]>([0, 3, 0, 0])];
+            tensor<int32, [4]> var_1097_end_0 = const()[name = string("op_1097_end_0"), val = tensor<int32, [4]>([1, 4, 128, 16])];
+            tensor<bool, [4]> var_1097_end_mask_0 = const()[name = string("op_1097_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_1097_cast_fp16 = slice_by_index(begin = var_1097_begin_0, end = var_1097_end_0, end_mask = var_1097_end_mask_0, x = key_heads_9_cast_fp16)[name = string("op_1097_cast_fp16")];
+            tensor<int32, [4]> var_1101_begin_0 = const()[name = string("op_1101_begin_0"), val = tensor<int32, [4]>([0, 3, 0, 0])];
+            tensor<int32, [4]> var_1101_end_0 = const()[name = string("op_1101_end_0"), val = tensor<int32, [4]>([1, 4, 128, 16])];
+            tensor<bool, [4]> var_1101_end_mask_0 = const()[name = string("op_1101_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_1101_cast_fp16 = slice_by_index(begin = var_1101_begin_0, end = var_1101_end_0, end_mask = var_1101_end_mask_0, x = value_heads_9_cast_fp16)[name = string("op_1101_cast_fp16")];
+            tensor<int32, [4]> var_1113_begin_0 = const()[name = string("op_1113_begin_0"), val = tensor<int32, [4]>([0, 4, 0, 0])];
+            tensor<int32, [4]> var_1113_end_0 = const()[name = string("op_1113_end_0"), val = tensor<int32, [4]>([1, 5, 128, 16])];
+            tensor<bool, [4]> var_1113_end_mask_0 = const()[name = string("op_1113_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_1113_cast_fp16 = slice_by_index(begin = var_1113_begin_0, end = var_1113_end_0, end_mask = var_1113_end_mask_0, x = key_heads_9_cast_fp16)[name = string("op_1113_cast_fp16")];
+            tensor<int32, [4]> var_1117_begin_0 = const()[name = string("op_1117_begin_0"), val = tensor<int32, [4]>([0, 4, 0, 0])];
+            tensor<int32, [4]> var_1117_end_0 = const()[name = string("op_1117_end_0"), val = tensor<int32, [4]>([1, 5, 128, 16])];
+            tensor<bool, [4]> var_1117_end_mask_0 = const()[name = string("op_1117_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_1117_cast_fp16 = slice_by_index(begin = var_1117_begin_0, end = var_1117_end_0, end_mask = var_1117_end_mask_0, x = value_heads_9_cast_fp16)[name = string("op_1117_cast_fp16")];
+            tensor<int32, [4]> var_1129_begin_0 = const()[name = string("op_1129_begin_0"), val = tensor<int32, [4]>([0, 5, 0, 0])];
+            tensor<int32, [4]> var_1129_end_0 = const()[name = string("op_1129_end_0"), val = tensor<int32, [4]>([1, 6, 128, 16])];
+            tensor<bool, [4]> var_1129_end_mask_0 = const()[name = string("op_1129_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_1129_cast_fp16 = slice_by_index(begin = var_1129_begin_0, end = var_1129_end_0, end_mask = var_1129_end_mask_0, x = key_heads_9_cast_fp16)[name = string("op_1129_cast_fp16")];
+            tensor<int32, [4]> var_1133_begin_0 = const()[name = string("op_1133_begin_0"), val = tensor<int32, [4]>([0, 5, 0, 0])];
+            tensor<int32, [4]> var_1133_end_0 = const()[name = string("op_1133_end_0"), val = tensor<int32, [4]>([1, 6, 128, 16])];
+            tensor<bool, [4]> var_1133_end_mask_0 = const()[name = string("op_1133_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_1133_cast_fp16 = slice_by_index(begin = var_1133_begin_0, end = var_1133_end_0, end_mask = var_1133_end_mask_0, x = value_heads_9_cast_fp16)[name = string("op_1133_cast_fp16")];
+            tensor<int32, [4]> var_1145_begin_0 = const()[name = string("op_1145_begin_0"), val = tensor<int32, [4]>([0, 6, 0, 0])];
+            tensor<int32, [4]> var_1145_end_0 = const()[name = string("op_1145_end_0"), val = tensor<int32, [4]>([1, 7, 128, 16])];
+            tensor<bool, [4]> var_1145_end_mask_0 = const()[name = string("op_1145_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_1145_cast_fp16 = slice_by_index(begin = var_1145_begin_0, end = var_1145_end_0, end_mask = var_1145_end_mask_0, x = key_heads_9_cast_fp16)[name = string("op_1145_cast_fp16")];
+            tensor<int32, [4]> var_1149_begin_0 = const()[name = string("op_1149_begin_0"), val = tensor<int32, [4]>([0, 6, 0, 0])];
+            tensor<int32, [4]> var_1149_end_0 = const()[name = string("op_1149_end_0"), val = tensor<int32, [4]>([1, 7, 128, 16])];
+            tensor<bool, [4]> var_1149_end_mask_0 = const()[name = string("op_1149_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_1149_cast_fp16 = slice_by_index(begin = var_1149_begin_0, end = var_1149_end_0, end_mask = var_1149_end_mask_0, x = value_heads_9_cast_fp16)[name = string("op_1149_cast_fp16")];
+            tensor<int32, [4]> var_1161_begin_0 = const()[name = string("op_1161_begin_0"), val = tensor<int32, [4]>([0, 7, 0, 0])];
+            tensor<int32, [4]> var_1161_end_0 = const()[name = string("op_1161_end_0"), val = tensor<int32, [4]>([1, 1, 128, 16])];
+            tensor<bool, [4]> var_1161_end_mask_0 = const()[name = string("op_1161_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_1161_cast_fp16 = slice_by_index(begin = var_1161_begin_0, end = var_1161_end_0, end_mask = var_1161_end_mask_0, x = key_heads_9_cast_fp16)[name = string("op_1161_cast_fp16")];
+            tensor<int32, [4]> var_1165_begin_0 = const()[name = string("op_1165_begin_0"), val = tensor<int32, [4]>([0, 7, 0, 0])];
+            tensor<int32, [4]> var_1165_end_0 = const()[name = string("op_1165_end_0"), val = tensor<int32, [4]>([1, 1, 128, 16])];
+            tensor<bool, [4]> var_1165_end_mask_0 = const()[name = string("op_1165_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_1165_cast_fp16 = slice_by_index(begin = var_1165_begin_0, end = var_1165_end_0, end_mask = var_1165_end_mask_0, x = value_heads_9_cast_fp16)[name = string("op_1165_cast_fp16")];
+            bool key_heads_11_interleave_0 = const()[name = string("key_heads_11_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 16]> key_heads_11_cast_fp16 = concat(axis = var_891, interleave = key_heads_11_interleave_0, values = (var_1049_cast_fp16, var_1049_cast_fp16, var_1065_cast_fp16, var_1065_cast_fp16, var_1081_cast_fp16, var_1081_cast_fp16, var_1097_cast_fp16, var_1097_cast_fp16, var_1113_cast_fp16, var_1113_cast_fp16, var_1129_cast_fp16, var_1129_cast_fp16, var_1145_cast_fp16, var_1145_cast_fp16, var_1161_cast_fp16, var_1161_cast_fp16))[name = string("key_heads_11_cast_fp16")];
+            bool value_heads_11_interleave_0 = const()[name = string("value_heads_11_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 16]> value_heads_11_cast_fp16 = concat(axis = var_891, interleave = value_heads_11_interleave_0, values = (var_1053_cast_fp16, var_1053_cast_fp16, var_1069_cast_fp16, var_1069_cast_fp16, var_1085_cast_fp16, var_1085_cast_fp16, var_1101_cast_fp16, var_1101_cast_fp16, var_1117_cast_fp16, var_1117_cast_fp16, var_1133_cast_fp16, var_1133_cast_fp16, var_1149_cast_fp16, var_1149_cast_fp16, var_1165_cast_fp16, var_1165_cast_fp16))[name = string("value_heads_11_cast_fp16")];
+            fp16 var_1188_to_fp16 = const()[name = string("op_1188_to_fp16"), val = fp16(0x1.6ap-4)];
+            tensor<fp16, [1, 16, 128, 1]> var_1189_cast_fp16 = mul(x = mh_q_15_cast_fp16, y = var_1188_to_fp16)[name = string("op_1189_cast_fp16")];
+            bool mh_w_9_transpose_x_0 = const()[name = string("mh_w_9_transpose_x_0"), val = bool(true)];
+            bool mh_w_9_transpose_y_0 = const()[name = string("mh_w_9_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 1, 16]> mh_w_9_cast_fp16 = matmul(transpose_x = mh_w_9_transpose_x_0, transpose_y = mh_w_9_transpose_y_0, x = var_1189_cast_fp16, y = key_heads_11_cast_fp16)[name = string("mh_w_9_cast_fp16")];
+            tensor<fp16, [1, 16, 1, 16]> mh_w_11_cast_fp16 = add(x = mh_w_9_cast_fp16, y = var_424_cast_fp16)[name = string("mh_w_11_cast_fp16")];
+            tensor<fp16, [1, 16, 1, 16]> var_1201_cast_fp16 = softmax(axis = var_873, x = mh_w_11_cast_fp16)[name = string("op_1201_cast_fp16")];
+            bool attn_5_transpose_x_0 = const()[name = string("attn_5_transpose_x_0"), val = bool(false)];
+            bool attn_5_transpose_y_0 = const()[name = string("attn_5_transpose_y_0"), val = bool(true)];
+            tensor<fp16, [1, 16, 128, 1]> attn_5_cast_fp16 = matmul(transpose_x = attn_5_transpose_x_0, transpose_y = attn_5_transpose_y_0, x = value_heads_11_cast_fp16, y = var_1201_cast_fp16)[name = string("attn_5_cast_fp16")];
+            tensor<int32, [4]> var_1206 = const()[name = string("op_1206"), val = tensor<int32, [4]>([1, -1, 1, 1])];
+            tensor<fp16, [1, 2048, 1, 1]> input_17_cast_fp16 = reshape(shape = var_1206, x = attn_5_cast_fp16)[name = string("input_17_cast_fp16")];
+            string obj_27_pad_type_0 = const()[name = string("obj_27_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> obj_27_strides_0 = const()[name = string("obj_27_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> obj_27_pad_0 = const()[name = string("obj_27_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> obj_27_dilations_0 = const()[name = string("obj_27_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 obj_27_groups_0 = const()[name = string("obj_27_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 2048, 1, 1]> layers_2_self_attn_o_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(35689600))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(37786816))))[name = string("layers_2_self_attn_o_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> obj_27_cast_fp16 = conv(bias = layers_0_self_attn_v_proj_bias_to_fp16, dilations = obj_27_dilations_0, groups = obj_27_groups_0, pad = obj_27_pad_0, pad_type = obj_27_pad_type_0, strides = obj_27_strides_0, weight = layers_2_self_attn_o_proj_weight_to_fp16_palettized, x = input_17_cast_fp16)[name = string("obj_27_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> inputs_21_cast_fp16 = add(x = inputs_15_cast_fp16, y = obj_27_cast_fp16)[name = string("inputs_21_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> inputs_sq_23_cast_fp16 = mul(x = inputs_21_cast_fp16, y = inputs_21_cast_fp16)[name = string("inputs_sq_23_cast_fp16")];
+            tensor<int32, [1]> variance_23_axes_0 = const()[name = string("variance_23_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_23_keep_dims_0 = const()[name = string("variance_23_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 1, 1, 1]> variance_23_cast_fp16 = reduce_mean(axes = variance_23_axes_0, keep_dims = variance_23_keep_dims_0, x = inputs_sq_23_cast_fp16)[name = string("variance_23_cast_fp16")];
+            fp16 var_1224_to_fp16 = const()[name = string("op_1224_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 1, 1]> var_1225_cast_fp16 = add(x = variance_23_cast_fp16, y = var_1224_to_fp16)[name = string("op_1225_cast_fp16")];
+            fp32 var_1226_epsilon_0 = const()[name = string("op_1226_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 1, 1, 1]> var_1226_cast_fp16 = rsqrt(epsilon = var_1226_epsilon_0, x = var_1225_cast_fp16)[name = string("op_1226_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> hidden_states_27_cast_fp16 = mul(x = inputs_21_cast_fp16, y = var_1226_cast_fp16)[name = string("hidden_states_27_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> w_23_to_fp16 = const()[name = string("w_23_to_fp16"), val = tensor<fp16, [1, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(37787392)))];
+            tensor<fp16, [1, 1024, 1, 1]> input_19_cast_fp16 = mul(x = w_23_to_fp16, y = hidden_states_27_cast_fp16)[name = string("input_19_cast_fp16")];
+            string input_21_pad_type_0 = const()[name = string("input_21_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> input_21_strides_0 = const()[name = string("input_21_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> input_21_pad_0 = const()[name = string("input_21_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> input_21_dilations_0 = const()[name = string("input_21_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 input_21_groups_0 = const()[name = string("input_21_groups_0"), val = int32(1)];
+            tensor<fp16, [3072, 1024, 1, 1]> layers_2_mlp_gate_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [3072, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(37789504))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(40935296))))[name = string("layers_2_mlp_gate_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 3072, 1, 1]> input_21_cast_fp16 = conv(dilations = input_21_dilations_0, groups = input_21_groups_0, pad = input_21_pad_0, pad_type = input_21_pad_type_0, strides = input_21_strides_0, weight = layers_2_mlp_gate_proj_weight_to_fp16_palettized, x = input_19_cast_fp16)[name = string("input_21_cast_fp16")];
+            tensor<fp16, [1, 3072, 1, 1]> var_1240_cast_fp16 = silu(x = input_21_cast_fp16)[name = string("op_1240_cast_fp16")];
+            string var_1246_pad_type_0 = const()[name = string("op_1246_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_1246_strides_0 = const()[name = string("op_1246_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_1246_pad_0 = const()[name = string("op_1246_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_1246_dilations_0 = const()[name = string("op_1246_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_1246_groups_0 = const()[name = string("op_1246_groups_0"), val = int32(1)];
+            tensor<fp16, [3072, 1024, 1, 1]> layers_2_mlp_up_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [3072, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(40935872))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(44081664))))[name = string("layers_2_mlp_up_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 3072, 1, 1]> var_1246_cast_fp16 = conv(dilations = var_1246_dilations_0, groups = var_1246_groups_0, pad = var_1246_pad_0, pad_type = var_1246_pad_type_0, strides = var_1246_strides_0, weight = layers_2_mlp_up_proj_weight_to_fp16_palettized, x = input_19_cast_fp16)[name = string("op_1246_cast_fp16")];
+            tensor<fp16, [1, 3072, 1, 1]> input_23_cast_fp16 = mul(x = var_1240_cast_fp16, y = var_1246_cast_fp16)[name = string("input_23_cast_fp16")];
+            string hidden_states_29_pad_type_0 = const()[name = string("hidden_states_29_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> hidden_states_29_strides_0 = const()[name = string("hidden_states_29_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> hidden_states_29_pad_0 = const()[name = string("hidden_states_29_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> hidden_states_29_dilations_0 = const()[name = string("hidden_states_29_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 hidden_states_29_groups_0 = const()[name = string("hidden_states_29_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 3072, 1, 1]> layers_2_mlp_down_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 3072, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(44082240))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(47228032))))[name = string("layers_2_mlp_down_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> hidden_states_29_cast_fp16 = conv(dilations = hidden_states_29_dilations_0, groups = hidden_states_29_groups_0, pad = hidden_states_29_pad_0, pad_type = hidden_states_29_pad_type_0, strides = hidden_states_29_strides_0, weight = layers_2_mlp_down_proj_weight_to_fp16_palettized, x = input_23_cast_fp16)[name = string("hidden_states_29_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> inputs_23_cast_fp16 = add(x = inputs_21_cast_fp16, y = hidden_states_29_cast_fp16)[name = string("inputs_23_cast_fp16")];
+            int32 var_1260 = const()[name = string("op_1260"), val = int32(3)];
+            int32 var_1270 = const()[name = string("op_1270"), val = int32(-2)];
+            int32 var_1278 = const()[name = string("op_1278"), val = int32(1)];
+            tensor<fp16, [1, 1024, 1, 1]> inputs_sq_25_cast_fp16 = mul(x = inputs_23_cast_fp16, y = inputs_23_cast_fp16)[name = string("inputs_sq_25_cast_fp16")];
+            tensor<int32, [1]> variance_25_axes_0 = const()[name = string("variance_25_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_25_keep_dims_0 = const()[name = string("variance_25_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 1, 1, 1]> variance_25_cast_fp16 = reduce_mean(axes = variance_25_axes_0, keep_dims = variance_25_keep_dims_0, x = inputs_sq_25_cast_fp16)[name = string("variance_25_cast_fp16")];
+            fp16 var_1290_to_fp16 = const()[name = string("op_1290_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 1, 1]> var_1291_cast_fp16 = add(x = variance_25_cast_fp16, y = var_1290_to_fp16)[name = string("op_1291_cast_fp16")];
+            fp32 var_1292_epsilon_0 = const()[name = string("op_1292_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 1, 1, 1]> var_1292_cast_fp16 = rsqrt(epsilon = var_1292_epsilon_0, x = var_1291_cast_fp16)[name = string("op_1292_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> hidden_states_31_cast_fp16 = mul(x = inputs_23_cast_fp16, y = var_1292_cast_fp16)[name = string("hidden_states_31_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> w_25_to_fp16 = const()[name = string("w_25_to_fp16"), val = tensor<fp16, [1, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(47228608)))];
+            tensor<fp16, [1, 1024, 1, 1]> obj_29_cast_fp16 = mul(x = w_25_to_fp16, y = hidden_states_31_cast_fp16)[name = string("obj_29_cast_fp16")];
+            string query_19_pad_type_0 = const()[name = string("query_19_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> query_19_strides_0 = const()[name = string("query_19_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> query_19_pad_0 = const()[name = string("query_19_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> query_19_dilations_0 = const()[name = string("query_19_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 query_19_groups_0 = const()[name = string("query_19_groups_0"), val = int32(1)];
+            tensor<fp16, [2048, 1024, 1, 1]> layers_3_self_attn_q_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [2048, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(47230720))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(49327936))))[name = string("layers_3_self_attn_q_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1, 1]> query_19_cast_fp16 = conv(bias = layers_0_self_attn_q_proj_bias_to_fp16, dilations = query_19_dilations_0, groups = query_19_groups_0, pad = query_19_pad_0, pad_type = query_19_pad_type_0, strides = query_19_strides_0, weight = layers_3_self_attn_q_proj_weight_to_fp16_palettized, x = obj_29_cast_fp16)[name = string("query_19_cast_fp16")];
+            string current_key_13_pad_type_0 = const()[name = string("current_key_13_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> current_key_13_strides_0 = const()[name = string("current_key_13_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> current_key_13_pad_0 = const()[name = string("current_key_13_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> current_key_13_dilations_0 = const()[name = string("current_key_13_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 current_key_13_groups_0 = const()[name = string("current_key_13_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_3_self_attn_k_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(49328512))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(50377152))))[name = string("layers_3_self_attn_k_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> current_key_13_cast_fp16 = conv(dilations = current_key_13_dilations_0, groups = current_key_13_groups_0, pad = current_key_13_pad_0, pad_type = current_key_13_pad_type_0, strides = current_key_13_strides_0, weight = layers_3_self_attn_k_proj_weight_to_fp16_palettized, x = obj_29_cast_fp16)[name = string("current_key_13_cast_fp16")];
+            string current_value_7_pad_type_0 = const()[name = string("current_value_7_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> current_value_7_strides_0 = const()[name = string("current_value_7_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> current_value_7_pad_0 = const()[name = string("current_value_7_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> current_value_7_dilations_0 = const()[name = string("current_value_7_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 current_value_7_groups_0 = const()[name = string("current_value_7_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_3_self_attn_v_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(50377728))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(51426368))))[name = string("layers_3_self_attn_v_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> current_value_7_cast_fp16 = conv(bias = layers_0_self_attn_v_proj_bias_to_fp16, dilations = current_value_7_dilations_0, groups = current_value_7_groups_0, pad = current_value_7_pad_0, pad_type = current_value_7_pad_type_0, strides = current_value_7_strides_0, weight = layers_3_self_attn_v_proj_weight_to_fp16_palettized, x = obj_29_cast_fp16)[name = string("current_value_7_cast_fp16")];
+            tensor<int32, [4]> var_1329 = const()[name = string("op_1329"), val = tensor<int32, [4]>([16, 128, 1, 1])];
+            tensor<fp16, [16, 128, 1, 1]> inputs_25_cast_fp16 = reshape(shape = var_1329, x = query_19_cast_fp16)[name = string("inputs_25_cast_fp16")];
+            tensor<fp16, [16, 128, 1, 1]> inputs_sq_27_cast_fp16 = mul(x = inputs_25_cast_fp16, y = inputs_25_cast_fp16)[name = string("inputs_sq_27_cast_fp16")];
+            tensor<int32, [1]> variance_27_axes_0 = const()[name = string("variance_27_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_27_keep_dims_0 = const()[name = string("variance_27_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [16, 1, 1, 1]> variance_27_cast_fp16 = reduce_mean(axes = variance_27_axes_0, keep_dims = variance_27_keep_dims_0, x = inputs_sq_27_cast_fp16)[name = string("variance_27_cast_fp16")];
+            fp16 var_1335_to_fp16 = const()[name = string("op_1335_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [16, 1, 1, 1]> var_1336_cast_fp16 = add(x = variance_27_cast_fp16, y = var_1335_to_fp16)[name = string("op_1336_cast_fp16")];
+            fp32 var_1337_epsilon_0 = const()[name = string("op_1337_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [16, 1, 1, 1]> var_1337_cast_fp16 = rsqrt(epsilon = var_1337_epsilon_0, x = var_1336_cast_fp16)[name = string("op_1337_cast_fp16")];
+            tensor<fp16, [16, 128, 1, 1]> hidden_states_33_cast_fp16 = mul(x = inputs_25_cast_fp16, y = var_1337_cast_fp16)[name = string("hidden_states_33_cast_fp16")];
+            tensor<fp16, [1, 128, 1, 1]> w_27_to_fp16 = const()[name = string("w_27_to_fp16"), val = tensor<fp16, [1, 128, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(51426944)))];
+            tensor<fp16, [16, 128, 1, 1]> query_normed_7_cast_fp16 = mul(x = w_27_to_fp16, y = hidden_states_33_cast_fp16)[name = string("query_normed_7_cast_fp16")];
+            tensor<int32, [4]> var_1345 = const()[name = string("op_1345"), val = tensor<int32, [4]>([8, 128, 1, 1])];
+            tensor<fp16, [8, 128, 1, 1]> inputs_27_cast_fp16 = reshape(shape = var_1345, x = current_key_13_cast_fp16)[name = string("inputs_27_cast_fp16")];
+            tensor<fp16, [8, 128, 1, 1]> inputs_sq_29_cast_fp16 = mul(x = inputs_27_cast_fp16, y = inputs_27_cast_fp16)[name = string("inputs_sq_29_cast_fp16")];
+            tensor<int32, [1]> variance_29_axes_0 = const()[name = string("variance_29_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_29_keep_dims_0 = const()[name = string("variance_29_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [8, 1, 1, 1]> variance_29_cast_fp16 = reduce_mean(axes = variance_29_axes_0, keep_dims = variance_29_keep_dims_0, x = inputs_sq_29_cast_fp16)[name = string("variance_29_cast_fp16")];
+            fp16 var_1351_to_fp16 = const()[name = string("op_1351_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [8, 1, 1, 1]> var_1352_cast_fp16 = add(x = variance_29_cast_fp16, y = var_1351_to_fp16)[name = string("op_1352_cast_fp16")];
+            fp32 var_1353_epsilon_0 = const()[name = string("op_1353_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [8, 1, 1, 1]> var_1353_cast_fp16 = rsqrt(epsilon = var_1353_epsilon_0, x = var_1352_cast_fp16)[name = string("op_1353_cast_fp16")];
+            tensor<fp16, [8, 128, 1, 1]> hidden_states_35_cast_fp16 = mul(x = inputs_27_cast_fp16, y = var_1353_cast_fp16)[name = string("hidden_states_35_cast_fp16")];
+            tensor<fp16, [1, 128, 1, 1]> w_29_to_fp16 = const()[name = string("w_29_to_fp16"), val = tensor<fp16, [1, 128, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(51427264)))];
+            tensor<fp16, [8, 128, 1, 1]> current_key_normed_7_cast_fp16 = mul(x = w_29_to_fp16, y = hidden_states_35_cast_fp16)[name = string("current_key_normed_7_cast_fp16")];
+            tensor<int32, [4]> var_1371 = const()[name = string("op_1371"), val = tensor<int32, [4]>([1, 16, 128, -1])];
+            tensor<fp16, [1, 16, 128, 1]> mh_q_19_cast_fp16 = reshape(shape = var_1371, x = query_normed_7_cast_fp16)[name = string("mh_q_19_cast_fp16")];
+            tensor<int32, [4]> var_1373 = const()[name = string("op_1373"), val = tensor<int32, [4]>([1, 8, 128, -1])];
+            tensor<fp16, [1, 8, 128, 1]> mh_k_13_cast_fp16 = reshape(shape = var_1373, x = current_key_normed_7_cast_fp16)[name = string("mh_k_13_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> var_1377_cast_fp16 = mul(x = mh_q_19_cast_fp16, y = cos_1_cast_fp16)[name = string("op_1377_cast_fp16")];
+            tensor<int32, [4]> var_1382_begin_0 = const()[name = string("op_1382_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_1382_end_0 = const()[name = string("op_1382_end_0"), val = tensor<int32, [4]>([1, 16, 64, 1])];
+            tensor<bool, [4]> var_1382_end_mask_0 = const()[name = string("op_1382_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 16, 64, 1]> var_1382_cast_fp16 = slice_by_index(begin = var_1382_begin_0, end = var_1382_end_0, end_mask = var_1382_end_mask_0, x = mh_q_19_cast_fp16)[name = string("op_1382_cast_fp16")];
+            tensor<int32, [4]> var_1388_begin_0 = const()[name = string("op_1388_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_1388_end_0 = const()[name = string("op_1388_end_0"), val = tensor<int32, [4]>([1, 16, 128, 1])];
+            tensor<bool, [4]> var_1388_end_mask_0 = const()[name = string("op_1388_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 16, 64, 1]> var_1388_cast_fp16 = slice_by_index(begin = var_1388_begin_0, end = var_1388_end_0, end_mask = var_1388_end_mask_0, x = mh_q_19_cast_fp16)[name = string("op_1388_cast_fp16")];
+            fp16 const_86_promoted_to_fp16 = const()[name = string("const_86_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 16, 64, 1]> var_1390_cast_fp16 = mul(x = var_1388_cast_fp16, y = const_86_promoted_to_fp16)[name = string("op_1390_cast_fp16")];
+            bool var_1392_interleave_0 = const()[name = string("op_1392_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 1]> var_1392_cast_fp16 = concat(axis = var_1270, interleave = var_1392_interleave_0, values = (var_1390_cast_fp16, var_1382_cast_fp16))[name = string("op_1392_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> var_1393_cast_fp16 = mul(x = var_1392_cast_fp16, y = sin_1_cast_fp16)[name = string("op_1393_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> mh_q_21_cast_fp16 = add(x = var_1377_cast_fp16, y = var_1393_cast_fp16)[name = string("mh_q_21_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> var_1395_cast_fp16 = mul(x = mh_k_13_cast_fp16, y = cos_1_cast_fp16)[name = string("op_1395_cast_fp16")];
+            tensor<int32, [4]> var_1400_begin_0 = const()[name = string("op_1400_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_1400_end_0 = const()[name = string("op_1400_end_0"), val = tensor<int32, [4]>([1, 8, 64, 1])];
+            tensor<bool, [4]> var_1400_end_mask_0 = const()[name = string("op_1400_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 8, 64, 1]> var_1400_cast_fp16 = slice_by_index(begin = var_1400_begin_0, end = var_1400_end_0, end_mask = var_1400_end_mask_0, x = mh_k_13_cast_fp16)[name = string("op_1400_cast_fp16")];
+            tensor<int32, [4]> var_1406_begin_0 = const()[name = string("op_1406_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_1406_end_0 = const()[name = string("op_1406_end_0"), val = tensor<int32, [4]>([1, 8, 128, 1])];
+            tensor<bool, [4]> var_1406_end_mask_0 = const()[name = string("op_1406_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 8, 64, 1]> var_1406_cast_fp16 = slice_by_index(begin = var_1406_begin_0, end = var_1406_end_0, end_mask = var_1406_end_mask_0, x = mh_k_13_cast_fp16)[name = string("op_1406_cast_fp16")];
+            fp16 const_89_promoted_to_fp16 = const()[name = string("const_89_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 64, 1]> var_1408_cast_fp16 = mul(x = var_1406_cast_fp16, y = const_89_promoted_to_fp16)[name = string("op_1408_cast_fp16")];
+            bool var_1410_interleave_0 = const()[name = string("op_1410_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 128, 1]> var_1410_cast_fp16 = concat(axis = var_1270, interleave = var_1410_interleave_0, values = (var_1408_cast_fp16, var_1400_cast_fp16))[name = string("op_1410_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> var_1411_cast_fp16 = mul(x = var_1410_cast_fp16, y = sin_1_cast_fp16)[name = string("op_1411_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> mh_k_15_cast_fp16 = add(x = var_1395_cast_fp16, y = var_1411_cast_fp16)[name = string("mh_k_15_cast_fp16")];
+            tensor<int32, [4]> var_1415 = const()[name = string("op_1415"), val = tensor<int32, [4]>([1, 1024, 1, 1])];
+            tensor<fp16, [1, 1024, 1, 1]> current_key_15_cast_fp16 = reshape(shape = var_1415, x = mh_k_15_cast_fp16)[name = string("current_key_15_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 16]> var_1422_cast_fp16 = mul(x = var_84_cast_fp16_3, y = var_260_cast_fp16)[name = string("op_1422_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 16]> var_1423_cast_fp16 = mul(x = current_key_15_cast_fp16, y = var_258_cast_fp16)[name = string("op_1423_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 16]> key_21_cast_fp16 = add(x = var_1422_cast_fp16, y = var_1423_cast_fp16)[name = string("key_21_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 16]> var_1426_cast_fp16 = mul(x = var_92_cast_fp16_3, y = var_260_cast_fp16)[name = string("op_1426_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 16]> var_1427_cast_fp16 = mul(x = current_value_7_cast_fp16, y = var_258_cast_fp16)[name = string("op_1427_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 16]> value_13_cast_fp16 = add(x = var_1426_cast_fp16, y = var_1427_cast_fp16)[name = string("value_13_cast_fp16")];
+            tensor<int32, [4]> var_1431 = const()[name = string("op_1431"), val = tensor<int32, [4]>([1, 8, 128, 16])];
+            tensor<fp16, [1, 8, 128, 16]> key_heads_13_cast_fp16 = reshape(shape = var_1431, x = key_21_cast_fp16)[name = string("key_heads_13_cast_fp16")];
+            tensor<int32, [4]> var_1433 = const()[name = string("op_1433"), val = tensor<int32, [4]>([1, 8, 128, 16])];
+            tensor<fp16, [1, 8, 128, 16]> value_heads_13_cast_fp16 = reshape(shape = var_1433, x = value_13_cast_fp16)[name = string("value_heads_13_cast_fp16")];
+            tensor<int32, [4]> var_1436_begin_0 = const()[name = string("op_1436_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_1436_end_0 = const()[name = string("op_1436_end_0"), val = tensor<int32, [4]>([1, 1, 128, 16])];
+            tensor<bool, [4]> var_1436_end_mask_0 = const()[name = string("op_1436_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_1436_cast_fp16 = slice_by_index(begin = var_1436_begin_0, end = var_1436_end_0, end_mask = var_1436_end_mask_0, x = key_heads_13_cast_fp16)[name = string("op_1436_cast_fp16")];
+            tensor<int32, [4]> var_1440_begin_0 = const()[name = string("op_1440_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_1440_end_0 = const()[name = string("op_1440_end_0"), val = tensor<int32, [4]>([1, 1, 128, 16])];
+            tensor<bool, [4]> var_1440_end_mask_0 = const()[name = string("op_1440_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_1440_cast_fp16 = slice_by_index(begin = var_1440_begin_0, end = var_1440_end_0, end_mask = var_1440_end_mask_0, x = value_heads_13_cast_fp16)[name = string("op_1440_cast_fp16")];
+            tensor<int32, [4]> var_1452_begin_0 = const()[name = string("op_1452_begin_0"), val = tensor<int32, [4]>([0, 1, 0, 0])];
+            tensor<int32, [4]> var_1452_end_0 = const()[name = string("op_1452_end_0"), val = tensor<int32, [4]>([1, 2, 128, 16])];
+            tensor<bool, [4]> var_1452_end_mask_0 = const()[name = string("op_1452_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_1452_cast_fp16 = slice_by_index(begin = var_1452_begin_0, end = var_1452_end_0, end_mask = var_1452_end_mask_0, x = key_heads_13_cast_fp16)[name = string("op_1452_cast_fp16")];
+            tensor<int32, [4]> var_1456_begin_0 = const()[name = string("op_1456_begin_0"), val = tensor<int32, [4]>([0, 1, 0, 0])];
+            tensor<int32, [4]> var_1456_end_0 = const()[name = string("op_1456_end_0"), val = tensor<int32, [4]>([1, 2, 128, 16])];
+            tensor<bool, [4]> var_1456_end_mask_0 = const()[name = string("op_1456_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_1456_cast_fp16 = slice_by_index(begin = var_1456_begin_0, end = var_1456_end_0, end_mask = var_1456_end_mask_0, x = value_heads_13_cast_fp16)[name = string("op_1456_cast_fp16")];
+            tensor<int32, [4]> var_1468_begin_0 = const()[name = string("op_1468_begin_0"), val = tensor<int32, [4]>([0, 2, 0, 0])];
+            tensor<int32, [4]> var_1468_end_0 = const()[name = string("op_1468_end_0"), val = tensor<int32, [4]>([1, 3, 128, 16])];
+            tensor<bool, [4]> var_1468_end_mask_0 = const()[name = string("op_1468_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_1468_cast_fp16 = slice_by_index(begin = var_1468_begin_0, end = var_1468_end_0, end_mask = var_1468_end_mask_0, x = key_heads_13_cast_fp16)[name = string("op_1468_cast_fp16")];
+            tensor<int32, [4]> var_1472_begin_0 = const()[name = string("op_1472_begin_0"), val = tensor<int32, [4]>([0, 2, 0, 0])];
+            tensor<int32, [4]> var_1472_end_0 = const()[name = string("op_1472_end_0"), val = tensor<int32, [4]>([1, 3, 128, 16])];
+            tensor<bool, [4]> var_1472_end_mask_0 = const()[name = string("op_1472_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_1472_cast_fp16 = slice_by_index(begin = var_1472_begin_0, end = var_1472_end_0, end_mask = var_1472_end_mask_0, x = value_heads_13_cast_fp16)[name = string("op_1472_cast_fp16")];
+            tensor<int32, [4]> var_1484_begin_0 = const()[name = string("op_1484_begin_0"), val = tensor<int32, [4]>([0, 3, 0, 0])];
+            tensor<int32, [4]> var_1484_end_0 = const()[name = string("op_1484_end_0"), val = tensor<int32, [4]>([1, 4, 128, 16])];
+            tensor<bool, [4]> var_1484_end_mask_0 = const()[name = string("op_1484_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_1484_cast_fp16 = slice_by_index(begin = var_1484_begin_0, end = var_1484_end_0, end_mask = var_1484_end_mask_0, x = key_heads_13_cast_fp16)[name = string("op_1484_cast_fp16")];
+            tensor<int32, [4]> var_1488_begin_0 = const()[name = string("op_1488_begin_0"), val = tensor<int32, [4]>([0, 3, 0, 0])];
+            tensor<int32, [4]> var_1488_end_0 = const()[name = string("op_1488_end_0"), val = tensor<int32, [4]>([1, 4, 128, 16])];
+            tensor<bool, [4]> var_1488_end_mask_0 = const()[name = string("op_1488_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_1488_cast_fp16 = slice_by_index(begin = var_1488_begin_0, end = var_1488_end_0, end_mask = var_1488_end_mask_0, x = value_heads_13_cast_fp16)[name = string("op_1488_cast_fp16")];
+            tensor<int32, [4]> var_1500_begin_0 = const()[name = string("op_1500_begin_0"), val = tensor<int32, [4]>([0, 4, 0, 0])];
+            tensor<int32, [4]> var_1500_end_0 = const()[name = string("op_1500_end_0"), val = tensor<int32, [4]>([1, 5, 128, 16])];
+            tensor<bool, [4]> var_1500_end_mask_0 = const()[name = string("op_1500_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_1500_cast_fp16 = slice_by_index(begin = var_1500_begin_0, end = var_1500_end_0, end_mask = var_1500_end_mask_0, x = key_heads_13_cast_fp16)[name = string("op_1500_cast_fp16")];
+            tensor<int32, [4]> var_1504_begin_0 = const()[name = string("op_1504_begin_0"), val = tensor<int32, [4]>([0, 4, 0, 0])];
+            tensor<int32, [4]> var_1504_end_0 = const()[name = string("op_1504_end_0"), val = tensor<int32, [4]>([1, 5, 128, 16])];
+            tensor<bool, [4]> var_1504_end_mask_0 = const()[name = string("op_1504_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_1504_cast_fp16 = slice_by_index(begin = var_1504_begin_0, end = var_1504_end_0, end_mask = var_1504_end_mask_0, x = value_heads_13_cast_fp16)[name = string("op_1504_cast_fp16")];
+            tensor<int32, [4]> var_1516_begin_0 = const()[name = string("op_1516_begin_0"), val = tensor<int32, [4]>([0, 5, 0, 0])];
+            tensor<int32, [4]> var_1516_end_0 = const()[name = string("op_1516_end_0"), val = tensor<int32, [4]>([1, 6, 128, 16])];
+            tensor<bool, [4]> var_1516_end_mask_0 = const()[name = string("op_1516_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_1516_cast_fp16 = slice_by_index(begin = var_1516_begin_0, end = var_1516_end_0, end_mask = var_1516_end_mask_0, x = key_heads_13_cast_fp16)[name = string("op_1516_cast_fp16")];
+            tensor<int32, [4]> var_1520_begin_0 = const()[name = string("op_1520_begin_0"), val = tensor<int32, [4]>([0, 5, 0, 0])];
+            tensor<int32, [4]> var_1520_end_0 = const()[name = string("op_1520_end_0"), val = tensor<int32, [4]>([1, 6, 128, 16])];
+            tensor<bool, [4]> var_1520_end_mask_0 = const()[name = string("op_1520_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_1520_cast_fp16 = slice_by_index(begin = var_1520_begin_0, end = var_1520_end_0, end_mask = var_1520_end_mask_0, x = value_heads_13_cast_fp16)[name = string("op_1520_cast_fp16")];
+            tensor<int32, [4]> var_1532_begin_0 = const()[name = string("op_1532_begin_0"), val = tensor<int32, [4]>([0, 6, 0, 0])];
+            tensor<int32, [4]> var_1532_end_0 = const()[name = string("op_1532_end_0"), val = tensor<int32, [4]>([1, 7, 128, 16])];
+            tensor<bool, [4]> var_1532_end_mask_0 = const()[name = string("op_1532_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_1532_cast_fp16 = slice_by_index(begin = var_1532_begin_0, end = var_1532_end_0, end_mask = var_1532_end_mask_0, x = key_heads_13_cast_fp16)[name = string("op_1532_cast_fp16")];
+            tensor<int32, [4]> var_1536_begin_0 = const()[name = string("op_1536_begin_0"), val = tensor<int32, [4]>([0, 6, 0, 0])];
+            tensor<int32, [4]> var_1536_end_0 = const()[name = string("op_1536_end_0"), val = tensor<int32, [4]>([1, 7, 128, 16])];
+            tensor<bool, [4]> var_1536_end_mask_0 = const()[name = string("op_1536_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_1536_cast_fp16 = slice_by_index(begin = var_1536_begin_0, end = var_1536_end_0, end_mask = var_1536_end_mask_0, x = value_heads_13_cast_fp16)[name = string("op_1536_cast_fp16")];
+            tensor<int32, [4]> var_1548_begin_0 = const()[name = string("op_1548_begin_0"), val = tensor<int32, [4]>([0, 7, 0, 0])];
+            tensor<int32, [4]> var_1548_end_0 = const()[name = string("op_1548_end_0"), val = tensor<int32, [4]>([1, 1, 128, 16])];
+            tensor<bool, [4]> var_1548_end_mask_0 = const()[name = string("op_1548_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_1548_cast_fp16 = slice_by_index(begin = var_1548_begin_0, end = var_1548_end_0, end_mask = var_1548_end_mask_0, x = key_heads_13_cast_fp16)[name = string("op_1548_cast_fp16")];
+            tensor<int32, [4]> var_1552_begin_0 = const()[name = string("op_1552_begin_0"), val = tensor<int32, [4]>([0, 7, 0, 0])];
+            tensor<int32, [4]> var_1552_end_0 = const()[name = string("op_1552_end_0"), val = tensor<int32, [4]>([1, 1, 128, 16])];
+            tensor<bool, [4]> var_1552_end_mask_0 = const()[name = string("op_1552_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_1552_cast_fp16 = slice_by_index(begin = var_1552_begin_0, end = var_1552_end_0, end_mask = var_1552_end_mask_0, x = value_heads_13_cast_fp16)[name = string("op_1552_cast_fp16")];
+            bool key_heads_15_interleave_0 = const()[name = string("key_heads_15_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 16]> key_heads_15_cast_fp16 = concat(axis = var_1278, interleave = key_heads_15_interleave_0, values = (var_1436_cast_fp16, var_1436_cast_fp16, var_1452_cast_fp16, var_1452_cast_fp16, var_1468_cast_fp16, var_1468_cast_fp16, var_1484_cast_fp16, var_1484_cast_fp16, var_1500_cast_fp16, var_1500_cast_fp16, var_1516_cast_fp16, var_1516_cast_fp16, var_1532_cast_fp16, var_1532_cast_fp16, var_1548_cast_fp16, var_1548_cast_fp16))[name = string("key_heads_15_cast_fp16")];
+            bool value_heads_15_interleave_0 = const()[name = string("value_heads_15_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 16]> value_heads_15_cast_fp16 = concat(axis = var_1278, interleave = value_heads_15_interleave_0, values = (var_1440_cast_fp16, var_1440_cast_fp16, var_1456_cast_fp16, var_1456_cast_fp16, var_1472_cast_fp16, var_1472_cast_fp16, var_1488_cast_fp16, var_1488_cast_fp16, var_1504_cast_fp16, var_1504_cast_fp16, var_1520_cast_fp16, var_1520_cast_fp16, var_1536_cast_fp16, var_1536_cast_fp16, var_1552_cast_fp16, var_1552_cast_fp16))[name = string("value_heads_15_cast_fp16")];
+            fp16 var_1575_to_fp16 = const()[name = string("op_1575_to_fp16"), val = fp16(0x1.6ap-4)];
+            tensor<fp16, [1, 16, 128, 1]> var_1576_cast_fp16 = mul(x = mh_q_21_cast_fp16, y = var_1575_to_fp16)[name = string("op_1576_cast_fp16")];
+            bool mh_w_13_transpose_x_0 = const()[name = string("mh_w_13_transpose_x_0"), val = bool(true)];
+            bool mh_w_13_transpose_y_0 = const()[name = string("mh_w_13_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 1, 16]> mh_w_13_cast_fp16 = matmul(transpose_x = mh_w_13_transpose_x_0, transpose_y = mh_w_13_transpose_y_0, x = var_1576_cast_fp16, y = key_heads_15_cast_fp16)[name = string("mh_w_13_cast_fp16")];
+            tensor<fp16, [1, 16, 1, 16]> mh_w_15_cast_fp16 = add(x = mh_w_13_cast_fp16, y = var_424_cast_fp16)[name = string("mh_w_15_cast_fp16")];
+            tensor<fp16, [1, 16, 1, 16]> var_1588_cast_fp16 = softmax(axis = var_1260, x = mh_w_15_cast_fp16)[name = string("op_1588_cast_fp16")];
+            bool attn_7_transpose_x_0 = const()[name = string("attn_7_transpose_x_0"), val = bool(false)];
+            bool attn_7_transpose_y_0 = const()[name = string("attn_7_transpose_y_0"), val = bool(true)];
+            tensor<fp16, [1, 16, 128, 1]> attn_7_cast_fp16 = matmul(transpose_x = attn_7_transpose_x_0, transpose_y = attn_7_transpose_y_0, x = value_heads_15_cast_fp16, y = var_1588_cast_fp16)[name = string("attn_7_cast_fp16")];
+            tensor<int32, [4]> var_1593 = const()[name = string("op_1593"), val = tensor<int32, [4]>([1, -1, 1, 1])];
+            tensor<fp16, [1, 2048, 1, 1]> input_25_cast_fp16 = reshape(shape = var_1593, x = attn_7_cast_fp16)[name = string("input_25_cast_fp16")];
+            string obj_35_pad_type_0 = const()[name = string("obj_35_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> obj_35_strides_0 = const()[name = string("obj_35_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> obj_35_pad_0 = const()[name = string("obj_35_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> obj_35_dilations_0 = const()[name = string("obj_35_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 obj_35_groups_0 = const()[name = string("obj_35_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 2048, 1, 1]> layers_3_self_attn_o_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(51427584))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(53524800))))[name = string("layers_3_self_attn_o_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> obj_35_cast_fp16 = conv(bias = layers_0_self_attn_v_proj_bias_to_fp16, dilations = obj_35_dilations_0, groups = obj_35_groups_0, pad = obj_35_pad_0, pad_type = obj_35_pad_type_0, strides = obj_35_strides_0, weight = layers_3_self_attn_o_proj_weight_to_fp16_palettized, x = input_25_cast_fp16)[name = string("obj_35_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> inputs_29_cast_fp16 = add(x = inputs_23_cast_fp16, y = obj_35_cast_fp16)[name = string("inputs_29_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> inputs_sq_31_cast_fp16 = mul(x = inputs_29_cast_fp16, y = inputs_29_cast_fp16)[name = string("inputs_sq_31_cast_fp16")];
+            tensor<int32, [1]> variance_31_axes_0 = const()[name = string("variance_31_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_31_keep_dims_0 = const()[name = string("variance_31_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 1, 1, 1]> variance_31_cast_fp16 = reduce_mean(axes = variance_31_axes_0, keep_dims = variance_31_keep_dims_0, x = inputs_sq_31_cast_fp16)[name = string("variance_31_cast_fp16")];
+            fp16 var_1611_to_fp16 = const()[name = string("op_1611_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 1, 1]> var_1612_cast_fp16 = add(x = variance_31_cast_fp16, y = var_1611_to_fp16)[name = string("op_1612_cast_fp16")];
+            fp32 var_1613_epsilon_0 = const()[name = string("op_1613_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 1, 1, 1]> var_1613_cast_fp16 = rsqrt(epsilon = var_1613_epsilon_0, x = var_1612_cast_fp16)[name = string("op_1613_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> hidden_states_37_cast_fp16 = mul(x = inputs_29_cast_fp16, y = var_1613_cast_fp16)[name = string("hidden_states_37_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> w_31_to_fp16 = const()[name = string("w_31_to_fp16"), val = tensor<fp16, [1, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(53525376)))];
+            tensor<fp16, [1, 1024, 1, 1]> input_27_cast_fp16 = mul(x = w_31_to_fp16, y = hidden_states_37_cast_fp16)[name = string("input_27_cast_fp16")];
+            string input_29_pad_type_0 = const()[name = string("input_29_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> input_29_strides_0 = const()[name = string("input_29_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> input_29_pad_0 = const()[name = string("input_29_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> input_29_dilations_0 = const()[name = string("input_29_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 input_29_groups_0 = const()[name = string("input_29_groups_0"), val = int32(1)];
+            tensor<fp16, [3072, 1024, 1, 1]> layers_3_mlp_gate_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [3072, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(53527488))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(56673280))))[name = string("layers_3_mlp_gate_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 3072, 1, 1]> input_29_cast_fp16 = conv(dilations = input_29_dilations_0, groups = input_29_groups_0, pad = input_29_pad_0, pad_type = input_29_pad_type_0, strides = input_29_strides_0, weight = layers_3_mlp_gate_proj_weight_to_fp16_palettized, x = input_27_cast_fp16)[name = string("input_29_cast_fp16")];
+            tensor<fp16, [1, 3072, 1, 1]> var_1627_cast_fp16 = silu(x = input_29_cast_fp16)[name = string("op_1627_cast_fp16")];
+            string var_1633_pad_type_0 = const()[name = string("op_1633_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_1633_strides_0 = const()[name = string("op_1633_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_1633_pad_0 = const()[name = string("op_1633_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_1633_dilations_0 = const()[name = string("op_1633_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_1633_groups_0 = const()[name = string("op_1633_groups_0"), val = int32(1)];
+            tensor<fp16, [3072, 1024, 1, 1]> layers_3_mlp_up_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [3072, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(56673856))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(59819648))))[name = string("layers_3_mlp_up_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 3072, 1, 1]> var_1633_cast_fp16 = conv(dilations = var_1633_dilations_0, groups = var_1633_groups_0, pad = var_1633_pad_0, pad_type = var_1633_pad_type_0, strides = var_1633_strides_0, weight = layers_3_mlp_up_proj_weight_to_fp16_palettized, x = input_27_cast_fp16)[name = string("op_1633_cast_fp16")];
+            tensor<fp16, [1, 3072, 1, 1]> input_31_cast_fp16 = mul(x = var_1627_cast_fp16, y = var_1633_cast_fp16)[name = string("input_31_cast_fp16")];
+            string hidden_states_39_pad_type_0 = const()[name = string("hidden_states_39_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> hidden_states_39_strides_0 = const()[name = string("hidden_states_39_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> hidden_states_39_pad_0 = const()[name = string("hidden_states_39_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> hidden_states_39_dilations_0 = const()[name = string("hidden_states_39_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 hidden_states_39_groups_0 = const()[name = string("hidden_states_39_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 3072, 1, 1]> layers_3_mlp_down_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 3072, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(59820224))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(62966016))))[name = string("layers_3_mlp_down_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> hidden_states_39_cast_fp16 = conv(dilations = hidden_states_39_dilations_0, groups = hidden_states_39_groups_0, pad = hidden_states_39_pad_0, pad_type = hidden_states_39_pad_type_0, strides = hidden_states_39_strides_0, weight = layers_3_mlp_down_proj_weight_to_fp16_palettized, x = input_31_cast_fp16)[name = string("hidden_states_39_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> inputs_31_cast_fp16 = add(x = inputs_29_cast_fp16, y = hidden_states_39_cast_fp16)[name = string("inputs_31_cast_fp16")];
+            int32 var_1647 = const()[name = string("op_1647"), val = int32(3)];
+            int32 var_1657 = const()[name = string("op_1657"), val = int32(-2)];
+            int32 var_1665 = const()[name = string("op_1665"), val = int32(1)];
+            tensor<fp16, [1, 1024, 1, 1]> inputs_sq_33_cast_fp16 = mul(x = inputs_31_cast_fp16, y = inputs_31_cast_fp16)[name = string("inputs_sq_33_cast_fp16")];
+            tensor<int32, [1]> variance_33_axes_0 = const()[name = string("variance_33_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_33_keep_dims_0 = const()[name = string("variance_33_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 1, 1, 1]> variance_33_cast_fp16 = reduce_mean(axes = variance_33_axes_0, keep_dims = variance_33_keep_dims_0, x = inputs_sq_33_cast_fp16)[name = string("variance_33_cast_fp16")];
+            fp16 var_1677_to_fp16 = const()[name = string("op_1677_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 1, 1]> var_1678_cast_fp16 = add(x = variance_33_cast_fp16, y = var_1677_to_fp16)[name = string("op_1678_cast_fp16")];
+            fp32 var_1679_epsilon_0 = const()[name = string("op_1679_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 1, 1, 1]> var_1679_cast_fp16 = rsqrt(epsilon = var_1679_epsilon_0, x = var_1678_cast_fp16)[name = string("op_1679_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> hidden_states_41_cast_fp16 = mul(x = inputs_31_cast_fp16, y = var_1679_cast_fp16)[name = string("hidden_states_41_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> w_33_to_fp16 = const()[name = string("w_33_to_fp16"), val = tensor<fp16, [1, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(62966592)))];
+            tensor<fp16, [1, 1024, 1, 1]> obj_37_cast_fp16 = mul(x = w_33_to_fp16, y = hidden_states_41_cast_fp16)[name = string("obj_37_cast_fp16")];
+            string query_25_pad_type_0 = const()[name = string("query_25_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> query_25_strides_0 = const()[name = string("query_25_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> query_25_pad_0 = const()[name = string("query_25_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> query_25_dilations_0 = const()[name = string("query_25_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 query_25_groups_0 = const()[name = string("query_25_groups_0"), val = int32(1)];
+            tensor<fp16, [2048, 1024, 1, 1]> layers_4_self_attn_q_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [2048, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(62968704))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(65065920))))[name = string("layers_4_self_attn_q_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1, 1]> query_25_cast_fp16 = conv(bias = layers_0_self_attn_q_proj_bias_to_fp16, dilations = query_25_dilations_0, groups = query_25_groups_0, pad = query_25_pad_0, pad_type = query_25_pad_type_0, strides = query_25_strides_0, weight = layers_4_self_attn_q_proj_weight_to_fp16_palettized, x = obj_37_cast_fp16)[name = string("query_25_cast_fp16")];
+            string current_key_17_pad_type_0 = const()[name = string("current_key_17_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> current_key_17_strides_0 = const()[name = string("current_key_17_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> current_key_17_pad_0 = const()[name = string("current_key_17_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> current_key_17_dilations_0 = const()[name = string("current_key_17_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 current_key_17_groups_0 = const()[name = string("current_key_17_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_4_self_attn_k_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(65066496))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(66115136))))[name = string("layers_4_self_attn_k_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> current_key_17_cast_fp16 = conv(dilations = current_key_17_dilations_0, groups = current_key_17_groups_0, pad = current_key_17_pad_0, pad_type = current_key_17_pad_type_0, strides = current_key_17_strides_0, weight = layers_4_self_attn_k_proj_weight_to_fp16_palettized, x = obj_37_cast_fp16)[name = string("current_key_17_cast_fp16")];
+            string current_value_pad_type_0 = const()[name = string("current_value_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> current_value_strides_0 = const()[name = string("current_value_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> current_value_pad_0 = const()[name = string("current_value_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> current_value_dilations_0 = const()[name = string("current_value_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 current_value_groups_0 = const()[name = string("current_value_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_4_self_attn_v_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(66115712))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(67164352))))[name = string("layers_4_self_attn_v_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> current_value_cast_fp16 = conv(bias = layers_0_self_attn_v_proj_bias_to_fp16, dilations = current_value_dilations_0, groups = current_value_groups_0, pad = current_value_pad_0, pad_type = current_value_pad_type_0, strides = current_value_strides_0, weight = layers_4_self_attn_v_proj_weight_to_fp16_palettized, x = obj_37_cast_fp16)[name = string("current_value_cast_fp16")];
+            tensor<int32, [4]> var_1716 = const()[name = string("op_1716"), val = tensor<int32, [4]>([16, 128, 1, 1])];
+            tensor<fp16, [16, 128, 1, 1]> inputs_33_cast_fp16 = reshape(shape = var_1716, x = query_25_cast_fp16)[name = string("inputs_33_cast_fp16")];
+            tensor<fp16, [16, 128, 1, 1]> inputs_sq_35_cast_fp16 = mul(x = inputs_33_cast_fp16, y = inputs_33_cast_fp16)[name = string("inputs_sq_35_cast_fp16")];
+            tensor<int32, [1]> variance_35_axes_0 = const()[name = string("variance_35_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_35_keep_dims_0 = const()[name = string("variance_35_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [16, 1, 1, 1]> variance_35_cast_fp16 = reduce_mean(axes = variance_35_axes_0, keep_dims = variance_35_keep_dims_0, x = inputs_sq_35_cast_fp16)[name = string("variance_35_cast_fp16")];
+            fp16 var_1722_to_fp16 = const()[name = string("op_1722_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [16, 1, 1, 1]> var_1723_cast_fp16 = add(x = variance_35_cast_fp16, y = var_1722_to_fp16)[name = string("op_1723_cast_fp16")];
+            fp32 var_1724_epsilon_0 = const()[name = string("op_1724_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [16, 1, 1, 1]> var_1724_cast_fp16 = rsqrt(epsilon = var_1724_epsilon_0, x = var_1723_cast_fp16)[name = string("op_1724_cast_fp16")];
+            tensor<fp16, [16, 128, 1, 1]> hidden_states_43_cast_fp16 = mul(x = inputs_33_cast_fp16, y = var_1724_cast_fp16)[name = string("hidden_states_43_cast_fp16")];
+            tensor<fp16, [1, 128, 1, 1]> w_35_to_fp16 = const()[name = string("w_35_to_fp16"), val = tensor<fp16, [1, 128, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(67164928)))];
+            tensor<fp16, [16, 128, 1, 1]> query_normed_cast_fp16 = mul(x = w_35_to_fp16, y = hidden_states_43_cast_fp16)[name = string("query_normed_cast_fp16")];
+            tensor<int32, [4]> var_1732 = const()[name = string("op_1732"), val = tensor<int32, [4]>([8, 128, 1, 1])];
+            tensor<fp16, [8, 128, 1, 1]> inputs_35_cast_fp16 = reshape(shape = var_1732, x = current_key_17_cast_fp16)[name = string("inputs_35_cast_fp16")];
+            tensor<fp16, [8, 128, 1, 1]> inputs_sq_37_cast_fp16 = mul(x = inputs_35_cast_fp16, y = inputs_35_cast_fp16)[name = string("inputs_sq_37_cast_fp16")];
+            tensor<int32, [1]> variance_37_axes_0 = const()[name = string("variance_37_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_37_keep_dims_0 = const()[name = string("variance_37_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [8, 1, 1, 1]> variance_37_cast_fp16 = reduce_mean(axes = variance_37_axes_0, keep_dims = variance_37_keep_dims_0, x = inputs_sq_37_cast_fp16)[name = string("variance_37_cast_fp16")];
+            fp16 var_1738_to_fp16 = const()[name = string("op_1738_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [8, 1, 1, 1]> var_1739_cast_fp16 = add(x = variance_37_cast_fp16, y = var_1738_to_fp16)[name = string("op_1739_cast_fp16")];
+            fp32 var_1740_epsilon_0 = const()[name = string("op_1740_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [8, 1, 1, 1]> var_1740_cast_fp16 = rsqrt(epsilon = var_1740_epsilon_0, x = var_1739_cast_fp16)[name = string("op_1740_cast_fp16")];
+            tensor<fp16, [8, 128, 1, 1]> hidden_states_45_cast_fp16 = mul(x = inputs_35_cast_fp16, y = var_1740_cast_fp16)[name = string("hidden_states_45_cast_fp16")];
+            tensor<fp16, [1, 128, 1, 1]> w_37_to_fp16 = const()[name = string("w_37_to_fp16"), val = tensor<fp16, [1, 128, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(67165248)))];
+            tensor<fp16, [8, 128, 1, 1]> current_key_normed_cast_fp16 = mul(x = w_37_to_fp16, y = hidden_states_45_cast_fp16)[name = string("current_key_normed_cast_fp16")];
+            tensor<int32, [4]> var_1758 = const()[name = string("op_1758"), val = tensor<int32, [4]>([1, 16, 128, -1])];
+            tensor<fp16, [1, 16, 128, 1]> mh_q_25_cast_fp16 = reshape(shape = var_1758, x = query_normed_cast_fp16)[name = string("mh_q_25_cast_fp16")];
+            tensor<int32, [4]> var_1760 = const()[name = string("op_1760"), val = tensor<int32, [4]>([1, 8, 128, -1])];
+            tensor<fp16, [1, 8, 128, 1]> mh_k_17_cast_fp16 = reshape(shape = var_1760, x = current_key_normed_cast_fp16)[name = string("mh_k_17_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> var_1764_cast_fp16 = mul(x = mh_q_25_cast_fp16, y = cos_1_cast_fp16)[name = string("op_1764_cast_fp16")];
+            tensor<int32, [4]> var_1769_begin_0 = const()[name = string("op_1769_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_1769_end_0 = const()[name = string("op_1769_end_0"), val = tensor<int32, [4]>([1, 16, 64, 1])];
+            tensor<bool, [4]> var_1769_end_mask_0 = const()[name = string("op_1769_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 16, 64, 1]> var_1769_cast_fp16 = slice_by_index(begin = var_1769_begin_0, end = var_1769_end_0, end_mask = var_1769_end_mask_0, x = mh_q_25_cast_fp16)[name = string("op_1769_cast_fp16")];
+            tensor<int32, [4]> var_1775_begin_0 = const()[name = string("op_1775_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_1775_end_0 = const()[name = string("op_1775_end_0"), val = tensor<int32, [4]>([1, 16, 128, 1])];
+            tensor<bool, [4]> var_1775_end_mask_0 = const()[name = string("op_1775_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 16, 64, 1]> var_1775_cast_fp16 = slice_by_index(begin = var_1775_begin_0, end = var_1775_end_0, end_mask = var_1775_end_mask_0, x = mh_q_25_cast_fp16)[name = string("op_1775_cast_fp16")];
+            fp16 const_109_promoted_to_fp16 = const()[name = string("const_109_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 16, 64, 1]> var_1777_cast_fp16 = mul(x = var_1775_cast_fp16, y = const_109_promoted_to_fp16)[name = string("op_1777_cast_fp16")];
+            bool var_1779_interleave_0 = const()[name = string("op_1779_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 1]> var_1779_cast_fp16 = concat(axis = var_1657, interleave = var_1779_interleave_0, values = (var_1777_cast_fp16, var_1769_cast_fp16))[name = string("op_1779_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> var_1780_cast_fp16 = mul(x = var_1779_cast_fp16, y = sin_1_cast_fp16)[name = string("op_1780_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> mh_q_27_cast_fp16 = add(x = var_1764_cast_fp16, y = var_1780_cast_fp16)[name = string("mh_q_27_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> var_1782_cast_fp16 = mul(x = mh_k_17_cast_fp16, y = cos_1_cast_fp16)[name = string("op_1782_cast_fp16")];
+            tensor<int32, [4]> var_1787_begin_0 = const()[name = string("op_1787_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_1787_end_0 = const()[name = string("op_1787_end_0"), val = tensor<int32, [4]>([1, 8, 64, 1])];
+            tensor<bool, [4]> var_1787_end_mask_0 = const()[name = string("op_1787_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 8, 64, 1]> var_1787_cast_fp16 = slice_by_index(begin = var_1787_begin_0, end = var_1787_end_0, end_mask = var_1787_end_mask_0, x = mh_k_17_cast_fp16)[name = string("op_1787_cast_fp16")];
+            tensor<int32, [4]> var_1793_begin_0 = const()[name = string("op_1793_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_1793_end_0 = const()[name = string("op_1793_end_0"), val = tensor<int32, [4]>([1, 8, 128, 1])];
+            tensor<bool, [4]> var_1793_end_mask_0 = const()[name = string("op_1793_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 8, 64, 1]> var_1793_cast_fp16 = slice_by_index(begin = var_1793_begin_0, end = var_1793_end_0, end_mask = var_1793_end_mask_0, x = mh_k_17_cast_fp16)[name = string("op_1793_cast_fp16")];
+            fp16 const_112_promoted_to_fp16 = const()[name = string("const_112_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 64, 1]> var_1795_cast_fp16 = mul(x = var_1793_cast_fp16, y = const_112_promoted_to_fp16)[name = string("op_1795_cast_fp16")];
+            bool var_1797_interleave_0 = const()[name = string("op_1797_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 128, 1]> var_1797_cast_fp16 = concat(axis = var_1657, interleave = var_1797_interleave_0, values = (var_1795_cast_fp16, var_1787_cast_fp16))[name = string("op_1797_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> var_1798_cast_fp16 = mul(x = var_1797_cast_fp16, y = sin_1_cast_fp16)[name = string("op_1798_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> mh_k_cast_fp16 = add(x = var_1782_cast_fp16, y = var_1798_cast_fp16)[name = string("mh_k_cast_fp16")];
+            tensor<int32, [4]> var_1802 = const()[name = string("op_1802"), val = tensor<int32, [4]>([1, 1024, 1, 1])];
+            tensor<fp16, [1, 1024, 1, 1]> current_key_cast_fp16 = reshape(shape = var_1802, x = mh_k_cast_fp16)[name = string("current_key_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 16]> var_1809_cast_fp16 = mul(x = var_84_cast_fp16_4, y = var_260_cast_fp16)[name = string("op_1809_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 16]> var_1810_cast_fp16 = mul(x = current_key_cast_fp16, y = var_258_cast_fp16)[name = string("op_1810_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 16]> key_27_cast_fp16 = add(x = var_1809_cast_fp16, y = var_1810_cast_fp16)[name = string("key_27_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 16]> var_1813_cast_fp16 = mul(x = var_92_cast_fp16_4, y = var_260_cast_fp16)[name = string("op_1813_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 16]> var_1814_cast_fp16 = mul(x = current_value_cast_fp16, y = var_258_cast_fp16)[name = string("op_1814_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 16]> value_17_cast_fp16 = add(x = var_1813_cast_fp16, y = var_1814_cast_fp16)[name = string("value_17_cast_fp16")];
+            tensor<int32, [4]> var_1818 = const()[name = string("op_1818"), val = tensor<int32, [4]>([1, 8, 128, 16])];
+            tensor<fp16, [1, 8, 128, 16]> key_heads_17_cast_fp16 = reshape(shape = var_1818, x = key_27_cast_fp16)[name = string("key_heads_17_cast_fp16")];
+            tensor<int32, [4]> var_1820 = const()[name = string("op_1820"), val = tensor<int32, [4]>([1, 8, 128, 16])];
+            tensor<fp16, [1, 8, 128, 16]> value_heads_17_cast_fp16 = reshape(shape = var_1820, x = value_17_cast_fp16)[name = string("value_heads_17_cast_fp16")];
+            tensor<int32, [4]> var_1823_begin_0 = const()[name = string("op_1823_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_1823_end_0 = const()[name = string("op_1823_end_0"), val = tensor<int32, [4]>([1, 1, 128, 16])];
+            tensor<bool, [4]> var_1823_end_mask_0 = const()[name = string("op_1823_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_1823_cast_fp16 = slice_by_index(begin = var_1823_begin_0, end = var_1823_end_0, end_mask = var_1823_end_mask_0, x = key_heads_17_cast_fp16)[name = string("op_1823_cast_fp16")];
+            tensor<int32, [4]> var_1827_begin_0 = const()[name = string("op_1827_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_1827_end_0 = const()[name = string("op_1827_end_0"), val = tensor<int32, [4]>([1, 1, 128, 16])];
+            tensor<bool, [4]> var_1827_end_mask_0 = const()[name = string("op_1827_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_1827_cast_fp16 = slice_by_index(begin = var_1827_begin_0, end = var_1827_end_0, end_mask = var_1827_end_mask_0, x = value_heads_17_cast_fp16)[name = string("op_1827_cast_fp16")];
+            tensor<int32, [4]> var_1839_begin_0 = const()[name = string("op_1839_begin_0"), val = tensor<int32, [4]>([0, 1, 0, 0])];
+            tensor<int32, [4]> var_1839_end_0 = const()[name = string("op_1839_end_0"), val = tensor<int32, [4]>([1, 2, 128, 16])];
+            tensor<bool, [4]> var_1839_end_mask_0 = const()[name = string("op_1839_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_1839_cast_fp16 = slice_by_index(begin = var_1839_begin_0, end = var_1839_end_0, end_mask = var_1839_end_mask_0, x = key_heads_17_cast_fp16)[name = string("op_1839_cast_fp16")];
+            tensor<int32, [4]> var_1843_begin_0 = const()[name = string("op_1843_begin_0"), val = tensor<int32, [4]>([0, 1, 0, 0])];
+            tensor<int32, [4]> var_1843_end_0 = const()[name = string("op_1843_end_0"), val = tensor<int32, [4]>([1, 2, 128, 16])];
+            tensor<bool, [4]> var_1843_end_mask_0 = const()[name = string("op_1843_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_1843_cast_fp16 = slice_by_index(begin = var_1843_begin_0, end = var_1843_end_0, end_mask = var_1843_end_mask_0, x = value_heads_17_cast_fp16)[name = string("op_1843_cast_fp16")];
+            tensor<int32, [4]> var_1855_begin_0 = const()[name = string("op_1855_begin_0"), val = tensor<int32, [4]>([0, 2, 0, 0])];
+            tensor<int32, [4]> var_1855_end_0 = const()[name = string("op_1855_end_0"), val = tensor<int32, [4]>([1, 3, 128, 16])];
+            tensor<bool, [4]> var_1855_end_mask_0 = const()[name = string("op_1855_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_1855_cast_fp16 = slice_by_index(begin = var_1855_begin_0, end = var_1855_end_0, end_mask = var_1855_end_mask_0, x = key_heads_17_cast_fp16)[name = string("op_1855_cast_fp16")];
+            tensor<int32, [4]> var_1859_begin_0 = const()[name = string("op_1859_begin_0"), val = tensor<int32, [4]>([0, 2, 0, 0])];
+            tensor<int32, [4]> var_1859_end_0 = const()[name = string("op_1859_end_0"), val = tensor<int32, [4]>([1, 3, 128, 16])];
+            tensor<bool, [4]> var_1859_end_mask_0 = const()[name = string("op_1859_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_1859_cast_fp16 = slice_by_index(begin = var_1859_begin_0, end = var_1859_end_0, end_mask = var_1859_end_mask_0, x = value_heads_17_cast_fp16)[name = string("op_1859_cast_fp16")];
+            tensor<int32, [4]> var_1871_begin_0 = const()[name = string("op_1871_begin_0"), val = tensor<int32, [4]>([0, 3, 0, 0])];
+            tensor<int32, [4]> var_1871_end_0 = const()[name = string("op_1871_end_0"), val = tensor<int32, [4]>([1, 4, 128, 16])];
+            tensor<bool, [4]> var_1871_end_mask_0 = const()[name = string("op_1871_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_1871_cast_fp16 = slice_by_index(begin = var_1871_begin_0, end = var_1871_end_0, end_mask = var_1871_end_mask_0, x = key_heads_17_cast_fp16)[name = string("op_1871_cast_fp16")];
+            tensor<int32, [4]> var_1875_begin_0 = const()[name = string("op_1875_begin_0"), val = tensor<int32, [4]>([0, 3, 0, 0])];
+            tensor<int32, [4]> var_1875_end_0 = const()[name = string("op_1875_end_0"), val = tensor<int32, [4]>([1, 4, 128, 16])];
+            tensor<bool, [4]> var_1875_end_mask_0 = const()[name = string("op_1875_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_1875_cast_fp16 = slice_by_index(begin = var_1875_begin_0, end = var_1875_end_0, end_mask = var_1875_end_mask_0, x = value_heads_17_cast_fp16)[name = string("op_1875_cast_fp16")];
+            tensor<int32, [4]> var_1887_begin_0 = const()[name = string("op_1887_begin_0"), val = tensor<int32, [4]>([0, 4, 0, 0])];
+            tensor<int32, [4]> var_1887_end_0 = const()[name = string("op_1887_end_0"), val = tensor<int32, [4]>([1, 5, 128, 16])];
+            tensor<bool, [4]> var_1887_end_mask_0 = const()[name = string("op_1887_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_1887_cast_fp16 = slice_by_index(begin = var_1887_begin_0, end = var_1887_end_0, end_mask = var_1887_end_mask_0, x = key_heads_17_cast_fp16)[name = string("op_1887_cast_fp16")];
+            tensor<int32, [4]> var_1891_begin_0 = const()[name = string("op_1891_begin_0"), val = tensor<int32, [4]>([0, 4, 0, 0])];
+            tensor<int32, [4]> var_1891_end_0 = const()[name = string("op_1891_end_0"), val = tensor<int32, [4]>([1, 5, 128, 16])];
+            tensor<bool, [4]> var_1891_end_mask_0 = const()[name = string("op_1891_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_1891_cast_fp16 = slice_by_index(begin = var_1891_begin_0, end = var_1891_end_0, end_mask = var_1891_end_mask_0, x = value_heads_17_cast_fp16)[name = string("op_1891_cast_fp16")];
+            tensor<int32, [4]> var_1903_begin_0 = const()[name = string("op_1903_begin_0"), val = tensor<int32, [4]>([0, 5, 0, 0])];
+            tensor<int32, [4]> var_1903_end_0 = const()[name = string("op_1903_end_0"), val = tensor<int32, [4]>([1, 6, 128, 16])];
+            tensor<bool, [4]> var_1903_end_mask_0 = const()[name = string("op_1903_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_1903_cast_fp16 = slice_by_index(begin = var_1903_begin_0, end = var_1903_end_0, end_mask = var_1903_end_mask_0, x = key_heads_17_cast_fp16)[name = string("op_1903_cast_fp16")];
+            tensor<int32, [4]> var_1907_begin_0 = const()[name = string("op_1907_begin_0"), val = tensor<int32, [4]>([0, 5, 0, 0])];
+            tensor<int32, [4]> var_1907_end_0 = const()[name = string("op_1907_end_0"), val = tensor<int32, [4]>([1, 6, 128, 16])];
+            tensor<bool, [4]> var_1907_end_mask_0 = const()[name = string("op_1907_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_1907_cast_fp16 = slice_by_index(begin = var_1907_begin_0, end = var_1907_end_0, end_mask = var_1907_end_mask_0, x = value_heads_17_cast_fp16)[name = string("op_1907_cast_fp16")];
+            tensor<int32, [4]> var_1919_begin_0 = const()[name = string("op_1919_begin_0"), val = tensor<int32, [4]>([0, 6, 0, 0])];
+            tensor<int32, [4]> var_1919_end_0 = const()[name = string("op_1919_end_0"), val = tensor<int32, [4]>([1, 7, 128, 16])];
+            tensor<bool, [4]> var_1919_end_mask_0 = const()[name = string("op_1919_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_1919_cast_fp16 = slice_by_index(begin = var_1919_begin_0, end = var_1919_end_0, end_mask = var_1919_end_mask_0, x = key_heads_17_cast_fp16)[name = string("op_1919_cast_fp16")];
+            tensor<int32, [4]> var_1923_begin_0 = const()[name = string("op_1923_begin_0"), val = tensor<int32, [4]>([0, 6, 0, 0])];
+            tensor<int32, [4]> var_1923_end_0 = const()[name = string("op_1923_end_0"), val = tensor<int32, [4]>([1, 7, 128, 16])];
+            tensor<bool, [4]> var_1923_end_mask_0 = const()[name = string("op_1923_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_1923_cast_fp16 = slice_by_index(begin = var_1923_begin_0, end = var_1923_end_0, end_mask = var_1923_end_mask_0, x = value_heads_17_cast_fp16)[name = string("op_1923_cast_fp16")];
+            tensor<int32, [4]> var_1935_begin_0 = const()[name = string("op_1935_begin_0"), val = tensor<int32, [4]>([0, 7, 0, 0])];
+            tensor<int32, [4]> var_1935_end_0 = const()[name = string("op_1935_end_0"), val = tensor<int32, [4]>([1, 1, 128, 16])];
+            tensor<bool, [4]> var_1935_end_mask_0 = const()[name = string("op_1935_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_1935_cast_fp16 = slice_by_index(begin = var_1935_begin_0, end = var_1935_end_0, end_mask = var_1935_end_mask_0, x = key_heads_17_cast_fp16)[name = string("op_1935_cast_fp16")];
+            tensor<int32, [4]> var_1939_begin_0 = const()[name = string("op_1939_begin_0"), val = tensor<int32, [4]>([0, 7, 0, 0])];
+            tensor<int32, [4]> var_1939_end_0 = const()[name = string("op_1939_end_0"), val = tensor<int32, [4]>([1, 1, 128, 16])];
+            tensor<bool, [4]> var_1939_end_mask_0 = const()[name = string("op_1939_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_1939_cast_fp16 = slice_by_index(begin = var_1939_begin_0, end = var_1939_end_0, end_mask = var_1939_end_mask_0, x = value_heads_17_cast_fp16)[name = string("op_1939_cast_fp16")];
+            bool key_heads_interleave_0 = const()[name = string("key_heads_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 16]> key_heads_cast_fp16 = concat(axis = var_1665, interleave = key_heads_interleave_0, values = (var_1823_cast_fp16, var_1823_cast_fp16, var_1839_cast_fp16, var_1839_cast_fp16, var_1855_cast_fp16, var_1855_cast_fp16, var_1871_cast_fp16, var_1871_cast_fp16, var_1887_cast_fp16, var_1887_cast_fp16, var_1903_cast_fp16, var_1903_cast_fp16, var_1919_cast_fp16, var_1919_cast_fp16, var_1935_cast_fp16, var_1935_cast_fp16))[name = string("key_heads_cast_fp16")];
+            bool value_heads_interleave_0 = const()[name = string("value_heads_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 16]> value_heads_cast_fp16 = concat(axis = var_1665, interleave = value_heads_interleave_0, values = (var_1827_cast_fp16, var_1827_cast_fp16, var_1843_cast_fp16, var_1843_cast_fp16, var_1859_cast_fp16, var_1859_cast_fp16, var_1875_cast_fp16, var_1875_cast_fp16, var_1891_cast_fp16, var_1891_cast_fp16, var_1907_cast_fp16, var_1907_cast_fp16, var_1923_cast_fp16, var_1923_cast_fp16, var_1939_cast_fp16, var_1939_cast_fp16))[name = string("value_heads_cast_fp16")];
+            fp16 var_1962_to_fp16 = const()[name = string("op_1962_to_fp16"), val = fp16(0x1.6ap-4)];
+            tensor<fp16, [1, 16, 128, 1]> var_1963_cast_fp16 = mul(x = mh_q_27_cast_fp16, y = var_1962_to_fp16)[name = string("op_1963_cast_fp16")];
+            bool mh_w_17_transpose_x_0 = const()[name = string("mh_w_17_transpose_x_0"), val = bool(true)];
+            bool mh_w_17_transpose_y_0 = const()[name = string("mh_w_17_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 1, 16]> mh_w_17_cast_fp16 = matmul(transpose_x = mh_w_17_transpose_x_0, transpose_y = mh_w_17_transpose_y_0, x = var_1963_cast_fp16, y = key_heads_cast_fp16)[name = string("mh_w_17_cast_fp16")];
+            tensor<fp16, [1, 16, 1, 16]> mh_w_cast_fp16 = add(x = mh_w_17_cast_fp16, y = var_424_cast_fp16)[name = string("mh_w_cast_fp16")];
+            tensor<fp16, [1, 16, 1, 16]> var_1975_cast_fp16 = softmax(axis = var_1647, x = mh_w_cast_fp16)[name = string("op_1975_cast_fp16")];
+            bool attn_transpose_x_0 = const()[name = string("attn_transpose_x_0"), val = bool(false)];
+            bool attn_transpose_y_0 = const()[name = string("attn_transpose_y_0"), val = bool(true)];
+            tensor<fp16, [1, 16, 128, 1]> attn_cast_fp16 = matmul(transpose_x = attn_transpose_x_0, transpose_y = attn_transpose_y_0, x = value_heads_cast_fp16, y = var_1975_cast_fp16)[name = string("attn_cast_fp16")];
+            tensor<int32, [4]> var_1980 = const()[name = string("op_1980"), val = tensor<int32, [4]>([1, -1, 1, 1])];
+            tensor<fp16, [1, 2048, 1, 1]> input_33_cast_fp16 = reshape(shape = var_1980, x = attn_cast_fp16)[name = string("input_33_cast_fp16")];
+            string obj_pad_type_0 = const()[name = string("obj_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> obj_strides_0 = const()[name = string("obj_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> obj_pad_0 = const()[name = string("obj_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> obj_dilations_0 = const()[name = string("obj_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 obj_groups_0 = const()[name = string("obj_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 2048, 1, 1]> layers_4_self_attn_o_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(67165568))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(69262784))))[name = string("layers_4_self_attn_o_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> obj_cast_fp16 = conv(bias = layers_0_self_attn_v_proj_bias_to_fp16, dilations = obj_dilations_0, groups = obj_groups_0, pad = obj_pad_0, pad_type = obj_pad_type_0, strides = obj_strides_0, weight = layers_4_self_attn_o_proj_weight_to_fp16_palettized, x = input_33_cast_fp16)[name = string("obj_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> inputs_37_cast_fp16 = add(x = inputs_31_cast_fp16, y = obj_cast_fp16)[name = string("inputs_37_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> inputs_sq_39_cast_fp16 = mul(x = inputs_37_cast_fp16, y = inputs_37_cast_fp16)[name = string("inputs_sq_39_cast_fp16")];
+            tensor<int32, [1]> variance_39_axes_0 = const()[name = string("variance_39_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_39_keep_dims_0 = const()[name = string("variance_39_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 1, 1, 1]> variance_39_cast_fp16 = reduce_mean(axes = variance_39_axes_0, keep_dims = variance_39_keep_dims_0, x = inputs_sq_39_cast_fp16)[name = string("variance_39_cast_fp16")];
+            fp16 var_1998_to_fp16 = const()[name = string("op_1998_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 1, 1]> var_1999_cast_fp16 = add(x = variance_39_cast_fp16, y = var_1998_to_fp16)[name = string("op_1999_cast_fp16")];
+            fp32 var_2000_epsilon_0 = const()[name = string("op_2000_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 1, 1, 1]> var_2000_cast_fp16 = rsqrt(epsilon = var_2000_epsilon_0, x = var_1999_cast_fp16)[name = string("op_2000_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> hidden_states_47_cast_fp16 = mul(x = inputs_37_cast_fp16, y = var_2000_cast_fp16)[name = string("hidden_states_47_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> w_39_to_fp16 = const()[name = string("w_39_to_fp16"), val = tensor<fp16, [1, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(69263360)))];
+            tensor<fp16, [1, 1024, 1, 1]> input_35_cast_fp16 = mul(x = w_39_to_fp16, y = hidden_states_47_cast_fp16)[name = string("input_35_cast_fp16")];
+            string input_37_pad_type_0 = const()[name = string("input_37_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> input_37_strides_0 = const()[name = string("input_37_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> input_37_pad_0 = const()[name = string("input_37_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> input_37_dilations_0 = const()[name = string("input_37_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 input_37_groups_0 = const()[name = string("input_37_groups_0"), val = int32(1)];
+            tensor<fp16, [3072, 1024, 1, 1]> layers_4_mlp_gate_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [3072, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(69265472))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(72411264))))[name = string("layers_4_mlp_gate_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 3072, 1, 1]> input_37_cast_fp16 = conv(dilations = input_37_dilations_0, groups = input_37_groups_0, pad = input_37_pad_0, pad_type = input_37_pad_type_0, strides = input_37_strides_0, weight = layers_4_mlp_gate_proj_weight_to_fp16_palettized, x = input_35_cast_fp16)[name = string("input_37_cast_fp16")];
+            tensor<fp16, [1, 3072, 1, 1]> var_2014_cast_fp16 = silu(x = input_37_cast_fp16)[name = string("op_2014_cast_fp16")];
+            string var_2020_pad_type_0 = const()[name = string("op_2020_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_2020_strides_0 = const()[name = string("op_2020_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_2020_pad_0 = const()[name = string("op_2020_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_2020_dilations_0 = const()[name = string("op_2020_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_2020_groups_0 = const()[name = string("op_2020_groups_0"), val = int32(1)];
+            tensor<fp16, [3072, 1024, 1, 1]> layers_4_mlp_up_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [3072, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(72411840))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(75557632))))[name = string("layers_4_mlp_up_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 3072, 1, 1]> var_2020_cast_fp16 = conv(dilations = var_2020_dilations_0, groups = var_2020_groups_0, pad = var_2020_pad_0, pad_type = var_2020_pad_type_0, strides = var_2020_strides_0, weight = layers_4_mlp_up_proj_weight_to_fp16_palettized, x = input_35_cast_fp16)[name = string("op_2020_cast_fp16")];
+            tensor<fp16, [1, 3072, 1, 1]> input_39_cast_fp16 = mul(x = var_2014_cast_fp16, y = var_2020_cast_fp16)[name = string("input_39_cast_fp16")];
+            string hidden_states_49_pad_type_0 = const()[name = string("hidden_states_49_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> hidden_states_49_strides_0 = const()[name = string("hidden_states_49_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> hidden_states_49_pad_0 = const()[name = string("hidden_states_49_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> hidden_states_49_dilations_0 = const()[name = string("hidden_states_49_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 hidden_states_49_groups_0 = const()[name = string("hidden_states_49_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 3072, 1, 1]> layers_4_mlp_down_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 3072, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(75558208))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(78704000))))[name = string("layers_4_mlp_down_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> hidden_states_49_cast_fp16 = conv(dilations = hidden_states_49_dilations_0, groups = hidden_states_49_groups_0, pad = hidden_states_49_pad_0, pad_type = hidden_states_49_pad_type_0, strides = hidden_states_49_strides_0, weight = layers_4_mlp_down_proj_weight_to_fp16_palettized, x = input_39_cast_fp16)[name = string("hidden_states_49_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> inputs_cast_fp16 = add(x = inputs_37_cast_fp16, y = hidden_states_49_cast_fp16)[name = string("inputs_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> inputs_sq_cast_fp16 = mul(x = inputs_cast_fp16, y = inputs_cast_fp16)[name = string("inputs_sq_cast_fp16")];
+            tensor<int32, [1]> variance_axes_0 = const()[name = string("variance_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_keep_dims_0 = const()[name = string("variance_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 1, 1, 1]> variance_cast_fp16 = reduce_mean(axes = variance_axes_0, keep_dims = variance_keep_dims_0, x = inputs_sq_cast_fp16)[name = string("variance_cast_fp16")];
+            fp16 var_2041_to_fp16 = const()[name = string("op_2041_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 1, 1]> var_2042_cast_fp16 = add(x = variance_cast_fp16, y = var_2041_to_fp16)[name = string("op_2042_cast_fp16")];
+            fp32 var_2043_epsilon_0 = const()[name = string("op_2043_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 1, 1, 1]> var_2043_cast_fp16 = rsqrt(epsilon = var_2043_epsilon_0, x = var_2042_cast_fp16)[name = string("op_2043_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> hidden_states_cast_fp16 = mul(x = inputs_cast_fp16, y = var_2043_cast_fp16)[name = string("hidden_states_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> w_to_fp16 = const()[name = string("w_to_fp16"), val = tensor<fp16, [1, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(78704576)))];
+            tensor<fp16, [1, 1024, 1, 1]> input_cast_fp16 = mul(x = w_to_fp16, y = hidden_states_cast_fp16)[name = string("input_cast_fp16")];
+            string logits_1_pad_type_0 = const()[name = string("logits_1_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> logits_1_strides_0 = const()[name = string("logits_1_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> logits_1_pad_0 = const()[name = string("logits_1_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> logits_1_dilations_0 = const()[name = string("logits_1_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 logits_1_groups_0 = const()[name = string("logits_1_groups_0"), val = int32(1)];
+            tensor<fp16, [2048, 1024, 1, 1]> lm_heads_0_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [2048, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(78706688))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(80803904))))[name = string("lm_heads_0_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1, 1]> logits_1_cast_fp16 = conv(dilations = logits_1_dilations_0, groups = logits_1_groups_0, pad = logits_1_pad_0, pad_type = logits_1_pad_type_0, strides = logits_1_strides_0, weight = lm_heads_0_weight_to_fp16_palettized, x = input_cast_fp16)[name = string("logits_1_cast_fp16")];
+            tensor<int32, [1]> var_2060_axes_0 = const()[name = string("op_2060_axes_0"), val = tensor<int32, [1]>([3])];
+            tensor<fp16, [1, 2048, 1]> var_2060_cast_fp16 = squeeze(axes = var_2060_axes_0, x = logits_1_cast_fp16)[name = string("op_2060_cast_fp16")];
+            string logits_3_pad_type_0 = const()[name = string("logits_3_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> logits_3_strides_0 = const()[name = string("logits_3_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> logits_3_pad_0 = const()[name = string("logits_3_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> logits_3_dilations_0 = const()[name = string("logits_3_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 logits_3_groups_0 = const()[name = string("logits_3_groups_0"), val = int32(1)];
+            tensor<fp16, [2048, 1024, 1, 1]> lm_heads_1_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [2048, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(80804480))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(82901696))))[name = string("lm_heads_1_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1, 1]> logits_3_cast_fp16 = conv(dilations = logits_3_dilations_0, groups = logits_3_groups_0, pad = logits_3_pad_0, pad_type = logits_3_pad_type_0, strides = logits_3_strides_0, weight = lm_heads_1_weight_to_fp16_palettized, x = input_cast_fp16)[name = string("logits_3_cast_fp16")];
+            tensor<int32, [1]> var_2076_axes_0 = const()[name = string("op_2076_axes_0"), val = tensor<int32, [1]>([3])];
+            tensor<fp16, [1, 2048, 1]> var_2076_cast_fp16 = squeeze(axes = var_2076_axes_0, x = logits_3_cast_fp16)[name = string("op_2076_cast_fp16")];
+            string logits_5_pad_type_0 = const()[name = string("logits_5_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> logits_5_strides_0 = const()[name = string("logits_5_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> logits_5_pad_0 = const()[name = string("logits_5_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> logits_5_dilations_0 = const()[name = string("logits_5_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 logits_5_groups_0 = const()[name = string("logits_5_groups_0"), val = int32(1)];
+            tensor<fp16, [2048, 1024, 1, 1]> lm_heads_2_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [2048, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(82902272))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(84999488))))[name = string("lm_heads_2_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1, 1]> logits_5_cast_fp16 = conv(dilations = logits_5_dilations_0, groups = logits_5_groups_0, pad = logits_5_pad_0, pad_type = logits_5_pad_type_0, strides = logits_5_strides_0, weight = lm_heads_2_weight_to_fp16_palettized, x = input_cast_fp16)[name = string("logits_5_cast_fp16")];
+            tensor<int32, [1]> var_2092_axes_0 = const()[name = string("op_2092_axes_0"), val = tensor<int32, [1]>([3])];
+            tensor<fp16, [1, 2048, 1]> var_2092_cast_fp16 = squeeze(axes = var_2092_axes_0, x = logits_5_cast_fp16)[name = string("op_2092_cast_fp16")];
+            string logits_7_pad_type_0 = const()[name = string("logits_7_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> logits_7_strides_0 = const()[name = string("logits_7_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> logits_7_pad_0 = const()[name = string("logits_7_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> logits_7_dilations_0 = const()[name = string("logits_7_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 logits_7_groups_0 = const()[name = string("logits_7_groups_0"), val = int32(1)];
+            tensor<fp16, [2048, 1024, 1, 1]> lm_heads_3_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [2048, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(85000064))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(87097280))))[name = string("lm_heads_3_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1, 1]> logits_7_cast_fp16 = conv(dilations = logits_7_dilations_0, groups = logits_7_groups_0, pad = logits_7_pad_0, pad_type = logits_7_pad_type_0, strides = logits_7_strides_0, weight = lm_heads_3_weight_to_fp16_palettized, x = input_cast_fp16)[name = string("logits_7_cast_fp16")];
+            tensor<int32, [1]> var_2108_axes_0 = const()[name = string("op_2108_axes_0"), val = tensor<int32, [1]>([3])];
+            tensor<fp16, [1, 2048, 1]> var_2108_cast_fp16 = squeeze(axes = var_2108_axes_0, x = logits_7_cast_fp16)[name = string("op_2108_cast_fp16")];
+            string logits_9_pad_type_0 = const()[name = string("logits_9_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> logits_9_strides_0 = const()[name = string("logits_9_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> logits_9_pad_0 = const()[name = string("logits_9_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> logits_9_dilations_0 = const()[name = string("logits_9_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 logits_9_groups_0 = const()[name = string("logits_9_groups_0"), val = int32(1)];
+            tensor<fp16, [2048, 1024, 1, 1]> lm_heads_4_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [2048, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(87097856))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(89195072))))[name = string("lm_heads_4_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1, 1]> logits_9_cast_fp16 = conv(dilations = logits_9_dilations_0, groups = logits_9_groups_0, pad = logits_9_pad_0, pad_type = logits_9_pad_type_0, strides = logits_9_strides_0, weight = lm_heads_4_weight_to_fp16_palettized, x = input_cast_fp16)[name = string("logits_9_cast_fp16")];
+            tensor<int32, [1]> var_2124_axes_0 = const()[name = string("op_2124_axes_0"), val = tensor<int32, [1]>([3])];
+            tensor<fp16, [1, 2048, 1]> var_2124_cast_fp16 = squeeze(axes = var_2124_axes_0, x = logits_9_cast_fp16)[name = string("op_2124_cast_fp16")];
+            string logits_11_pad_type_0 = const()[name = string("logits_11_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> logits_11_strides_0 = const()[name = string("logits_11_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> logits_11_pad_0 = const()[name = string("logits_11_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> logits_11_dilations_0 = const()[name = string("logits_11_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 logits_11_groups_0 = const()[name = string("logits_11_groups_0"), val = int32(1)];
+            tensor<fp16, [2048, 1024, 1, 1]> lm_heads_5_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [2048, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(89195648))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(91292864))))[name = string("lm_heads_5_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1, 1]> logits_11_cast_fp16 = conv(dilations = logits_11_dilations_0, groups = logits_11_groups_0, pad = logits_11_pad_0, pad_type = logits_11_pad_type_0, strides = logits_11_strides_0, weight = lm_heads_5_weight_to_fp16_palettized, x = input_cast_fp16)[name = string("logits_11_cast_fp16")];
+            tensor<int32, [1]> var_2140_axes_0 = const()[name = string("op_2140_axes_0"), val = tensor<int32, [1]>([3])];
+            tensor<fp16, [1, 2048, 1]> var_2140_cast_fp16 = squeeze(axes = var_2140_axes_0, x = logits_11_cast_fp16)[name = string("op_2140_cast_fp16")];
+            string logits_13_pad_type_0 = const()[name = string("logits_13_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> logits_13_strides_0 = const()[name = string("logits_13_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> logits_13_pad_0 = const()[name = string("logits_13_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> logits_13_dilations_0 = const()[name = string("logits_13_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 logits_13_groups_0 = const()[name = string("logits_13_groups_0"), val = int32(1)];
+            tensor<fp16, [2048, 1024, 1, 1]> lm_heads_6_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [2048, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(91293440))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(93390656))))[name = string("lm_heads_6_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1, 1]> logits_13_cast_fp16 = conv(dilations = logits_13_dilations_0, groups = logits_13_groups_0, pad = logits_13_pad_0, pad_type = logits_13_pad_type_0, strides = logits_13_strides_0, weight = lm_heads_6_weight_to_fp16_palettized, x = input_cast_fp16)[name = string("logits_13_cast_fp16")];
+            tensor<int32, [1]> var_2156_axes_0 = const()[name = string("op_2156_axes_0"), val = tensor<int32, [1]>([3])];
+            tensor<fp16, [1, 2048, 1]> var_2156_cast_fp16 = squeeze(axes = var_2156_axes_0, x = logits_13_cast_fp16)[name = string("op_2156_cast_fp16")];
+            string logits_15_pad_type_0 = const()[name = string("logits_15_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> logits_15_strides_0 = const()[name = string("logits_15_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> logits_15_pad_0 = const()[name = string("logits_15_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> logits_15_dilations_0 = const()[name = string("logits_15_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 logits_15_groups_0 = const()[name = string("logits_15_groups_0"), val = int32(1)];
+            tensor<fp16, [2048, 1024, 1, 1]> lm_heads_7_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [2048, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(93391232))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(95488448))))[name = string("lm_heads_7_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1, 1]> logits_15_cast_fp16 = conv(dilations = logits_15_dilations_0, groups = logits_15_groups_0, pad = logits_15_pad_0, pad_type = logits_15_pad_type_0, strides = logits_15_strides_0, weight = lm_heads_7_weight_to_fp16_palettized, x = input_cast_fp16)[name = string("logits_15_cast_fp16")];
+            tensor<int32, [1]> var_2172_axes_0 = const()[name = string("op_2172_axes_0"), val = tensor<int32, [1]>([3])];
+            tensor<fp16, [1, 2048, 1]> var_2172_cast_fp16 = squeeze(axes = var_2172_axes_0, x = logits_15_cast_fp16)[name = string("op_2172_cast_fp16")];
+            string logits_17_pad_type_0 = const()[name = string("logits_17_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> logits_17_strides_0 = const()[name = string("logits_17_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> logits_17_pad_0 = const()[name = string("logits_17_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> logits_17_dilations_0 = const()[name = string("logits_17_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 logits_17_groups_0 = const()[name = string("logits_17_groups_0"), val = int32(1)];
+            tensor<fp16, [2048, 1024, 1, 1]> lm_heads_8_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [2048, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(95489024))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(97586240))))[name = string("lm_heads_8_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1, 1]> logits_17_cast_fp16 = conv(dilations = logits_17_dilations_0, groups = logits_17_groups_0, pad = logits_17_pad_0, pad_type = logits_17_pad_type_0, strides = logits_17_strides_0, weight = lm_heads_8_weight_to_fp16_palettized, x = input_cast_fp16)[name = string("logits_17_cast_fp16")];
+            tensor<int32, [1]> var_2188_axes_0 = const()[name = string("op_2188_axes_0"), val = tensor<int32, [1]>([3])];
+            tensor<fp16, [1, 2048, 1]> var_2188_cast_fp16 = squeeze(axes = var_2188_axes_0, x = logits_17_cast_fp16)[name = string("op_2188_cast_fp16")];
+            string logits_19_pad_type_0 = const()[name = string("logits_19_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> logits_19_strides_0 = const()[name = string("logits_19_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> logits_19_pad_0 = const()[name = string("logits_19_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> logits_19_dilations_0 = const()[name = string("logits_19_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 logits_19_groups_0 = const()[name = string("logits_19_groups_0"), val = int32(1)];
+            tensor<fp16, [2048, 1024, 1, 1]> lm_heads_9_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [2048, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(97586816))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(99684032))))[name = string("lm_heads_9_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1, 1]> logits_19_cast_fp16 = conv(dilations = logits_19_dilations_0, groups = logits_19_groups_0, pad = logits_19_pad_0, pad_type = logits_19_pad_type_0, strides = logits_19_strides_0, weight = lm_heads_9_weight_to_fp16_palettized, x = input_cast_fp16)[name = string("logits_19_cast_fp16")];
+            tensor<int32, [1]> var_2204_axes_0 = const()[name = string("op_2204_axes_0"), val = tensor<int32, [1]>([3])];
+            tensor<fp16, [1, 2048, 1]> var_2204_cast_fp16 = squeeze(axes = var_2204_axes_0, x = logits_19_cast_fp16)[name = string("op_2204_cast_fp16")];
+            string logits_21_pad_type_0 = const()[name = string("logits_21_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> logits_21_strides_0 = const()[name = string("logits_21_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> logits_21_pad_0 = const()[name = string("logits_21_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> logits_21_dilations_0 = const()[name = string("logits_21_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 logits_21_groups_0 = const()[name = string("logits_21_groups_0"), val = int32(1)];
+            tensor<fp16, [2048, 1024, 1, 1]> lm_heads_10_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [2048, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(99684608))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(101781824))))[name = string("lm_heads_10_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1, 1]> logits_21_cast_fp16 = conv(dilations = logits_21_dilations_0, groups = logits_21_groups_0, pad = logits_21_pad_0, pad_type = logits_21_pad_type_0, strides = logits_21_strides_0, weight = lm_heads_10_weight_to_fp16_palettized, x = input_cast_fp16)[name = string("logits_21_cast_fp16")];
+            tensor<int32, [1]> var_2220_axes_0 = const()[name = string("op_2220_axes_0"), val = tensor<int32, [1]>([3])];
+            tensor<fp16, [1, 2048, 1]> var_2220_cast_fp16 = squeeze(axes = var_2220_axes_0, x = logits_21_cast_fp16)[name = string("op_2220_cast_fp16")];
+            string logits_23_pad_type_0 = const()[name = string("logits_23_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> logits_23_strides_0 = const()[name = string("logits_23_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> logits_23_pad_0 = const()[name = string("logits_23_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> logits_23_dilations_0 = const()[name = string("logits_23_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 logits_23_groups_0 = const()[name = string("logits_23_groups_0"), val = int32(1)];
+            tensor<fp16, [2048, 1024, 1, 1]> lm_heads_11_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [2048, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(101782400))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(103879616))))[name = string("lm_heads_11_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1, 1]> logits_23_cast_fp16 = conv(dilations = logits_23_dilations_0, groups = logits_23_groups_0, pad = logits_23_pad_0, pad_type = logits_23_pad_type_0, strides = logits_23_strides_0, weight = lm_heads_11_weight_to_fp16_palettized, x = input_cast_fp16)[name = string("logits_23_cast_fp16")];
+            tensor<int32, [1]> var_2236_axes_0 = const()[name = string("op_2236_axes_0"), val = tensor<int32, [1]>([3])];
+            tensor<fp16, [1, 2048, 1]> var_2236_cast_fp16 = squeeze(axes = var_2236_axes_0, x = logits_23_cast_fp16)[name = string("op_2236_cast_fp16")];
+            string logits_25_pad_type_0 = const()[name = string("logits_25_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> logits_25_strides_0 = const()[name = string("logits_25_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> logits_25_pad_0 = const()[name = string("logits_25_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> logits_25_dilations_0 = const()[name = string("logits_25_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 logits_25_groups_0 = const()[name = string("logits_25_groups_0"), val = int32(1)];
+            tensor<fp16, [2048, 1024, 1, 1]> lm_heads_12_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [2048, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(103880192))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(105977408))))[name = string("lm_heads_12_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1, 1]> logits_25_cast_fp16 = conv(dilations = logits_25_dilations_0, groups = logits_25_groups_0, pad = logits_25_pad_0, pad_type = logits_25_pad_type_0, strides = logits_25_strides_0, weight = lm_heads_12_weight_to_fp16_palettized, x = input_cast_fp16)[name = string("logits_25_cast_fp16")];
+            tensor<int32, [1]> var_2252_axes_0 = const()[name = string("op_2252_axes_0"), val = tensor<int32, [1]>([3])];
+            tensor<fp16, [1, 2048, 1]> var_2252_cast_fp16 = squeeze(axes = var_2252_axes_0, x = logits_25_cast_fp16)[name = string("op_2252_cast_fp16")];
+            string logits_27_pad_type_0 = const()[name = string("logits_27_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> logits_27_strides_0 = const()[name = string("logits_27_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> logits_27_pad_0 = const()[name = string("logits_27_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> logits_27_dilations_0 = const()[name = string("logits_27_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 logits_27_groups_0 = const()[name = string("logits_27_groups_0"), val = int32(1)];
+            tensor<fp16, [2048, 1024, 1, 1]> lm_heads_13_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [2048, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(105977984))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(108075200))))[name = string("lm_heads_13_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1, 1]> logits_27_cast_fp16 = conv(dilations = logits_27_dilations_0, groups = logits_27_groups_0, pad = logits_27_pad_0, pad_type = logits_27_pad_type_0, strides = logits_27_strides_0, weight = lm_heads_13_weight_to_fp16_palettized, x = input_cast_fp16)[name = string("logits_27_cast_fp16")];
+            tensor<int32, [1]> var_2268_axes_0 = const()[name = string("op_2268_axes_0"), val = tensor<int32, [1]>([3])];
+            tensor<fp16, [1, 2048, 1]> var_2268_cast_fp16 = squeeze(axes = var_2268_axes_0, x = logits_27_cast_fp16)[name = string("op_2268_cast_fp16")];
+            string logits_29_pad_type_0 = const()[name = string("logits_29_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> logits_29_strides_0 = const()[name = string("logits_29_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> logits_29_pad_0 = const()[name = string("logits_29_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> logits_29_dilations_0 = const()[name = string("logits_29_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 logits_29_groups_0 = const()[name = string("logits_29_groups_0"), val = int32(1)];
+            tensor<fp16, [2048, 1024, 1, 1]> lm_heads_14_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [2048, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(108075776))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(110172992))))[name = string("lm_heads_14_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1, 1]> logits_29_cast_fp16 = conv(dilations = logits_29_dilations_0, groups = logits_29_groups_0, pad = logits_29_pad_0, pad_type = logits_29_pad_type_0, strides = logits_29_strides_0, weight = lm_heads_14_weight_to_fp16_palettized, x = input_cast_fp16)[name = string("logits_29_cast_fp16")];
+            tensor<int32, [1]> var_2284_axes_0 = const()[name = string("op_2284_axes_0"), val = tensor<int32, [1]>([3])];
+            tensor<fp16, [1, 2048, 1]> var_2284_cast_fp16 = squeeze(axes = var_2284_axes_0, x = logits_29_cast_fp16)[name = string("op_2284_cast_fp16")];
+            bool var_2290_interleave_0 = const()[name = string("op_2290_interleave_0"), val = bool(false)];
+            int32 const_119 = const()[name = string("const_119"), val = int32(2)];
+            tensor<fp16, [1, 2048, 15]> var_2290_cast_fp16 = concat(axis = const_119, interleave = var_2290_interleave_0, values = (var_2060_cast_fp16, var_2076_cast_fp16, var_2092_cast_fp16, var_2108_cast_fp16, var_2124_cast_fp16, var_2140_cast_fp16, var_2156_cast_fp16, var_2172_cast_fp16, var_2188_cast_fp16, var_2204_cast_fp16, var_2220_cast_fp16, var_2236_cast_fp16, var_2252_cast_fp16, var_2268_cast_fp16, var_2284_cast_fp16))[name = string("op_2290_cast_fp16")];
+            int32 var_2292 = const()[name = string("op_2292"), val = int32(1)];
+            bool var_2293_interleave_0 = const()[name = string("op_2293_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 5120, 1, 1]> key_cache_updates = concat(axis = var_2292, interleave = var_2293_interleave_0, values = (current_key_3_cast_fp16, current_key_7_cast_fp16, current_key_11_cast_fp16, current_key_15_cast_fp16, current_key_cast_fp16))[name = string("op_2293_cast_fp16")];
+            int32 var_2295 = const()[name = string("op_2295"), val = int32(1)];
+            bool var_2296_interleave_0 = const()[name = string("op_2296_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 5120, 1, 1]> value_cache_updates = concat(axis = var_2295, interleave = var_2296_interleave_0, values = (current_value_1_cast_fp16, current_value_3_cast_fp16, current_value_5_cast_fp16, current_value_7_cast_fp16, current_value_cast_fp16))[name = string("op_2296_cast_fp16")];
+            tensor<int32, [3]> transpose_0_perm_0 = const()[name = string("transpose_0_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<fp16, [1, 15, 2048]> all_logits = transpose(perm = transpose_0_perm_0, x = var_2290_cast_fp16)[name = string("transpose_0")];
+        } -> (all_logits, key_cache_updates, value_cache_updates);
+}
\ No newline at end of file
diff --git a/qwen3_tts/multi_code_decoder/12hz-0.6b-customvoice/W8A16/MultiCodeDecoder.mlmodelc/weights/weight.bin b/qwen3_tts/multi_code_decoder/12hz-0.6b-customvoice/W8A16/MultiCodeDecoder.mlmodelc/weights/weight.bin
new file mode 100644
index 0000000000000000000000000000000000000000..c15cd537971e2042c4db1edfed5332b7fb0a33d9
--- /dev/null
+++ b/qwen3_tts/multi_code_decoder/12hz-0.6b-customvoice/W8A16/MultiCodeDecoder.mlmodelc/weights/weight.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:469c3db16d54f85f771d5fbb6817f86c26a92c19e1545e0a452edaeda09f7a59
+size 110173568
diff --git a/qwen3_tts/multi_code_decoder/12hz-1.7b-customvoice/W8A16/MultiCodeDecoder.mlmodelc/analytics/coremldata.bin b/qwen3_tts/multi_code_decoder/12hz-1.7b-customvoice/W8A16/MultiCodeDecoder.mlmodelc/analytics/coremldata.bin
new file mode 100644
index 0000000000000000000000000000000000000000..7f421e909846a396ab82c06e5f979621d7398c3f
--- /dev/null
+++ b/qwen3_tts/multi_code_decoder/12hz-1.7b-customvoice/W8A16/MultiCodeDecoder.mlmodelc/analytics/coremldata.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:90d4090f7d667dc9e55ca0698f543928678e69a645d71fc343c52e7398724f65
+size 243
diff --git a/qwen3_tts/multi_code_decoder/12hz-1.7b-customvoice/W8A16/MultiCodeDecoder.mlmodelc/coremldata.bin b/qwen3_tts/multi_code_decoder/12hz-1.7b-customvoice/W8A16/MultiCodeDecoder.mlmodelc/coremldata.bin
new file mode 100644
index 0000000000000000000000000000000000000000..3d21d6852966d6df281e9ba4468d47d15884a653
--- /dev/null
+++ b/qwen3_tts/multi_code_decoder/12hz-1.7b-customvoice/W8A16/MultiCodeDecoder.mlmodelc/coremldata.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:91e4d1bd298329d7300432d70c3054ccc58f5f250cb500a44f0b7d4075720186
+size 611
diff --git a/qwen3_tts/multi_code_decoder/12hz-1.7b-customvoice/W8A16/MultiCodeDecoder.mlmodelc/metadata.json b/qwen3_tts/multi_code_decoder/12hz-1.7b-customvoice/W8A16/MultiCodeDecoder.mlmodelc/metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..e366d13d61d0cee81a23db322e0b4fb3873a8c24
--- /dev/null
+++ b/qwen3_tts/multi_code_decoder/12hz-1.7b-customvoice/W8A16/MultiCodeDecoder.mlmodelc/metadata.json
@@ -0,0 +1,151 @@
+[
+  {
+    "metadataOutputVersion" : "3.0",
+    "storagePrecision" : "Mixed (Float16, Palettized (8 bits), UInt8)",
+    "outputSchema" : [
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 15 × 2048)",
+        "shortDescription" : "",
+        "shape" : "[1, 15, 2048]",
+        "name" : "all_logits",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 5120 × 1 × 1)",
+        "shortDescription" : "",
+        "shape" : "[1, 5120, 1, 1]",
+        "name" : "key_cache_updates",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 5120 × 1 × 1)",
+        "shortDescription" : "",
+        "shape" : "[1, 5120, 1, 1]",
+        "name" : "value_cache_updates",
+        "type" : "MultiArray"
+      }
+    ],
+    "modelParameters" : [
+
+    ],
+    "specificationVersion" : 9,
+    "mlProgramOperationTypeHistogram" : {
+      "Ios18.expandDims" : 8,
+      "Ios18.softmax" : 5,
+      "Ios18.mul" : 123,
+      "Ios18.matmul" : 10,
+      "Ios18.rsqrt" : 21,
+      "Ios16.reduceMean" : 21,
+      "Split" : 2,
+      "Ios18.greaterEqual" : 2,
+      "Select" : 2,
+      "Ios18.gather" : 2,
+      "Ios18.add" : 58,
+      "Ios18.reshape" : 40,
+      "Ios18.constexprLutToDense" : 51,
+      "Ios18.conv" : 51,
+      "Ios18.concat" : 23,
+      "Ios18.cast" : 5,
+      "Ios18.sub" : 1,
+      "Ios18.silu" : 5,
+      "Ios18.transpose" : 1,
+      "Ios18.sliceByIndex" : 100,
+      "Ios18.squeeze" : 15
+    },
+    "computePrecision" : "Mixed (Float16, Float32, Int16, Int32, UInt16)",
+    "isUpdatable" : "0",
+    "stateSchema" : [
+
+    ],
+    "availability" : {
+      "macOS" : "15.0",
+      "tvOS" : "18.0",
+      "visionOS" : "2.0",
+      "watchOS" : "11.0",
+      "iOS" : "18.0",
+      "macCatalyst" : "18.0"
+    },
+    "modelType" : {
+      "name" : "MLModelType_mlProgram"
+    },
+    "userDefinedMetadata" : {
+      "com.github.apple.coremltools.conversion_date" : "2026-02-11",
+      "com.github.apple.coremltools.source" : "torch==2.8.0",
+      "com.github.apple.coremltools.version" : "9.0",
+      "com.github.apple.coremltools.source_dialect" : "TorchScript"
+    },
+    "inputSchema" : [
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 2048 × 1 × 1)",
+        "shortDescription" : "",
+        "shape" : "[1, 2048, 1, 1]",
+        "name" : "input_embeds",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Int32",
+        "formattedType" : "MultiArray (Int32 1)",
+        "shortDescription" : "",
+        "shape" : "[1]",
+        "name" : "cache_length",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 5120 × 1 × 16)",
+        "shortDescription" : "",
+        "shape" : "[1, 5120, 1, 16]",
+        "name" : "key_cache",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 5120 × 1 × 16)",
+        "shortDescription" : "",
+        "shape" : "[1, 5120, 1, 16]",
+        "name" : "value_cache",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 16)",
+        "shortDescription" : "",
+        "shape" : "[1, 16]",
+        "name" : "kv_cache_update_mask",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 16)",
+        "shortDescription" : "",
+        "shape" : "[1, 16]",
+        "name" : "key_padding_mask",
+        "type" : "MultiArray"
+      }
+    ],
+    "generatedClassName" : "MultiCodeDecoder_8_bit",
+    "method" : "predict"
+  }
+]
\ No newline at end of file
diff --git a/qwen3_tts/multi_code_decoder/12hz-1.7b-customvoice/W8A16/MultiCodeDecoder.mlmodelc/model.mil b/qwen3_tts/multi_code_decoder/12hz-1.7b-customvoice/W8A16/MultiCodeDecoder.mlmodelc/model.mil
new file mode 100644
index 0000000000000000000000000000000000000000..332e59eacb52140ca8f06b0bd950463809599b3c
--- /dev/null
+++ b/qwen3_tts/multi_code_decoder/12hz-1.7b-customvoice/W8A16/MultiCodeDecoder.mlmodelc/model.mil
@@ -0,0 +1,1377 @@
+program(1.3)
+[buildInfo = dict<string, string>({{"coremlc-component-MIL", "3510.2.1"}, {"coremlc-version", "3500.32.1"}})]
+{
+    func main<ios18>(tensor<int32, [1]> cache_length, tensor<fp16, [1, 2048, 1, 1]> input_embeds, tensor<fp16, [1, 5120, 1, 16]> key_cache, tensor<fp16, [1, 16]> key_padding_mask, tensor<fp16, [1, 16]> kv_cache_update_mask, tensor<fp16, [1, 5120, 1, 16]> value_cache) {
+            string inputs_1_pad_type_0 = const()[name = string("inputs_1_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> inputs_1_strides_0 = const()[name = string("inputs_1_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> inputs_1_pad_0 = const()[name = string("inputs_1_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> inputs_1_dilations_0 = const()[name = string("inputs_1_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 inputs_1_groups_0 = const()[name = string("inputs_1_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 2048, 1, 1]> input_projection_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(64))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(2097280))))[name = string("input_projection_weight_to_fp16_palettized")];
+            tensor<fp16, [1024]> input_projection_bias_to_fp16 = const()[name = string("input_projection_bias_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(2097856)))];
+            tensor<fp16, [1, 1024, 1, 1]> inputs_1_cast_fp16 = conv(bias = input_projection_bias_to_fp16, dilations = inputs_1_dilations_0, groups = inputs_1_groups_0, pad = inputs_1_pad_0, pad_type = inputs_1_pad_type_0, strides = inputs_1_strides_0, weight = input_projection_weight_to_fp16_palettized, x = input_embeds)[name = string("inputs_1_cast_fp16")];
+            int32 pos_cos_batch_dims_0 = const()[name = string("pos_cos_batch_dims_0"), val = int32(0)];
+            bool pos_cos_validate_indices_0 = const()[name = string("pos_cos_validate_indices_0"), val = bool(false)];
+            tensor<fp16, [16, 128]> position_embeddings_cos_weight_to_fp16 = const()[name = string("position_embeddings_cos_weight_to_fp16"), val = tensor<fp16, [16, 128]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(2099968)))];
+            string cache_length_to_int16_dtype_0 = const()[name = string("cache_length_to_int16_dtype_0"), val = string("int16")];
+            string cast_111_dtype_0 = const()[name = string("cast_111_dtype_0"), val = string("int32")];
+            int32 greater_equal_0_y_0 = const()[name = string("greater_equal_0_y_0"), val = int32(0)];
+            tensor<int16, [1]> cache_length_to_int16 = cast(dtype = cache_length_to_int16_dtype_0, x = cache_length)[name = string("cast_5")];
+            tensor<int32, [1]> cast_111 = cast(dtype = cast_111_dtype_0, x = cache_length_to_int16)[name = string("cast_4")];
+            tensor<bool, [1]> greater_equal_0 = greater_equal(x = cast_111, y = greater_equal_0_y_0)[name = string("greater_equal_0")];
+            int32 slice_by_index_0 = const()[name = string("slice_by_index_0"), val = int32(16)];
+            tensor<int32, [1]> add_0 = add(x = cast_111, y = slice_by_index_0)[name = string("add_0")];
+            tensor<int32, [1]> select_0 = select(a = cast_111, b = add_0, cond = greater_equal_0)[name = string("select_0")];
+            string select_0_to_int16_dtype_0 = const()[name = string("select_0_to_int16_dtype_0"), val = string("int16")];
+            string cast_0_dtype_0 = const()[name = string("cast_0_dtype_0"), val = string("int32")];
+            int32 greater_equal_0_y_0_1 = const()[name = string("greater_equal_0_y_0_1"), val = int32(0)];
+            tensor<int16, [1]> select_0_to_int16 = cast(dtype = select_0_to_int16_dtype_0, x = select_0)[name = string("cast_3")];
+            tensor<int32, [1]> cast_0 = cast(dtype = cast_0_dtype_0, x = select_0_to_int16)[name = string("cast_2")];
+            tensor<bool, [1]> greater_equal_0_1 = greater_equal(x = cast_0, y = greater_equal_0_y_0_1)[name = string("greater_equal_0_1")];
+            int32 slice_by_index_0_1 = const()[name = string("slice_by_index_0_1"), val = int32(16)];
+            tensor<int32, [1]> add_0_1 = add(x = cast_0, y = slice_by_index_0_1)[name = string("add_0_1")];
+            tensor<int32, [1]> select_0_1 = select(a = cast_0, b = add_0_1, cond = greater_equal_0_1)[name = string("select_0_1")];
+            int32 pos_cos_cast_fp16_cast_uint16_cast_uint16_axis_0 = const()[name = string("pos_cos_cast_fp16_cast_uint16_cast_uint16_axis_0"), val = int32(0)];
+            tensor<fp16, [1, 128]> pos_cos_cast_fp16_cast_uint16_cast_uint16 = gather(axis = pos_cos_cast_fp16_cast_uint16_cast_uint16_axis_0, batch_dims = pos_cos_batch_dims_0, indices = select_0_1, validate_indices = pos_cos_validate_indices_0, x = position_embeddings_cos_weight_to_fp16)[name = string("pos_cos_cast_fp16_cast_uint16_cast_uint16")];
+            tensor<int32, [1]> obj_7_axes_0 = const()[name = string("obj_7_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 128, 1]> obj_7_cast_fp16 = expand_dims(axes = obj_7_axes_0, x = pos_cos_cast_fp16_cast_uint16_cast_uint16)[name = string("obj_7_cast_fp16")];
+            int32 pos_sin_axis_0 = const()[name = string("pos_sin_axis_0"), val = int32(0)];
+            int32 pos_sin_batch_dims_0 = const()[name = string("pos_sin_batch_dims_0"), val = int32(0)];
+            bool pos_sin_validate_indices_0 = const()[name = string("pos_sin_validate_indices_0"), val = bool(false)];
+            tensor<fp16, [16, 128]> position_embeddings_sin_weight_to_fp16 = const()[name = string("position_embeddings_sin_weight_to_fp16"), val = tensor<fp16, [16, 128]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(2104128)))];
+            string cache_length_to_uint16_dtype_0 = const()[name = string("cache_length_to_uint16_dtype_0"), val = string("uint16")];
+            tensor<uint16, [1]> cache_length_to_uint16 = cast(dtype = cache_length_to_uint16_dtype_0, x = cache_length)[name = string("cast_1")];
+            tensor<fp16, [1, 128]> pos_sin_cast_fp16_cast_uint16 = gather(axis = pos_sin_axis_0, batch_dims = pos_sin_batch_dims_0, indices = cache_length_to_uint16, validate_indices = pos_sin_validate_indices_0, x = position_embeddings_sin_weight_to_fp16)[name = string("pos_sin_cast_fp16_cast_uint16")];
+            tensor<int32, [1]> obj_9_axes_0 = const()[name = string("obj_9_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 128, 1]> obj_9_cast_fp16 = expand_dims(axes = obj_9_axes_0, x = pos_sin_cast_fp16_cast_uint16)[name = string("obj_9_cast_fp16")];
+            tensor<int32, [5]> tile_0 = const()[name = string("tile_0"), val = tensor<int32, [5]>([1024, 1024, 1024, 1024, 1024])];
+            int32 var_96_axis_0 = const()[name = string("op_96_axis_0"), val = int32(1)];
+            tensor<fp16, [1, 1024, 1, 16]> var_96_cast_fp16_0, tensor<fp16, [1, 1024, 1, 16]> var_96_cast_fp16_1, tensor<fp16, [1, 1024, 1, 16]> var_96_cast_fp16_2, tensor<fp16, [1, 1024, 1, 16]> var_96_cast_fp16_3, tensor<fp16, [1, 1024, 1, 16]> var_96_cast_fp16_4 = split(axis = var_96_axis_0, split_sizes = tile_0, x = key_cache)[name = string("op_96_cast_fp16")];
+            tensor<int32, [5]> tile_1 = const()[name = string("tile_1"), val = tensor<int32, [5]>([1024, 1024, 1024, 1024, 1024])];
+            int32 var_104_axis_0 = const()[name = string("op_104_axis_0"), val = int32(1)];
+            tensor<fp16, [1, 1024, 1, 16]> var_104_cast_fp16_0, tensor<fp16, [1, 1024, 1, 16]> var_104_cast_fp16_1, tensor<fp16, [1, 1024, 1, 16]> var_104_cast_fp16_2, tensor<fp16, [1, 1024, 1, 16]> var_104_cast_fp16_3, tensor<fp16, [1, 1024, 1, 16]> var_104_cast_fp16_4 = split(axis = var_104_axis_0, split_sizes = tile_1, x = value_cache)[name = string("op_104_cast_fp16")];
+            int32 var_111 = const()[name = string("op_111"), val = int32(3)];
+            int32 var_121 = const()[name = string("op_121"), val = int32(-2)];
+            int32 var_129 = const()[name = string("op_129"), val = int32(1)];
+            tensor<fp16, [1, 1024, 1, 1]> inputs_sq_1_cast_fp16 = mul(x = inputs_1_cast_fp16, y = inputs_1_cast_fp16)[name = string("inputs_sq_1_cast_fp16")];
+            tensor<int32, [1]> variance_1_axes_0 = const()[name = string("variance_1_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_1_keep_dims_0 = const()[name = string("variance_1_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 1, 1, 1]> variance_1_cast_fp16 = reduce_mean(axes = variance_1_axes_0, keep_dims = variance_1_keep_dims_0, x = inputs_sq_1_cast_fp16)[name = string("variance_1_cast_fp16")];
+            fp16 var_141_to_fp16 = const()[name = string("op_141_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 1, 1]> var_142_cast_fp16 = add(x = variance_1_cast_fp16, y = var_141_to_fp16)[name = string("op_142_cast_fp16")];
+            fp32 var_143_epsilon_0 = const()[name = string("op_143_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 1, 1, 1]> var_143_cast_fp16 = rsqrt(epsilon = var_143_epsilon_0, x = var_142_cast_fp16)[name = string("op_143_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> hidden_states_1_cast_fp16 = mul(x = inputs_1_cast_fp16, y = var_143_cast_fp16)[name = string("hidden_states_1_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> w_1_to_fp16 = const()[name = string("w_1_to_fp16"), val = tensor<fp16, [1, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(2108288)))];
+            tensor<fp16, [1, 1024, 1, 1]> obj_1_cast_fp16 = mul(x = w_1_to_fp16, y = hidden_states_1_cast_fp16)[name = string("obj_1_cast_fp16")];
+            string query_1_pad_type_0 = const()[name = string("query_1_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> query_1_strides_0 = const()[name = string("query_1_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> query_1_pad_0 = const()[name = string("query_1_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> query_1_dilations_0 = const()[name = string("query_1_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 query_1_groups_0 = const()[name = string("query_1_groups_0"), val = int32(1)];
+            tensor<fp16, [2048, 1024, 1, 1]> layers_0_self_attn_q_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [2048, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(2110400))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(4207616))))[name = string("layers_0_self_attn_q_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [2048]> layers_0_self_attn_q_proj_bias_to_fp16 = const()[name = string("layers_0_self_attn_q_proj_bias_to_fp16"), val = tensor<fp16, [2048]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(4208192)))];
+            tensor<fp16, [1, 2048, 1, 1]> query_1_cast_fp16 = conv(bias = layers_0_self_attn_q_proj_bias_to_fp16, dilations = query_1_dilations_0, groups = query_1_groups_0, pad = query_1_pad_0, pad_type = query_1_pad_type_0, strides = query_1_strides_0, weight = layers_0_self_attn_q_proj_weight_to_fp16_palettized, x = obj_1_cast_fp16)[name = string("query_1_cast_fp16")];
+            string current_key_1_pad_type_0 = const()[name = string("current_key_1_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> current_key_1_strides_0 = const()[name = string("current_key_1_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> current_key_1_pad_0 = const()[name = string("current_key_1_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> current_key_1_dilations_0 = const()[name = string("current_key_1_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 current_key_1_groups_0 = const()[name = string("current_key_1_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_0_self_attn_k_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(4212352))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(5260992))))[name = string("layers_0_self_attn_k_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> current_key_1_cast_fp16 = conv(dilations = current_key_1_dilations_0, groups = current_key_1_groups_0, pad = current_key_1_pad_0, pad_type = current_key_1_pad_type_0, strides = current_key_1_strides_0, weight = layers_0_self_attn_k_proj_weight_to_fp16_palettized, x = obj_1_cast_fp16)[name = string("current_key_1_cast_fp16")];
+            string current_value_1_pad_type_0 = const()[name = string("current_value_1_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> current_value_1_strides_0 = const()[name = string("current_value_1_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> current_value_1_pad_0 = const()[name = string("current_value_1_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> current_value_1_dilations_0 = const()[name = string("current_value_1_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 current_value_1_groups_0 = const()[name = string("current_value_1_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_0_self_attn_v_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(5261568))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(6310208))))[name = string("layers_0_self_attn_v_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1024]> layers_0_self_attn_v_proj_bias_to_fp16 = const()[name = string("layers_0_self_attn_v_proj_bias_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(6310784)))];
+            tensor<fp16, [1, 1024, 1, 1]> current_value_1_cast_fp16 = conv(bias = layers_0_self_attn_v_proj_bias_to_fp16, dilations = current_value_1_dilations_0, groups = current_value_1_groups_0, pad = current_value_1_pad_0, pad_type = current_value_1_pad_type_0, strides = current_value_1_strides_0, weight = layers_0_self_attn_v_proj_weight_to_fp16_palettized, x = obj_1_cast_fp16)[name = string("current_value_1_cast_fp16")];
+            tensor<int32, [4]> var_180 = const()[name = string("op_180"), val = tensor<int32, [4]>([16, 128, 1, 1])];
+            tensor<fp16, [16, 128, 1, 1]> inputs_3_cast_fp16 = reshape(shape = var_180, x = query_1_cast_fp16)[name = string("inputs_3_cast_fp16")];
+            tensor<fp16, [16, 128, 1, 1]> inputs_sq_3_cast_fp16 = mul(x = inputs_3_cast_fp16, y = inputs_3_cast_fp16)[name = string("inputs_sq_3_cast_fp16")];
+            tensor<int32, [1]> variance_3_axes_0 = const()[name = string("variance_3_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_3_keep_dims_0 = const()[name = string("variance_3_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [16, 1, 1, 1]> variance_3_cast_fp16 = reduce_mean(axes = variance_3_axes_0, keep_dims = variance_3_keep_dims_0, x = inputs_sq_3_cast_fp16)[name = string("variance_3_cast_fp16")];
+            fp16 var_186_to_fp16 = const()[name = string("op_186_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [16, 1, 1, 1]> var_187_cast_fp16 = add(x = variance_3_cast_fp16, y = var_186_to_fp16)[name = string("op_187_cast_fp16")];
+            fp32 var_188_epsilon_0 = const()[name = string("op_188_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [16, 1, 1, 1]> var_188_cast_fp16 = rsqrt(epsilon = var_188_epsilon_0, x = var_187_cast_fp16)[name = string("op_188_cast_fp16")];
+            tensor<fp16, [16, 128, 1, 1]> hidden_states_3_cast_fp16 = mul(x = inputs_3_cast_fp16, y = var_188_cast_fp16)[name = string("hidden_states_3_cast_fp16")];
+            tensor<fp16, [1, 128, 1, 1]> w_3_to_fp16 = const()[name = string("w_3_to_fp16"), val = tensor<fp16, [1, 128, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(6312896)))];
+            tensor<fp16, [16, 128, 1, 1]> query_normed_1_cast_fp16 = mul(x = w_3_to_fp16, y = hidden_states_3_cast_fp16)[name = string("query_normed_1_cast_fp16")];
+            tensor<int32, [4]> var_196 = const()[name = string("op_196"), val = tensor<int32, [4]>([8, 128, 1, 1])];
+            tensor<fp16, [8, 128, 1, 1]> inputs_5_cast_fp16 = reshape(shape = var_196, x = current_key_1_cast_fp16)[name = string("inputs_5_cast_fp16")];
+            tensor<fp16, [8, 128, 1, 1]> inputs_sq_5_cast_fp16 = mul(x = inputs_5_cast_fp16, y = inputs_5_cast_fp16)[name = string("inputs_sq_5_cast_fp16")];
+            tensor<int32, [1]> variance_5_axes_0 = const()[name = string("variance_5_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_5_keep_dims_0 = const()[name = string("variance_5_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [8, 1, 1, 1]> variance_5_cast_fp16 = reduce_mean(axes = variance_5_axes_0, keep_dims = variance_5_keep_dims_0, x = inputs_sq_5_cast_fp16)[name = string("variance_5_cast_fp16")];
+            fp16 var_202_to_fp16 = const()[name = string("op_202_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [8, 1, 1, 1]> var_203_cast_fp16 = add(x = variance_5_cast_fp16, y = var_202_to_fp16)[name = string("op_203_cast_fp16")];
+            fp32 var_204_epsilon_0 = const()[name = string("op_204_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [8, 1, 1, 1]> var_204_cast_fp16 = rsqrt(epsilon = var_204_epsilon_0, x = var_203_cast_fp16)[name = string("op_204_cast_fp16")];
+            tensor<fp16, [8, 128, 1, 1]> hidden_states_5_cast_fp16 = mul(x = inputs_5_cast_fp16, y = var_204_cast_fp16)[name = string("hidden_states_5_cast_fp16")];
+            tensor<fp16, [1, 128, 1, 1]> w_5_to_fp16 = const()[name = string("w_5_to_fp16"), val = tensor<fp16, [1, 128, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(6313216)))];
+            tensor<fp16, [8, 128, 1, 1]> current_key_normed_1_cast_fp16 = mul(x = w_5_to_fp16, y = hidden_states_5_cast_fp16)[name = string("current_key_normed_1_cast_fp16")];
+            tensor<int32, [4]> var_222 = const()[name = string("op_222"), val = tensor<int32, [4]>([1, 16, 128, -1])];
+            tensor<fp16, [1, 16, 128, 1]> mh_q_1_cast_fp16 = reshape(shape = var_222, x = query_normed_1_cast_fp16)[name = string("mh_q_1_cast_fp16")];
+            tensor<int32, [4]> var_224 = const()[name = string("op_224"), val = tensor<int32, [4]>([1, 8, 128, -1])];
+            tensor<fp16, [1, 8, 128, 1]> mh_k_1_cast_fp16 = reshape(shape = var_224, x = current_key_normed_1_cast_fp16)[name = string("mh_k_1_cast_fp16")];
+            tensor<int32, [1]> cos_1_axes_0 = const()[name = string("cos_1_axes_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [1, 1, 128, 1]> cos_1_cast_fp16 = expand_dims(axes = cos_1_axes_0, x = obj_7_cast_fp16)[name = string("cos_1_cast_fp16")];
+            tensor<int32, [1]> sin_1_axes_0 = const()[name = string("sin_1_axes_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [1, 1, 128, 1]> sin_1_cast_fp16 = expand_dims(axes = sin_1_axes_0, x = obj_9_cast_fp16)[name = string("sin_1_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> var_228_cast_fp16 = mul(x = mh_q_1_cast_fp16, y = cos_1_cast_fp16)[name = string("op_228_cast_fp16")];
+            tensor<int32, [4]> var_233_begin_0 = const()[name = string("op_233_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_233_end_0 = const()[name = string("op_233_end_0"), val = tensor<int32, [4]>([1, 16, 64, 1])];
+            tensor<bool, [4]> var_233_end_mask_0 = const()[name = string("op_233_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 16, 64, 1]> var_233_cast_fp16 = slice_by_index(begin = var_233_begin_0, end = var_233_end_0, end_mask = var_233_end_mask_0, x = mh_q_1_cast_fp16)[name = string("op_233_cast_fp16")];
+            tensor<int32, [4]> var_239_begin_0 = const()[name = string("op_239_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_239_end_0 = const()[name = string("op_239_end_0"), val = tensor<int32, [4]>([1, 16, 128, 1])];
+            tensor<bool, [4]> var_239_end_mask_0 = const()[name = string("op_239_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 16, 64, 1]> var_239_cast_fp16 = slice_by_index(begin = var_239_begin_0, end = var_239_end_0, end_mask = var_239_end_mask_0, x = mh_q_1_cast_fp16)[name = string("op_239_cast_fp16")];
+            fp16 const_17_promoted_to_fp16 = const()[name = string("const_17_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 16, 64, 1]> var_241_cast_fp16 = mul(x = var_239_cast_fp16, y = const_17_promoted_to_fp16)[name = string("op_241_cast_fp16")];
+            bool var_243_interleave_0 = const()[name = string("op_243_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 1]> var_243_cast_fp16 = concat(axis = var_121, interleave = var_243_interleave_0, values = (var_241_cast_fp16, var_233_cast_fp16))[name = string("op_243_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> var_244_cast_fp16 = mul(x = var_243_cast_fp16, y = sin_1_cast_fp16)[name = string("op_244_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> mh_q_3_cast_fp16 = add(x = var_228_cast_fp16, y = var_244_cast_fp16)[name = string("mh_q_3_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> var_246_cast_fp16 = mul(x = mh_k_1_cast_fp16, y = cos_1_cast_fp16)[name = string("op_246_cast_fp16")];
+            tensor<int32, [4]> var_251_begin_0 = const()[name = string("op_251_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_251_end_0 = const()[name = string("op_251_end_0"), val = tensor<int32, [4]>([1, 8, 64, 1])];
+            tensor<bool, [4]> var_251_end_mask_0 = const()[name = string("op_251_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 8, 64, 1]> var_251_cast_fp16 = slice_by_index(begin = var_251_begin_0, end = var_251_end_0, end_mask = var_251_end_mask_0, x = mh_k_1_cast_fp16)[name = string("op_251_cast_fp16")];
+            tensor<int32, [4]> var_257_begin_0 = const()[name = string("op_257_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_257_end_0 = const()[name = string("op_257_end_0"), val = tensor<int32, [4]>([1, 8, 128, 1])];
+            tensor<bool, [4]> var_257_end_mask_0 = const()[name = string("op_257_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 8, 64, 1]> var_257_cast_fp16 = slice_by_index(begin = var_257_begin_0, end = var_257_end_0, end_mask = var_257_end_mask_0, x = mh_k_1_cast_fp16)[name = string("op_257_cast_fp16")];
+            fp16 const_20_promoted_to_fp16 = const()[name = string("const_20_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 64, 1]> var_259_cast_fp16 = mul(x = var_257_cast_fp16, y = const_20_promoted_to_fp16)[name = string("op_259_cast_fp16")];
+            bool var_261_interleave_0 = const()[name = string("op_261_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 128, 1]> var_261_cast_fp16 = concat(axis = var_121, interleave = var_261_interleave_0, values = (var_259_cast_fp16, var_251_cast_fp16))[name = string("op_261_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> var_262_cast_fp16 = mul(x = var_261_cast_fp16, y = sin_1_cast_fp16)[name = string("op_262_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> mh_k_3_cast_fp16 = add(x = var_246_cast_fp16, y = var_262_cast_fp16)[name = string("mh_k_3_cast_fp16")];
+            tensor<int32, [4]> var_266 = const()[name = string("op_266"), val = tensor<int32, [4]>([1, 1024, 1, 1])];
+            tensor<fp16, [1, 1024, 1, 1]> current_key_3_cast_fp16 = reshape(shape = var_266, x = mh_k_3_cast_fp16)[name = string("current_key_3_cast_fp16")];
+            tensor<int32, [1]> var_269_axes_0 = const()[name = string("op_269_axes_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [1, 1, 16]> var_269_cast_fp16 = expand_dims(axes = var_269_axes_0, x = kv_cache_update_mask)[name = string("op_269_cast_fp16")];
+            tensor<int32, [1]> var_270_axes_0 = const()[name = string("op_270_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 1, 1, 16]> var_270_cast_fp16 = expand_dims(axes = var_270_axes_0, x = var_269_cast_fp16)[name = string("op_270_cast_fp16")];
+            fp16 var_122_to_fp16 = const()[name = string("op_122_to_fp16"), val = fp16(0x1p+0)];
+            tensor<fp16, [1, 1, 1, 16]> var_272_cast_fp16 = sub(x = var_122_to_fp16, y = var_270_cast_fp16)[name = string("op_272_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 16]> var_273_cast_fp16 = mul(x = var_96_cast_fp16_0, y = var_272_cast_fp16)[name = string("op_273_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 16]> var_274_cast_fp16 = mul(x = current_key_3_cast_fp16, y = var_270_cast_fp16)[name = string("op_274_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 16]> key_3_cast_fp16 = add(x = var_273_cast_fp16, y = var_274_cast_fp16)[name = string("key_3_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 16]> var_277_cast_fp16 = mul(x = var_104_cast_fp16_0, y = var_272_cast_fp16)[name = string("op_277_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 16]> var_278_cast_fp16 = mul(x = current_value_1_cast_fp16, y = var_270_cast_fp16)[name = string("op_278_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 16]> value_1_cast_fp16 = add(x = var_277_cast_fp16, y = var_278_cast_fp16)[name = string("value_1_cast_fp16")];
+            tensor<int32, [4]> var_282 = const()[name = string("op_282"), val = tensor<int32, [4]>([1, 8, 128, 16])];
+            tensor<fp16, [1, 8, 128, 16]> key_heads_1_cast_fp16 = reshape(shape = var_282, x = key_3_cast_fp16)[name = string("key_heads_1_cast_fp16")];
+            tensor<int32, [4]> var_284 = const()[name = string("op_284"), val = tensor<int32, [4]>([1, 8, 128, 16])];
+            tensor<fp16, [1, 8, 128, 16]> value_heads_1_cast_fp16 = reshape(shape = var_284, x = value_1_cast_fp16)[name = string("value_heads_1_cast_fp16")];
+            tensor<int32, [4]> var_287_begin_0 = const()[name = string("op_287_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_287_end_0 = const()[name = string("op_287_end_0"), val = tensor<int32, [4]>([1, 1, 128, 16])];
+            tensor<bool, [4]> var_287_end_mask_0 = const()[name = string("op_287_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_287_cast_fp16 = slice_by_index(begin = var_287_begin_0, end = var_287_end_0, end_mask = var_287_end_mask_0, x = key_heads_1_cast_fp16)[name = string("op_287_cast_fp16")];
+            tensor<int32, [4]> var_291_begin_0 = const()[name = string("op_291_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_291_end_0 = const()[name = string("op_291_end_0"), val = tensor<int32, [4]>([1, 1, 128, 16])];
+            tensor<bool, [4]> var_291_end_mask_0 = const()[name = string("op_291_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_291_cast_fp16 = slice_by_index(begin = var_291_begin_0, end = var_291_end_0, end_mask = var_291_end_mask_0, x = value_heads_1_cast_fp16)[name = string("op_291_cast_fp16")];
+            tensor<int32, [4]> var_303_begin_0 = const()[name = string("op_303_begin_0"), val = tensor<int32, [4]>([0, 1, 0, 0])];
+            tensor<int32, [4]> var_303_end_0 = const()[name = string("op_303_end_0"), val = tensor<int32, [4]>([1, 2, 128, 16])];
+            tensor<bool, [4]> var_303_end_mask_0 = const()[name = string("op_303_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_303_cast_fp16 = slice_by_index(begin = var_303_begin_0, end = var_303_end_0, end_mask = var_303_end_mask_0, x = key_heads_1_cast_fp16)[name = string("op_303_cast_fp16")];
+            tensor<int32, [4]> var_307_begin_0 = const()[name = string("op_307_begin_0"), val = tensor<int32, [4]>([0, 1, 0, 0])];
+            tensor<int32, [4]> var_307_end_0 = const()[name = string("op_307_end_0"), val = tensor<int32, [4]>([1, 2, 128, 16])];
+            tensor<bool, [4]> var_307_end_mask_0 = const()[name = string("op_307_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_307_cast_fp16 = slice_by_index(begin = var_307_begin_0, end = var_307_end_0, end_mask = var_307_end_mask_0, x = value_heads_1_cast_fp16)[name = string("op_307_cast_fp16")];
+            tensor<int32, [4]> var_319_begin_0 = const()[name = string("op_319_begin_0"), val = tensor<int32, [4]>([0, 2, 0, 0])];
+            tensor<int32, [4]> var_319_end_0 = const()[name = string("op_319_end_0"), val = tensor<int32, [4]>([1, 3, 128, 16])];
+            tensor<bool, [4]> var_319_end_mask_0 = const()[name = string("op_319_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_319_cast_fp16 = slice_by_index(begin = var_319_begin_0, end = var_319_end_0, end_mask = var_319_end_mask_0, x = key_heads_1_cast_fp16)[name = string("op_319_cast_fp16")];
+            tensor<int32, [4]> var_323_begin_0 = const()[name = string("op_323_begin_0"), val = tensor<int32, [4]>([0, 2, 0, 0])];
+            tensor<int32, [4]> var_323_end_0 = const()[name = string("op_323_end_0"), val = tensor<int32, [4]>([1, 3, 128, 16])];
+            tensor<bool, [4]> var_323_end_mask_0 = const()[name = string("op_323_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_323_cast_fp16 = slice_by_index(begin = var_323_begin_0, end = var_323_end_0, end_mask = var_323_end_mask_0, x = value_heads_1_cast_fp16)[name = string("op_323_cast_fp16")];
+            tensor<int32, [4]> var_335_begin_0 = const()[name = string("op_335_begin_0"), val = tensor<int32, [4]>([0, 3, 0, 0])];
+            tensor<int32, [4]> var_335_end_0 = const()[name = string("op_335_end_0"), val = tensor<int32, [4]>([1, 4, 128, 16])];
+            tensor<bool, [4]> var_335_end_mask_0 = const()[name = string("op_335_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_335_cast_fp16 = slice_by_index(begin = var_335_begin_0, end = var_335_end_0, end_mask = var_335_end_mask_0, x = key_heads_1_cast_fp16)[name = string("op_335_cast_fp16")];
+            tensor<int32, [4]> var_339_begin_0 = const()[name = string("op_339_begin_0"), val = tensor<int32, [4]>([0, 3, 0, 0])];
+            tensor<int32, [4]> var_339_end_0 = const()[name = string("op_339_end_0"), val = tensor<int32, [4]>([1, 4, 128, 16])];
+            tensor<bool, [4]> var_339_end_mask_0 = const()[name = string("op_339_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_339_cast_fp16 = slice_by_index(begin = var_339_begin_0, end = var_339_end_0, end_mask = var_339_end_mask_0, x = value_heads_1_cast_fp16)[name = string("op_339_cast_fp16")];
+            tensor<int32, [4]> var_351_begin_0 = const()[name = string("op_351_begin_0"), val = tensor<int32, [4]>([0, 4, 0, 0])];
+            tensor<int32, [4]> var_351_end_0 = const()[name = string("op_351_end_0"), val = tensor<int32, [4]>([1, 5, 128, 16])];
+            tensor<bool, [4]> var_351_end_mask_0 = const()[name = string("op_351_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_351_cast_fp16 = slice_by_index(begin = var_351_begin_0, end = var_351_end_0, end_mask = var_351_end_mask_0, x = key_heads_1_cast_fp16)[name = string("op_351_cast_fp16")];
+            tensor<int32, [4]> var_355_begin_0 = const()[name = string("op_355_begin_0"), val = tensor<int32, [4]>([0, 4, 0, 0])];
+            tensor<int32, [4]> var_355_end_0 = const()[name = string("op_355_end_0"), val = tensor<int32, [4]>([1, 5, 128, 16])];
+            tensor<bool, [4]> var_355_end_mask_0 = const()[name = string("op_355_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_355_cast_fp16 = slice_by_index(begin = var_355_begin_0, end = var_355_end_0, end_mask = var_355_end_mask_0, x = value_heads_1_cast_fp16)[name = string("op_355_cast_fp16")];
+            tensor<int32, [4]> var_367_begin_0 = const()[name = string("op_367_begin_0"), val = tensor<int32, [4]>([0, 5, 0, 0])];
+            tensor<int32, [4]> var_367_end_0 = const()[name = string("op_367_end_0"), val = tensor<int32, [4]>([1, 6, 128, 16])];
+            tensor<bool, [4]> var_367_end_mask_0 = const()[name = string("op_367_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_367_cast_fp16 = slice_by_index(begin = var_367_begin_0, end = var_367_end_0, end_mask = var_367_end_mask_0, x = key_heads_1_cast_fp16)[name = string("op_367_cast_fp16")];
+            tensor<int32, [4]> var_371_begin_0 = const()[name = string("op_371_begin_0"), val = tensor<int32, [4]>([0, 5, 0, 0])];
+            tensor<int32, [4]> var_371_end_0 = const()[name = string("op_371_end_0"), val = tensor<int32, [4]>([1, 6, 128, 16])];
+            tensor<bool, [4]> var_371_end_mask_0 = const()[name = string("op_371_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_371_cast_fp16 = slice_by_index(begin = var_371_begin_0, end = var_371_end_0, end_mask = var_371_end_mask_0, x = value_heads_1_cast_fp16)[name = string("op_371_cast_fp16")];
+            tensor<int32, [4]> var_383_begin_0 = const()[name = string("op_383_begin_0"), val = tensor<int32, [4]>([0, 6, 0, 0])];
+            tensor<int32, [4]> var_383_end_0 = const()[name = string("op_383_end_0"), val = tensor<int32, [4]>([1, 7, 128, 16])];
+            tensor<bool, [4]> var_383_end_mask_0 = const()[name = string("op_383_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_383_cast_fp16 = slice_by_index(begin = var_383_begin_0, end = var_383_end_0, end_mask = var_383_end_mask_0, x = key_heads_1_cast_fp16)[name = string("op_383_cast_fp16")];
+            tensor<int32, [4]> var_387_begin_0 = const()[name = string("op_387_begin_0"), val = tensor<int32, [4]>([0, 6, 0, 0])];
+            tensor<int32, [4]> var_387_end_0 = const()[name = string("op_387_end_0"), val = tensor<int32, [4]>([1, 7, 128, 16])];
+            tensor<bool, [4]> var_387_end_mask_0 = const()[name = string("op_387_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_387_cast_fp16 = slice_by_index(begin = var_387_begin_0, end = var_387_end_0, end_mask = var_387_end_mask_0, x = value_heads_1_cast_fp16)[name = string("op_387_cast_fp16")];
+            tensor<int32, [4]> var_399_begin_0 = const()[name = string("op_399_begin_0"), val = tensor<int32, [4]>([0, 7, 0, 0])];
+            tensor<int32, [4]> var_399_end_0 = const()[name = string("op_399_end_0"), val = tensor<int32, [4]>([1, 1, 128, 16])];
+            tensor<bool, [4]> var_399_end_mask_0 = const()[name = string("op_399_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_399_cast_fp16 = slice_by_index(begin = var_399_begin_0, end = var_399_end_0, end_mask = var_399_end_mask_0, x = key_heads_1_cast_fp16)[name = string("op_399_cast_fp16")];
+            tensor<int32, [4]> var_403_begin_0 = const()[name = string("op_403_begin_0"), val = tensor<int32, [4]>([0, 7, 0, 0])];
+            tensor<int32, [4]> var_403_end_0 = const()[name = string("op_403_end_0"), val = tensor<int32, [4]>([1, 1, 128, 16])];
+            tensor<bool, [4]> var_403_end_mask_0 = const()[name = string("op_403_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_403_cast_fp16 = slice_by_index(begin = var_403_begin_0, end = var_403_end_0, end_mask = var_403_end_mask_0, x = value_heads_1_cast_fp16)[name = string("op_403_cast_fp16")];
+            bool key_heads_3_interleave_0 = const()[name = string("key_heads_3_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 16]> key_heads_3_cast_fp16 = concat(axis = var_129, interleave = key_heads_3_interleave_0, values = (var_287_cast_fp16, var_287_cast_fp16, var_303_cast_fp16, var_303_cast_fp16, var_319_cast_fp16, var_319_cast_fp16, var_335_cast_fp16, var_335_cast_fp16, var_351_cast_fp16, var_351_cast_fp16, var_367_cast_fp16, var_367_cast_fp16, var_383_cast_fp16, var_383_cast_fp16, var_399_cast_fp16, var_399_cast_fp16))[name = string("key_heads_3_cast_fp16")];
+            bool value_heads_3_interleave_0 = const()[name = string("value_heads_3_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 16]> value_heads_3_cast_fp16 = concat(axis = var_129, interleave = value_heads_3_interleave_0, values = (var_291_cast_fp16, var_291_cast_fp16, var_307_cast_fp16, var_307_cast_fp16, var_323_cast_fp16, var_323_cast_fp16, var_339_cast_fp16, var_339_cast_fp16, var_355_cast_fp16, var_355_cast_fp16, var_371_cast_fp16, var_371_cast_fp16, var_387_cast_fp16, var_387_cast_fp16, var_403_cast_fp16, var_403_cast_fp16))[name = string("value_heads_3_cast_fp16")];
+            fp16 var_426_to_fp16 = const()[name = string("op_426_to_fp16"), val = fp16(0x1.6ap-4)];
+            tensor<fp16, [1, 16, 128, 1]> var_427_cast_fp16 = mul(x = mh_q_3_cast_fp16, y = var_426_to_fp16)[name = string("op_427_cast_fp16")];
+            bool mh_w_1_transpose_x_0 = const()[name = string("mh_w_1_transpose_x_0"), val = bool(true)];
+            bool mh_w_1_transpose_y_0 = const()[name = string("mh_w_1_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 1, 16]> mh_w_1_cast_fp16 = matmul(transpose_x = mh_w_1_transpose_x_0, transpose_y = mh_w_1_transpose_y_0, x = var_427_cast_fp16, y = key_heads_3_cast_fp16)[name = string("mh_w_1_cast_fp16")];
+            tensor<int32, [1]> var_435_axes_0 = const()[name = string("op_435_axes_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [1, 1, 16]> var_435_cast_fp16 = expand_dims(axes = var_435_axes_0, x = key_padding_mask)[name = string("op_435_cast_fp16")];
+            tensor<int32, [1]> var_436_axes_0 = const()[name = string("op_436_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 1, 1, 16]> var_436_cast_fp16 = expand_dims(axes = var_436_axes_0, x = var_435_cast_fp16)[name = string("op_436_cast_fp16")];
+            tensor<fp16, [1, 16, 1, 16]> mh_w_3_cast_fp16 = add(x = mh_w_1_cast_fp16, y = var_436_cast_fp16)[name = string("mh_w_3_cast_fp16")];
+            tensor<fp16, [1, 16, 1, 16]> var_439_cast_fp16 = softmax(axis = var_111, x = mh_w_3_cast_fp16)[name = string("op_439_cast_fp16")];
+            bool attn_1_transpose_x_0 = const()[name = string("attn_1_transpose_x_0"), val = bool(false)];
+            bool attn_1_transpose_y_0 = const()[name = string("attn_1_transpose_y_0"), val = bool(true)];
+            tensor<fp16, [1, 16, 128, 1]> attn_1_cast_fp16 = matmul(transpose_x = attn_1_transpose_x_0, transpose_y = attn_1_transpose_y_0, x = value_heads_3_cast_fp16, y = var_439_cast_fp16)[name = string("attn_1_cast_fp16")];
+            tensor<int32, [4]> var_444 = const()[name = string("op_444"), val = tensor<int32, [4]>([1, -1, 1, 1])];
+            tensor<fp16, [1, 2048, 1, 1]> input_1_cast_fp16 = reshape(shape = var_444, x = attn_1_cast_fp16)[name = string("input_1_cast_fp16")];
+            string obj_11_pad_type_0 = const()[name = string("obj_11_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> obj_11_strides_0 = const()[name = string("obj_11_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> obj_11_pad_0 = const()[name = string("obj_11_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> obj_11_dilations_0 = const()[name = string("obj_11_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 obj_11_groups_0 = const()[name = string("obj_11_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 2048, 1, 1]> layers_0_self_attn_o_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(6313536))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(8410752))))[name = string("layers_0_self_attn_o_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> obj_11_cast_fp16 = conv(bias = layers_0_self_attn_v_proj_bias_to_fp16, dilations = obj_11_dilations_0, groups = obj_11_groups_0, pad = obj_11_pad_0, pad_type = obj_11_pad_type_0, strides = obj_11_strides_0, weight = layers_0_self_attn_o_proj_weight_to_fp16_palettized, x = input_1_cast_fp16)[name = string("obj_11_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> inputs_7_cast_fp16 = add(x = inputs_1_cast_fp16, y = obj_11_cast_fp16)[name = string("inputs_7_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> inputs_sq_7_cast_fp16 = mul(x = inputs_7_cast_fp16, y = inputs_7_cast_fp16)[name = string("inputs_sq_7_cast_fp16")];
+            tensor<int32, [1]> variance_7_axes_0 = const()[name = string("variance_7_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_7_keep_dims_0 = const()[name = string("variance_7_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 1, 1, 1]> variance_7_cast_fp16 = reduce_mean(axes = variance_7_axes_0, keep_dims = variance_7_keep_dims_0, x = inputs_sq_7_cast_fp16)[name = string("variance_7_cast_fp16")];
+            fp16 var_462_to_fp16 = const()[name = string("op_462_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 1, 1]> var_463_cast_fp16 = add(x = variance_7_cast_fp16, y = var_462_to_fp16)[name = string("op_463_cast_fp16")];
+            fp32 var_464_epsilon_0 = const()[name = string("op_464_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 1, 1, 1]> var_464_cast_fp16 = rsqrt(epsilon = var_464_epsilon_0, x = var_463_cast_fp16)[name = string("op_464_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> hidden_states_7_cast_fp16 = mul(x = inputs_7_cast_fp16, y = var_464_cast_fp16)[name = string("hidden_states_7_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> w_7_to_fp16 = const()[name = string("w_7_to_fp16"), val = tensor<fp16, [1, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(8411328)))];
+            tensor<fp16, [1, 1024, 1, 1]> input_3_cast_fp16 = mul(x = w_7_to_fp16, y = hidden_states_7_cast_fp16)[name = string("input_3_cast_fp16")];
+            string input_5_pad_type_0 = const()[name = string("input_5_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> input_5_strides_0 = const()[name = string("input_5_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> input_5_pad_0 = const()[name = string("input_5_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> input_5_dilations_0 = const()[name = string("input_5_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 input_5_groups_0 = const()[name = string("input_5_groups_0"), val = int32(1)];
+            tensor<fp16, [3072, 1024, 1, 1]> layers_0_mlp_gate_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [3072, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(8413440))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(11559232))))[name = string("layers_0_mlp_gate_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 3072, 1, 1]> input_5_cast_fp16 = conv(dilations = input_5_dilations_0, groups = input_5_groups_0, pad = input_5_pad_0, pad_type = input_5_pad_type_0, strides = input_5_strides_0, weight = layers_0_mlp_gate_proj_weight_to_fp16_palettized, x = input_3_cast_fp16)[name = string("input_5_cast_fp16")];
+            tensor<fp16, [1, 3072, 1, 1]> var_478_cast_fp16 = silu(x = input_5_cast_fp16)[name = string("op_478_cast_fp16")];
+            string var_484_pad_type_0 = const()[name = string("op_484_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_484_strides_0 = const()[name = string("op_484_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_484_pad_0 = const()[name = string("op_484_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_484_dilations_0 = const()[name = string("op_484_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_484_groups_0 = const()[name = string("op_484_groups_0"), val = int32(1)];
+            tensor<fp16, [3072, 1024, 1, 1]> layers_0_mlp_up_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [3072, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(11559808))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(14705600))))[name = string("layers_0_mlp_up_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 3072, 1, 1]> var_484_cast_fp16 = conv(dilations = var_484_dilations_0, groups = var_484_groups_0, pad = var_484_pad_0, pad_type = var_484_pad_type_0, strides = var_484_strides_0, weight = layers_0_mlp_up_proj_weight_to_fp16_palettized, x = input_3_cast_fp16)[name = string("op_484_cast_fp16")];
+            tensor<fp16, [1, 3072, 1, 1]> input_7_cast_fp16 = mul(x = var_478_cast_fp16, y = var_484_cast_fp16)[name = string("input_7_cast_fp16")];
+            string hidden_states_9_pad_type_0 = const()[name = string("hidden_states_9_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> hidden_states_9_strides_0 = const()[name = string("hidden_states_9_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> hidden_states_9_pad_0 = const()[name = string("hidden_states_9_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> hidden_states_9_dilations_0 = const()[name = string("hidden_states_9_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 hidden_states_9_groups_0 = const()[name = string("hidden_states_9_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 3072, 1, 1]> layers_0_mlp_down_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 3072, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(14706176))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(17851968))))[name = string("layers_0_mlp_down_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> hidden_states_9_cast_fp16 = conv(dilations = hidden_states_9_dilations_0, groups = hidden_states_9_groups_0, pad = hidden_states_9_pad_0, pad_type = hidden_states_9_pad_type_0, strides = hidden_states_9_strides_0, weight = layers_0_mlp_down_proj_weight_to_fp16_palettized, x = input_7_cast_fp16)[name = string("hidden_states_9_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> inputs_9_cast_fp16 = add(x = inputs_7_cast_fp16, y = hidden_states_9_cast_fp16)[name = string("inputs_9_cast_fp16")];
+            int32 var_498 = const()[name = string("op_498"), val = int32(3)];
+            int32 var_508 = const()[name = string("op_508"), val = int32(-2)];
+            int32 var_516 = const()[name = string("op_516"), val = int32(1)];
+            tensor<fp16, [1, 1024, 1, 1]> inputs_sq_9_cast_fp16 = mul(x = inputs_9_cast_fp16, y = inputs_9_cast_fp16)[name = string("inputs_sq_9_cast_fp16")];
+            tensor<int32, [1]> variance_9_axes_0 = const()[name = string("variance_9_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_9_keep_dims_0 = const()[name = string("variance_9_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 1, 1, 1]> variance_9_cast_fp16 = reduce_mean(axes = variance_9_axes_0, keep_dims = variance_9_keep_dims_0, x = inputs_sq_9_cast_fp16)[name = string("variance_9_cast_fp16")];
+            fp16 var_528_to_fp16 = const()[name = string("op_528_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 1, 1]> var_529_cast_fp16 = add(x = variance_9_cast_fp16, y = var_528_to_fp16)[name = string("op_529_cast_fp16")];
+            fp32 var_530_epsilon_0 = const()[name = string("op_530_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 1, 1, 1]> var_530_cast_fp16 = rsqrt(epsilon = var_530_epsilon_0, x = var_529_cast_fp16)[name = string("op_530_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> hidden_states_11_cast_fp16 = mul(x = inputs_9_cast_fp16, y = var_530_cast_fp16)[name = string("hidden_states_11_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> w_9_to_fp16 = const()[name = string("w_9_to_fp16"), val = tensor<fp16, [1, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(17852544)))];
+            tensor<fp16, [1, 1024, 1, 1]> obj_13_cast_fp16 = mul(x = w_9_to_fp16, y = hidden_states_11_cast_fp16)[name = string("obj_13_cast_fp16")];
+            string query_7_pad_type_0 = const()[name = string("query_7_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> query_7_strides_0 = const()[name = string("query_7_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> query_7_pad_0 = const()[name = string("query_7_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> query_7_dilations_0 = const()[name = string("query_7_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 query_7_groups_0 = const()[name = string("query_7_groups_0"), val = int32(1)];
+            tensor<fp16, [2048, 1024, 1, 1]> layers_1_self_attn_q_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [2048, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(17854656))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(19951872))))[name = string("layers_1_self_attn_q_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1, 1]> query_7_cast_fp16 = conv(bias = layers_0_self_attn_q_proj_bias_to_fp16, dilations = query_7_dilations_0, groups = query_7_groups_0, pad = query_7_pad_0, pad_type = query_7_pad_type_0, strides = query_7_strides_0, weight = layers_1_self_attn_q_proj_weight_to_fp16_palettized, x = obj_13_cast_fp16)[name = string("query_7_cast_fp16")];
+            string current_key_5_pad_type_0 = const()[name = string("current_key_5_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> current_key_5_strides_0 = const()[name = string("current_key_5_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> current_key_5_pad_0 = const()[name = string("current_key_5_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> current_key_5_dilations_0 = const()[name = string("current_key_5_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 current_key_5_groups_0 = const()[name = string("current_key_5_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_1_self_attn_k_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(19952448))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(21001088))))[name = string("layers_1_self_attn_k_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> current_key_5_cast_fp16 = conv(dilations = current_key_5_dilations_0, groups = current_key_5_groups_0, pad = current_key_5_pad_0, pad_type = current_key_5_pad_type_0, strides = current_key_5_strides_0, weight = layers_1_self_attn_k_proj_weight_to_fp16_palettized, x = obj_13_cast_fp16)[name = string("current_key_5_cast_fp16")];
+            string current_value_3_pad_type_0 = const()[name = string("current_value_3_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> current_value_3_strides_0 = const()[name = string("current_value_3_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> current_value_3_pad_0 = const()[name = string("current_value_3_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> current_value_3_dilations_0 = const()[name = string("current_value_3_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 current_value_3_groups_0 = const()[name = string("current_value_3_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_1_self_attn_v_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(21001664))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(22050304))))[name = string("layers_1_self_attn_v_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> current_value_3_cast_fp16 = conv(bias = layers_0_self_attn_v_proj_bias_to_fp16, dilations = current_value_3_dilations_0, groups = current_value_3_groups_0, pad = current_value_3_pad_0, pad_type = current_value_3_pad_type_0, strides = current_value_3_strides_0, weight = layers_1_self_attn_v_proj_weight_to_fp16_palettized, x = obj_13_cast_fp16)[name = string("current_value_3_cast_fp16")];
+            tensor<int32, [4]> var_567 = const()[name = string("op_567"), val = tensor<int32, [4]>([16, 128, 1, 1])];
+            tensor<fp16, [16, 128, 1, 1]> inputs_11_cast_fp16 = reshape(shape = var_567, x = query_7_cast_fp16)[name = string("inputs_11_cast_fp16")];
+            tensor<fp16, [16, 128, 1, 1]> inputs_sq_11_cast_fp16 = mul(x = inputs_11_cast_fp16, y = inputs_11_cast_fp16)[name = string("inputs_sq_11_cast_fp16")];
+            tensor<int32, [1]> variance_11_axes_0 = const()[name = string("variance_11_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_11_keep_dims_0 = const()[name = string("variance_11_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [16, 1, 1, 1]> variance_11_cast_fp16 = reduce_mean(axes = variance_11_axes_0, keep_dims = variance_11_keep_dims_0, x = inputs_sq_11_cast_fp16)[name = string("variance_11_cast_fp16")];
+            fp16 var_573_to_fp16 = const()[name = string("op_573_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [16, 1, 1, 1]> var_574_cast_fp16 = add(x = variance_11_cast_fp16, y = var_573_to_fp16)[name = string("op_574_cast_fp16")];
+            fp32 var_575_epsilon_0 = const()[name = string("op_575_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [16, 1, 1, 1]> var_575_cast_fp16 = rsqrt(epsilon = var_575_epsilon_0, x = var_574_cast_fp16)[name = string("op_575_cast_fp16")];
+            tensor<fp16, [16, 128, 1, 1]> hidden_states_13_cast_fp16 = mul(x = inputs_11_cast_fp16, y = var_575_cast_fp16)[name = string("hidden_states_13_cast_fp16")];
+            tensor<fp16, [1, 128, 1, 1]> w_11_to_fp16 = const()[name = string("w_11_to_fp16"), val = tensor<fp16, [1, 128, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(22050880)))];
+            tensor<fp16, [16, 128, 1, 1]> query_normed_3_cast_fp16 = mul(x = w_11_to_fp16, y = hidden_states_13_cast_fp16)[name = string("query_normed_3_cast_fp16")];
+            tensor<int32, [4]> var_583 = const()[name = string("op_583"), val = tensor<int32, [4]>([8, 128, 1, 1])];
+            tensor<fp16, [8, 128, 1, 1]> inputs_13_cast_fp16 = reshape(shape = var_583, x = current_key_5_cast_fp16)[name = string("inputs_13_cast_fp16")];
+            tensor<fp16, [8, 128, 1, 1]> inputs_sq_13_cast_fp16 = mul(x = inputs_13_cast_fp16, y = inputs_13_cast_fp16)[name = string("inputs_sq_13_cast_fp16")];
+            tensor<int32, [1]> variance_13_axes_0 = const()[name = string("variance_13_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_13_keep_dims_0 = const()[name = string("variance_13_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [8, 1, 1, 1]> variance_13_cast_fp16 = reduce_mean(axes = variance_13_axes_0, keep_dims = variance_13_keep_dims_0, x = inputs_sq_13_cast_fp16)[name = string("variance_13_cast_fp16")];
+            fp16 var_589_to_fp16 = const()[name = string("op_589_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [8, 1, 1, 1]> var_590_cast_fp16 = add(x = variance_13_cast_fp16, y = var_589_to_fp16)[name = string("op_590_cast_fp16")];
+            fp32 var_591_epsilon_0 = const()[name = string("op_591_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [8, 1, 1, 1]> var_591_cast_fp16 = rsqrt(epsilon = var_591_epsilon_0, x = var_590_cast_fp16)[name = string("op_591_cast_fp16")];
+            tensor<fp16, [8, 128, 1, 1]> hidden_states_15_cast_fp16 = mul(x = inputs_13_cast_fp16, y = var_591_cast_fp16)[name = string("hidden_states_15_cast_fp16")];
+            tensor<fp16, [1, 128, 1, 1]> w_13_to_fp16 = const()[name = string("w_13_to_fp16"), val = tensor<fp16, [1, 128, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(22051200)))];
+            tensor<fp16, [8, 128, 1, 1]> current_key_normed_3_cast_fp16 = mul(x = w_13_to_fp16, y = hidden_states_15_cast_fp16)[name = string("current_key_normed_3_cast_fp16")];
+            tensor<int32, [4]> var_609 = const()[name = string("op_609"), val = tensor<int32, [4]>([1, 16, 128, -1])];
+            tensor<fp16, [1, 16, 128, 1]> mh_q_7_cast_fp16 = reshape(shape = var_609, x = query_normed_3_cast_fp16)[name = string("mh_q_7_cast_fp16")];
+            tensor<int32, [4]> var_611 = const()[name = string("op_611"), val = tensor<int32, [4]>([1, 8, 128, -1])];
+            tensor<fp16, [1, 8, 128, 1]> mh_k_5_cast_fp16 = reshape(shape = var_611, x = current_key_normed_3_cast_fp16)[name = string("mh_k_5_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> var_615_cast_fp16 = mul(x = mh_q_7_cast_fp16, y = cos_1_cast_fp16)[name = string("op_615_cast_fp16")];
+            tensor<int32, [4]> var_620_begin_0 = const()[name = string("op_620_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_620_end_0 = const()[name = string("op_620_end_0"), val = tensor<int32, [4]>([1, 16, 64, 1])];
+            tensor<bool, [4]> var_620_end_mask_0 = const()[name = string("op_620_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 16, 64, 1]> var_620_cast_fp16 = slice_by_index(begin = var_620_begin_0, end = var_620_end_0, end_mask = var_620_end_mask_0, x = mh_q_7_cast_fp16)[name = string("op_620_cast_fp16")];
+            tensor<int32, [4]> var_626_begin_0 = const()[name = string("op_626_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_626_end_0 = const()[name = string("op_626_end_0"), val = tensor<int32, [4]>([1, 16, 128, 1])];
+            tensor<bool, [4]> var_626_end_mask_0 = const()[name = string("op_626_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 16, 64, 1]> var_626_cast_fp16 = slice_by_index(begin = var_626_begin_0, end = var_626_end_0, end_mask = var_626_end_mask_0, x = mh_q_7_cast_fp16)[name = string("op_626_cast_fp16")];
+            fp16 const_40_promoted_to_fp16 = const()[name = string("const_40_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 16, 64, 1]> var_628_cast_fp16 = mul(x = var_626_cast_fp16, y = const_40_promoted_to_fp16)[name = string("op_628_cast_fp16")];
+            bool var_630_interleave_0 = const()[name = string("op_630_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 1]> var_630_cast_fp16 = concat(axis = var_508, interleave = var_630_interleave_0, values = (var_628_cast_fp16, var_620_cast_fp16))[name = string("op_630_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> var_631_cast_fp16 = mul(x = var_630_cast_fp16, y = sin_1_cast_fp16)[name = string("op_631_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> mh_q_9_cast_fp16 = add(x = var_615_cast_fp16, y = var_631_cast_fp16)[name = string("mh_q_9_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> var_633_cast_fp16 = mul(x = mh_k_5_cast_fp16, y = cos_1_cast_fp16)[name = string("op_633_cast_fp16")];
+            tensor<int32, [4]> var_638_begin_0 = const()[name = string("op_638_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_638_end_0 = const()[name = string("op_638_end_0"), val = tensor<int32, [4]>([1, 8, 64, 1])];
+            tensor<bool, [4]> var_638_end_mask_0 = const()[name = string("op_638_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 8, 64, 1]> var_638_cast_fp16 = slice_by_index(begin = var_638_begin_0, end = var_638_end_0, end_mask = var_638_end_mask_0, x = mh_k_5_cast_fp16)[name = string("op_638_cast_fp16")];
+            tensor<int32, [4]> var_644_begin_0 = const()[name = string("op_644_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_644_end_0 = const()[name = string("op_644_end_0"), val = tensor<int32, [4]>([1, 8, 128, 1])];
+            tensor<bool, [4]> var_644_end_mask_0 = const()[name = string("op_644_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 8, 64, 1]> var_644_cast_fp16 = slice_by_index(begin = var_644_begin_0, end = var_644_end_0, end_mask = var_644_end_mask_0, x = mh_k_5_cast_fp16)[name = string("op_644_cast_fp16")];
+            fp16 const_43_promoted_to_fp16 = const()[name = string("const_43_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 64, 1]> var_646_cast_fp16 = mul(x = var_644_cast_fp16, y = const_43_promoted_to_fp16)[name = string("op_646_cast_fp16")];
+            bool var_648_interleave_0 = const()[name = string("op_648_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 128, 1]> var_648_cast_fp16 = concat(axis = var_508, interleave = var_648_interleave_0, values = (var_646_cast_fp16, var_638_cast_fp16))[name = string("op_648_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> var_649_cast_fp16 = mul(x = var_648_cast_fp16, y = sin_1_cast_fp16)[name = string("op_649_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> mh_k_7_cast_fp16 = add(x = var_633_cast_fp16, y = var_649_cast_fp16)[name = string("mh_k_7_cast_fp16")];
+            tensor<int32, [4]> var_653 = const()[name = string("op_653"), val = tensor<int32, [4]>([1, 1024, 1, 1])];
+            tensor<fp16, [1, 1024, 1, 1]> current_key_7_cast_fp16 = reshape(shape = var_653, x = mh_k_7_cast_fp16)[name = string("current_key_7_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 16]> var_660_cast_fp16 = mul(x = var_96_cast_fp16_1, y = var_272_cast_fp16)[name = string("op_660_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 16]> var_661_cast_fp16 = mul(x = current_key_7_cast_fp16, y = var_270_cast_fp16)[name = string("op_661_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 16]> key_9_cast_fp16 = add(x = var_660_cast_fp16, y = var_661_cast_fp16)[name = string("key_9_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 16]> var_664_cast_fp16 = mul(x = var_104_cast_fp16_1, y = var_272_cast_fp16)[name = string("op_664_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 16]> var_665_cast_fp16 = mul(x = current_value_3_cast_fp16, y = var_270_cast_fp16)[name = string("op_665_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 16]> value_5_cast_fp16 = add(x = var_664_cast_fp16, y = var_665_cast_fp16)[name = string("value_5_cast_fp16")];
+            tensor<int32, [4]> var_669 = const()[name = string("op_669"), val = tensor<int32, [4]>([1, 8, 128, 16])];
+            tensor<fp16, [1, 8, 128, 16]> key_heads_5_cast_fp16 = reshape(shape = var_669, x = key_9_cast_fp16)[name = string("key_heads_5_cast_fp16")];
+            tensor<int32, [4]> var_671 = const()[name = string("op_671"), val = tensor<int32, [4]>([1, 8, 128, 16])];
+            tensor<fp16, [1, 8, 128, 16]> value_heads_5_cast_fp16 = reshape(shape = var_671, x = value_5_cast_fp16)[name = string("value_heads_5_cast_fp16")];
+            tensor<int32, [4]> var_674_begin_0 = const()[name = string("op_674_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_674_end_0 = const()[name = string("op_674_end_0"), val = tensor<int32, [4]>([1, 1, 128, 16])];
+            tensor<bool, [4]> var_674_end_mask_0 = const()[name = string("op_674_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_674_cast_fp16 = slice_by_index(begin = var_674_begin_0, end = var_674_end_0, end_mask = var_674_end_mask_0, x = key_heads_5_cast_fp16)[name = string("op_674_cast_fp16")];
+            tensor<int32, [4]> var_678_begin_0 = const()[name = string("op_678_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_678_end_0 = const()[name = string("op_678_end_0"), val = tensor<int32, [4]>([1, 1, 128, 16])];
+            tensor<bool, [4]> var_678_end_mask_0 = const()[name = string("op_678_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_678_cast_fp16 = slice_by_index(begin = var_678_begin_0, end = var_678_end_0, end_mask = var_678_end_mask_0, x = value_heads_5_cast_fp16)[name = string("op_678_cast_fp16")];
+            tensor<int32, [4]> var_690_begin_0 = const()[name = string("op_690_begin_0"), val = tensor<int32, [4]>([0, 1, 0, 0])];
+            tensor<int32, [4]> var_690_end_0 = const()[name = string("op_690_end_0"), val = tensor<int32, [4]>([1, 2, 128, 16])];
+            tensor<bool, [4]> var_690_end_mask_0 = const()[name = string("op_690_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_690_cast_fp16 = slice_by_index(begin = var_690_begin_0, end = var_690_end_0, end_mask = var_690_end_mask_0, x = key_heads_5_cast_fp16)[name = string("op_690_cast_fp16")];
+            tensor<int32, [4]> var_694_begin_0 = const()[name = string("op_694_begin_0"), val = tensor<int32, [4]>([0, 1, 0, 0])];
+            tensor<int32, [4]> var_694_end_0 = const()[name = string("op_694_end_0"), val = tensor<int32, [4]>([1, 2, 128, 16])];
+            tensor<bool, [4]> var_694_end_mask_0 = const()[name = string("op_694_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_694_cast_fp16 = slice_by_index(begin = var_694_begin_0, end = var_694_end_0, end_mask = var_694_end_mask_0, x = value_heads_5_cast_fp16)[name = string("op_694_cast_fp16")];
+            tensor<int32, [4]> var_706_begin_0 = const()[name = string("op_706_begin_0"), val = tensor<int32, [4]>([0, 2, 0, 0])];
+            tensor<int32, [4]> var_706_end_0 = const()[name = string("op_706_end_0"), val = tensor<int32, [4]>([1, 3, 128, 16])];
+            tensor<bool, [4]> var_706_end_mask_0 = const()[name = string("op_706_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_706_cast_fp16 = slice_by_index(begin = var_706_begin_0, end = var_706_end_0, end_mask = var_706_end_mask_0, x = key_heads_5_cast_fp16)[name = string("op_706_cast_fp16")];
+            tensor<int32, [4]> var_710_begin_0 = const()[name = string("op_710_begin_0"), val = tensor<int32, [4]>([0, 2, 0, 0])];
+            tensor<int32, [4]> var_710_end_0 = const()[name = string("op_710_end_0"), val = tensor<int32, [4]>([1, 3, 128, 16])];
+            tensor<bool, [4]> var_710_end_mask_0 = const()[name = string("op_710_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_710_cast_fp16 = slice_by_index(begin = var_710_begin_0, end = var_710_end_0, end_mask = var_710_end_mask_0, x = value_heads_5_cast_fp16)[name = string("op_710_cast_fp16")];
+            tensor<int32, [4]> var_722_begin_0 = const()[name = string("op_722_begin_0"), val = tensor<int32, [4]>([0, 3, 0, 0])];
+            tensor<int32, [4]> var_722_end_0 = const()[name = string("op_722_end_0"), val = tensor<int32, [4]>([1, 4, 128, 16])];
+            tensor<bool, [4]> var_722_end_mask_0 = const()[name = string("op_722_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_722_cast_fp16 = slice_by_index(begin = var_722_begin_0, end = var_722_end_0, end_mask = var_722_end_mask_0, x = key_heads_5_cast_fp16)[name = string("op_722_cast_fp16")];
+            tensor<int32, [4]> var_726_begin_0 = const()[name = string("op_726_begin_0"), val = tensor<int32, [4]>([0, 3, 0, 0])];
+            tensor<int32, [4]> var_726_end_0 = const()[name = string("op_726_end_0"), val = tensor<int32, [4]>([1, 4, 128, 16])];
+            tensor<bool, [4]> var_726_end_mask_0 = const()[name = string("op_726_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_726_cast_fp16 = slice_by_index(begin = var_726_begin_0, end = var_726_end_0, end_mask = var_726_end_mask_0, x = value_heads_5_cast_fp16)[name = string("op_726_cast_fp16")];
+            tensor<int32, [4]> var_738_begin_0 = const()[name = string("op_738_begin_0"), val = tensor<int32, [4]>([0, 4, 0, 0])];
+            tensor<int32, [4]> var_738_end_0 = const()[name = string("op_738_end_0"), val = tensor<int32, [4]>([1, 5, 128, 16])];
+            tensor<bool, [4]> var_738_end_mask_0 = const()[name = string("op_738_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_738_cast_fp16 = slice_by_index(begin = var_738_begin_0, end = var_738_end_0, end_mask = var_738_end_mask_0, x = key_heads_5_cast_fp16)[name = string("op_738_cast_fp16")];
+            tensor<int32, [4]> var_742_begin_0 = const()[name = string("op_742_begin_0"), val = tensor<int32, [4]>([0, 4, 0, 0])];
+            tensor<int32, [4]> var_742_end_0 = const()[name = string("op_742_end_0"), val = tensor<int32, [4]>([1, 5, 128, 16])];
+            tensor<bool, [4]> var_742_end_mask_0 = const()[name = string("op_742_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_742_cast_fp16 = slice_by_index(begin = var_742_begin_0, end = var_742_end_0, end_mask = var_742_end_mask_0, x = value_heads_5_cast_fp16)[name = string("op_742_cast_fp16")];
+            tensor<int32, [4]> var_754_begin_0 = const()[name = string("op_754_begin_0"), val = tensor<int32, [4]>([0, 5, 0, 0])];
+            tensor<int32, [4]> var_754_end_0 = const()[name = string("op_754_end_0"), val = tensor<int32, [4]>([1, 6, 128, 16])];
+            tensor<bool, [4]> var_754_end_mask_0 = const()[name = string("op_754_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_754_cast_fp16 = slice_by_index(begin = var_754_begin_0, end = var_754_end_0, end_mask = var_754_end_mask_0, x = key_heads_5_cast_fp16)[name = string("op_754_cast_fp16")];
+            tensor<int32, [4]> var_758_begin_0 = const()[name = string("op_758_begin_0"), val = tensor<int32, [4]>([0, 5, 0, 0])];
+            tensor<int32, [4]> var_758_end_0 = const()[name = string("op_758_end_0"), val = tensor<int32, [4]>([1, 6, 128, 16])];
+            tensor<bool, [4]> var_758_end_mask_0 = const()[name = string("op_758_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_758_cast_fp16 = slice_by_index(begin = var_758_begin_0, end = var_758_end_0, end_mask = var_758_end_mask_0, x = value_heads_5_cast_fp16)[name = string("op_758_cast_fp16")];
+            tensor<int32, [4]> var_770_begin_0 = const()[name = string("op_770_begin_0"), val = tensor<int32, [4]>([0, 6, 0, 0])];
+            tensor<int32, [4]> var_770_end_0 = const()[name = string("op_770_end_0"), val = tensor<int32, [4]>([1, 7, 128, 16])];
+            tensor<bool, [4]> var_770_end_mask_0 = const()[name = string("op_770_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_770_cast_fp16 = slice_by_index(begin = var_770_begin_0, end = var_770_end_0, end_mask = var_770_end_mask_0, x = key_heads_5_cast_fp16)[name = string("op_770_cast_fp16")];
+            tensor<int32, [4]> var_774_begin_0 = const()[name = string("op_774_begin_0"), val = tensor<int32, [4]>([0, 6, 0, 0])];
+            tensor<int32, [4]> var_774_end_0 = const()[name = string("op_774_end_0"), val = tensor<int32, [4]>([1, 7, 128, 16])];
+            tensor<bool, [4]> var_774_end_mask_0 = const()[name = string("op_774_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_774_cast_fp16 = slice_by_index(begin = var_774_begin_0, end = var_774_end_0, end_mask = var_774_end_mask_0, x = value_heads_5_cast_fp16)[name = string("op_774_cast_fp16")];
+            tensor<int32, [4]> var_786_begin_0 = const()[name = string("op_786_begin_0"), val = tensor<int32, [4]>([0, 7, 0, 0])];
+            tensor<int32, [4]> var_786_end_0 = const()[name = string("op_786_end_0"), val = tensor<int32, [4]>([1, 1, 128, 16])];
+            tensor<bool, [4]> var_786_end_mask_0 = const()[name = string("op_786_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_786_cast_fp16 = slice_by_index(begin = var_786_begin_0, end = var_786_end_0, end_mask = var_786_end_mask_0, x = key_heads_5_cast_fp16)[name = string("op_786_cast_fp16")];
+            tensor<int32, [4]> var_790_begin_0 = const()[name = string("op_790_begin_0"), val = tensor<int32, [4]>([0, 7, 0, 0])];
+            tensor<int32, [4]> var_790_end_0 = const()[name = string("op_790_end_0"), val = tensor<int32, [4]>([1, 1, 128, 16])];
+            tensor<bool, [4]> var_790_end_mask_0 = const()[name = string("op_790_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_790_cast_fp16 = slice_by_index(begin = var_790_begin_0, end = var_790_end_0, end_mask = var_790_end_mask_0, x = value_heads_5_cast_fp16)[name = string("op_790_cast_fp16")];
+            bool key_heads_7_interleave_0 = const()[name = string("key_heads_7_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 16]> key_heads_7_cast_fp16 = concat(axis = var_516, interleave = key_heads_7_interleave_0, values = (var_674_cast_fp16, var_674_cast_fp16, var_690_cast_fp16, var_690_cast_fp16, var_706_cast_fp16, var_706_cast_fp16, var_722_cast_fp16, var_722_cast_fp16, var_738_cast_fp16, var_738_cast_fp16, var_754_cast_fp16, var_754_cast_fp16, var_770_cast_fp16, var_770_cast_fp16, var_786_cast_fp16, var_786_cast_fp16))[name = string("key_heads_7_cast_fp16")];
+            bool value_heads_7_interleave_0 = const()[name = string("value_heads_7_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 16]> value_heads_7_cast_fp16 = concat(axis = var_516, interleave = value_heads_7_interleave_0, values = (var_678_cast_fp16, var_678_cast_fp16, var_694_cast_fp16, var_694_cast_fp16, var_710_cast_fp16, var_710_cast_fp16, var_726_cast_fp16, var_726_cast_fp16, var_742_cast_fp16, var_742_cast_fp16, var_758_cast_fp16, var_758_cast_fp16, var_774_cast_fp16, var_774_cast_fp16, var_790_cast_fp16, var_790_cast_fp16))[name = string("value_heads_7_cast_fp16")];
+            fp16 var_813_to_fp16 = const()[name = string("op_813_to_fp16"), val = fp16(0x1.6ap-4)];
+            tensor<fp16, [1, 16, 128, 1]> var_814_cast_fp16 = mul(x = mh_q_9_cast_fp16, y = var_813_to_fp16)[name = string("op_814_cast_fp16")];
+            bool mh_w_5_transpose_x_0 = const()[name = string("mh_w_5_transpose_x_0"), val = bool(true)];
+            bool mh_w_5_transpose_y_0 = const()[name = string("mh_w_5_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 1, 16]> mh_w_5_cast_fp16 = matmul(transpose_x = mh_w_5_transpose_x_0, transpose_y = mh_w_5_transpose_y_0, x = var_814_cast_fp16, y = key_heads_7_cast_fp16)[name = string("mh_w_5_cast_fp16")];
+            tensor<fp16, [1, 16, 1, 16]> mh_w_7_cast_fp16 = add(x = mh_w_5_cast_fp16, y = var_436_cast_fp16)[name = string("mh_w_7_cast_fp16")];
+            tensor<fp16, [1, 16, 1, 16]> var_826_cast_fp16 = softmax(axis = var_498, x = mh_w_7_cast_fp16)[name = string("op_826_cast_fp16")];
+            bool attn_3_transpose_x_0 = const()[name = string("attn_3_transpose_x_0"), val = bool(false)];
+            bool attn_3_transpose_y_0 = const()[name = string("attn_3_transpose_y_0"), val = bool(true)];
+            tensor<fp16, [1, 16, 128, 1]> attn_3_cast_fp16 = matmul(transpose_x = attn_3_transpose_x_0, transpose_y = attn_3_transpose_y_0, x = value_heads_7_cast_fp16, y = var_826_cast_fp16)[name = string("attn_3_cast_fp16")];
+            tensor<int32, [4]> var_831 = const()[name = string("op_831"), val = tensor<int32, [4]>([1, -1, 1, 1])];
+            tensor<fp16, [1, 2048, 1, 1]> input_9_cast_fp16 = reshape(shape = var_831, x = attn_3_cast_fp16)[name = string("input_9_cast_fp16")];
+            string obj_19_pad_type_0 = const()[name = string("obj_19_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> obj_19_strides_0 = const()[name = string("obj_19_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> obj_19_pad_0 = const()[name = string("obj_19_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> obj_19_dilations_0 = const()[name = string("obj_19_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 obj_19_groups_0 = const()[name = string("obj_19_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 2048, 1, 1]> layers_1_self_attn_o_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(22051520))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(24148736))))[name = string("layers_1_self_attn_o_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> obj_19_cast_fp16 = conv(bias = layers_0_self_attn_v_proj_bias_to_fp16, dilations = obj_19_dilations_0, groups = obj_19_groups_0, pad = obj_19_pad_0, pad_type = obj_19_pad_type_0, strides = obj_19_strides_0, weight = layers_1_self_attn_o_proj_weight_to_fp16_palettized, x = input_9_cast_fp16)[name = string("obj_19_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> inputs_15_cast_fp16 = add(x = inputs_9_cast_fp16, y = obj_19_cast_fp16)[name = string("inputs_15_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> inputs_sq_15_cast_fp16 = mul(x = inputs_15_cast_fp16, y = inputs_15_cast_fp16)[name = string("inputs_sq_15_cast_fp16")];
+            tensor<int32, [1]> variance_15_axes_0 = const()[name = string("variance_15_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_15_keep_dims_0 = const()[name = string("variance_15_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 1, 1, 1]> variance_15_cast_fp16 = reduce_mean(axes = variance_15_axes_0, keep_dims = variance_15_keep_dims_0, x = inputs_sq_15_cast_fp16)[name = string("variance_15_cast_fp16")];
+            fp16 var_849_to_fp16 = const()[name = string("op_849_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 1, 1]> var_850_cast_fp16 = add(x = variance_15_cast_fp16, y = var_849_to_fp16)[name = string("op_850_cast_fp16")];
+            fp32 var_851_epsilon_0 = const()[name = string("op_851_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 1, 1, 1]> var_851_cast_fp16 = rsqrt(epsilon = var_851_epsilon_0, x = var_850_cast_fp16)[name = string("op_851_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> hidden_states_17_cast_fp16 = mul(x = inputs_15_cast_fp16, y = var_851_cast_fp16)[name = string("hidden_states_17_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> w_15_to_fp16 = const()[name = string("w_15_to_fp16"), val = tensor<fp16, [1, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(24149312)))];
+            tensor<fp16, [1, 1024, 1, 1]> input_11_cast_fp16 = mul(x = w_15_to_fp16, y = hidden_states_17_cast_fp16)[name = string("input_11_cast_fp16")];
+            string input_13_pad_type_0 = const()[name = string("input_13_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> input_13_strides_0 = const()[name = string("input_13_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> input_13_pad_0 = const()[name = string("input_13_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> input_13_dilations_0 = const()[name = string("input_13_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 input_13_groups_0 = const()[name = string("input_13_groups_0"), val = int32(1)];
+            tensor<fp16, [3072, 1024, 1, 1]> layers_1_mlp_gate_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [3072, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(24151424))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(27297216))))[name = string("layers_1_mlp_gate_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 3072, 1, 1]> input_13_cast_fp16 = conv(dilations = input_13_dilations_0, groups = input_13_groups_0, pad = input_13_pad_0, pad_type = input_13_pad_type_0, strides = input_13_strides_0, weight = layers_1_mlp_gate_proj_weight_to_fp16_palettized, x = input_11_cast_fp16)[name = string("input_13_cast_fp16")];
+            tensor<fp16, [1, 3072, 1, 1]> var_865_cast_fp16 = silu(x = input_13_cast_fp16)[name = string("op_865_cast_fp16")];
+            string var_871_pad_type_0 = const()[name = string("op_871_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_871_strides_0 = const()[name = string("op_871_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_871_pad_0 = const()[name = string("op_871_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_871_dilations_0 = const()[name = string("op_871_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_871_groups_0 = const()[name = string("op_871_groups_0"), val = int32(1)];
+            tensor<fp16, [3072, 1024, 1, 1]> layers_1_mlp_up_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [3072, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(27297792))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(30443584))))[name = string("layers_1_mlp_up_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 3072, 1, 1]> var_871_cast_fp16 = conv(dilations = var_871_dilations_0, groups = var_871_groups_0, pad = var_871_pad_0, pad_type = var_871_pad_type_0, strides = var_871_strides_0, weight = layers_1_mlp_up_proj_weight_to_fp16_palettized, x = input_11_cast_fp16)[name = string("op_871_cast_fp16")];
+            tensor<fp16, [1, 3072, 1, 1]> input_15_cast_fp16 = mul(x = var_865_cast_fp16, y = var_871_cast_fp16)[name = string("input_15_cast_fp16")];
+            string hidden_states_19_pad_type_0 = const()[name = string("hidden_states_19_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> hidden_states_19_strides_0 = const()[name = string("hidden_states_19_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> hidden_states_19_pad_0 = const()[name = string("hidden_states_19_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> hidden_states_19_dilations_0 = const()[name = string("hidden_states_19_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 hidden_states_19_groups_0 = const()[name = string("hidden_states_19_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 3072, 1, 1]> layers_1_mlp_down_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 3072, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(30444160))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(33589952))))[name = string("layers_1_mlp_down_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> hidden_states_19_cast_fp16 = conv(dilations = hidden_states_19_dilations_0, groups = hidden_states_19_groups_0, pad = hidden_states_19_pad_0, pad_type = hidden_states_19_pad_type_0, strides = hidden_states_19_strides_0, weight = layers_1_mlp_down_proj_weight_to_fp16_palettized, x = input_15_cast_fp16)[name = string("hidden_states_19_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> inputs_17_cast_fp16 = add(x = inputs_15_cast_fp16, y = hidden_states_19_cast_fp16)[name = string("inputs_17_cast_fp16")];
+            int32 var_885 = const()[name = string("op_885"), val = int32(3)];
+            int32 var_895 = const()[name = string("op_895"), val = int32(-2)];
+            int32 var_903 = const()[name = string("op_903"), val = int32(1)];
+            tensor<fp16, [1, 1024, 1, 1]> inputs_sq_17_cast_fp16 = mul(x = inputs_17_cast_fp16, y = inputs_17_cast_fp16)[name = string("inputs_sq_17_cast_fp16")];
+            tensor<int32, [1]> variance_17_axes_0 = const()[name = string("variance_17_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_17_keep_dims_0 = const()[name = string("variance_17_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 1, 1, 1]> variance_17_cast_fp16 = reduce_mean(axes = variance_17_axes_0, keep_dims = variance_17_keep_dims_0, x = inputs_sq_17_cast_fp16)[name = string("variance_17_cast_fp16")];
+            fp16 var_915_to_fp16 = const()[name = string("op_915_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 1, 1]> var_916_cast_fp16 = add(x = variance_17_cast_fp16, y = var_915_to_fp16)[name = string("op_916_cast_fp16")];
+            fp32 var_917_epsilon_0 = const()[name = string("op_917_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 1, 1, 1]> var_917_cast_fp16 = rsqrt(epsilon = var_917_epsilon_0, x = var_916_cast_fp16)[name = string("op_917_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> hidden_states_21_cast_fp16 = mul(x = inputs_17_cast_fp16, y = var_917_cast_fp16)[name = string("hidden_states_21_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> w_17_to_fp16 = const()[name = string("w_17_to_fp16"), val = tensor<fp16, [1, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(33590528)))];
+            tensor<fp16, [1, 1024, 1, 1]> obj_21_cast_fp16 = mul(x = w_17_to_fp16, y = hidden_states_21_cast_fp16)[name = string("obj_21_cast_fp16")];
+            string query_13_pad_type_0 = const()[name = string("query_13_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> query_13_strides_0 = const()[name = string("query_13_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> query_13_pad_0 = const()[name = string("query_13_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> query_13_dilations_0 = const()[name = string("query_13_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 query_13_groups_0 = const()[name = string("query_13_groups_0"), val = int32(1)];
+            tensor<fp16, [2048, 1024, 1, 1]> layers_2_self_attn_q_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [2048, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(33592640))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(35689856))))[name = string("layers_2_self_attn_q_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1, 1]> query_13_cast_fp16 = conv(bias = layers_0_self_attn_q_proj_bias_to_fp16, dilations = query_13_dilations_0, groups = query_13_groups_0, pad = query_13_pad_0, pad_type = query_13_pad_type_0, strides = query_13_strides_0, weight = layers_2_self_attn_q_proj_weight_to_fp16_palettized, x = obj_21_cast_fp16)[name = string("query_13_cast_fp16")];
+            string current_key_9_pad_type_0 = const()[name = string("current_key_9_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> current_key_9_strides_0 = const()[name = string("current_key_9_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> current_key_9_pad_0 = const()[name = string("current_key_9_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> current_key_9_dilations_0 = const()[name = string("current_key_9_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 current_key_9_groups_0 = const()[name = string("current_key_9_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_2_self_attn_k_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(35690432))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(36739072))))[name = string("layers_2_self_attn_k_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> current_key_9_cast_fp16 = conv(dilations = current_key_9_dilations_0, groups = current_key_9_groups_0, pad = current_key_9_pad_0, pad_type = current_key_9_pad_type_0, strides = current_key_9_strides_0, weight = layers_2_self_attn_k_proj_weight_to_fp16_palettized, x = obj_21_cast_fp16)[name = string("current_key_9_cast_fp16")];
+            string current_value_5_pad_type_0 = const()[name = string("current_value_5_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> current_value_5_strides_0 = const()[name = string("current_value_5_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> current_value_5_pad_0 = const()[name = string("current_value_5_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> current_value_5_dilations_0 = const()[name = string("current_value_5_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 current_value_5_groups_0 = const()[name = string("current_value_5_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_2_self_attn_v_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(36739648))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(37788288))))[name = string("layers_2_self_attn_v_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> current_value_5_cast_fp16 = conv(bias = layers_0_self_attn_v_proj_bias_to_fp16, dilations = current_value_5_dilations_0, groups = current_value_5_groups_0, pad = current_value_5_pad_0, pad_type = current_value_5_pad_type_0, strides = current_value_5_strides_0, weight = layers_2_self_attn_v_proj_weight_to_fp16_palettized, x = obj_21_cast_fp16)[name = string("current_value_5_cast_fp16")];
+            tensor<int32, [4]> var_954 = const()[name = string("op_954"), val = tensor<int32, [4]>([16, 128, 1, 1])];
+            tensor<fp16, [16, 128, 1, 1]> inputs_19_cast_fp16 = reshape(shape = var_954, x = query_13_cast_fp16)[name = string("inputs_19_cast_fp16")];
+            tensor<fp16, [16, 128, 1, 1]> inputs_sq_19_cast_fp16 = mul(x = inputs_19_cast_fp16, y = inputs_19_cast_fp16)[name = string("inputs_sq_19_cast_fp16")];
+            tensor<int32, [1]> variance_19_axes_0 = const()[name = string("variance_19_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_19_keep_dims_0 = const()[name = string("variance_19_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [16, 1, 1, 1]> variance_19_cast_fp16 = reduce_mean(axes = variance_19_axes_0, keep_dims = variance_19_keep_dims_0, x = inputs_sq_19_cast_fp16)[name = string("variance_19_cast_fp16")];
+            fp16 var_960_to_fp16 = const()[name = string("op_960_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [16, 1, 1, 1]> var_961_cast_fp16 = add(x = variance_19_cast_fp16, y = var_960_to_fp16)[name = string("op_961_cast_fp16")];
+            fp32 var_962_epsilon_0 = const()[name = string("op_962_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [16, 1, 1, 1]> var_962_cast_fp16 = rsqrt(epsilon = var_962_epsilon_0, x = var_961_cast_fp16)[name = string("op_962_cast_fp16")];
+            tensor<fp16, [16, 128, 1, 1]> hidden_states_23_cast_fp16 = mul(x = inputs_19_cast_fp16, y = var_962_cast_fp16)[name = string("hidden_states_23_cast_fp16")];
+            tensor<fp16, [1, 128, 1, 1]> w_19_to_fp16 = const()[name = string("w_19_to_fp16"), val = tensor<fp16, [1, 128, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(37788864)))];
+            tensor<fp16, [16, 128, 1, 1]> query_normed_5_cast_fp16 = mul(x = w_19_to_fp16, y = hidden_states_23_cast_fp16)[name = string("query_normed_5_cast_fp16")];
+            tensor<int32, [4]> var_970 = const()[name = string("op_970"), val = tensor<int32, [4]>([8, 128, 1, 1])];
+            tensor<fp16, [8, 128, 1, 1]> inputs_21_cast_fp16 = reshape(shape = var_970, x = current_key_9_cast_fp16)[name = string("inputs_21_cast_fp16")];
+            tensor<fp16, [8, 128, 1, 1]> inputs_sq_21_cast_fp16 = mul(x = inputs_21_cast_fp16, y = inputs_21_cast_fp16)[name = string("inputs_sq_21_cast_fp16")];
+            tensor<int32, [1]> variance_21_axes_0 = const()[name = string("variance_21_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_21_keep_dims_0 = const()[name = string("variance_21_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [8, 1, 1, 1]> variance_21_cast_fp16 = reduce_mean(axes = variance_21_axes_0, keep_dims = variance_21_keep_dims_0, x = inputs_sq_21_cast_fp16)[name = string("variance_21_cast_fp16")];
+            fp16 var_976_to_fp16 = const()[name = string("op_976_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [8, 1, 1, 1]> var_977_cast_fp16 = add(x = variance_21_cast_fp16, y = var_976_to_fp16)[name = string("op_977_cast_fp16")];
+            fp32 var_978_epsilon_0 = const()[name = string("op_978_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [8, 1, 1, 1]> var_978_cast_fp16 = rsqrt(epsilon = var_978_epsilon_0, x = var_977_cast_fp16)[name = string("op_978_cast_fp16")];
+            tensor<fp16, [8, 128, 1, 1]> hidden_states_25_cast_fp16 = mul(x = inputs_21_cast_fp16, y = var_978_cast_fp16)[name = string("hidden_states_25_cast_fp16")];
+            tensor<fp16, [1, 128, 1, 1]> w_21_to_fp16 = const()[name = string("w_21_to_fp16"), val = tensor<fp16, [1, 128, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(37789184)))];
+            tensor<fp16, [8, 128, 1, 1]> current_key_normed_5_cast_fp16 = mul(x = w_21_to_fp16, y = hidden_states_25_cast_fp16)[name = string("current_key_normed_5_cast_fp16")];
+            tensor<int32, [4]> var_996 = const()[name = string("op_996"), val = tensor<int32, [4]>([1, 16, 128, -1])];
+            tensor<fp16, [1, 16, 128, 1]> mh_q_13_cast_fp16 = reshape(shape = var_996, x = query_normed_5_cast_fp16)[name = string("mh_q_13_cast_fp16")];
+            tensor<int32, [4]> var_998 = const()[name = string("op_998"), val = tensor<int32, [4]>([1, 8, 128, -1])];
+            tensor<fp16, [1, 8, 128, 1]> mh_k_9_cast_fp16 = reshape(shape = var_998, x = current_key_normed_5_cast_fp16)[name = string("mh_k_9_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> var_1002_cast_fp16 = mul(x = mh_q_13_cast_fp16, y = cos_1_cast_fp16)[name = string("op_1002_cast_fp16")];
+            tensor<int32, [4]> var_1007_begin_0 = const()[name = string("op_1007_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_1007_end_0 = const()[name = string("op_1007_end_0"), val = tensor<int32, [4]>([1, 16, 64, 1])];
+            tensor<bool, [4]> var_1007_end_mask_0 = const()[name = string("op_1007_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 16, 64, 1]> var_1007_cast_fp16 = slice_by_index(begin = var_1007_begin_0, end = var_1007_end_0, end_mask = var_1007_end_mask_0, x = mh_q_13_cast_fp16)[name = string("op_1007_cast_fp16")];
+            tensor<int32, [4]> var_1013_begin_0 = const()[name = string("op_1013_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_1013_end_0 = const()[name = string("op_1013_end_0"), val = tensor<int32, [4]>([1, 16, 128, 1])];
+            tensor<bool, [4]> var_1013_end_mask_0 = const()[name = string("op_1013_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 16, 64, 1]> var_1013_cast_fp16 = slice_by_index(begin = var_1013_begin_0, end = var_1013_end_0, end_mask = var_1013_end_mask_0, x = mh_q_13_cast_fp16)[name = string("op_1013_cast_fp16")];
+            fp16 const_63_promoted_to_fp16 = const()[name = string("const_63_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 16, 64, 1]> var_1015_cast_fp16 = mul(x = var_1013_cast_fp16, y = const_63_promoted_to_fp16)[name = string("op_1015_cast_fp16")];
+            bool var_1017_interleave_0 = const()[name = string("op_1017_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 1]> var_1017_cast_fp16 = concat(axis = var_895, interleave = var_1017_interleave_0, values = (var_1015_cast_fp16, var_1007_cast_fp16))[name = string("op_1017_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> var_1018_cast_fp16 = mul(x = var_1017_cast_fp16, y = sin_1_cast_fp16)[name = string("op_1018_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> mh_q_15_cast_fp16 = add(x = var_1002_cast_fp16, y = var_1018_cast_fp16)[name = string("mh_q_15_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> var_1020_cast_fp16 = mul(x = mh_k_9_cast_fp16, y = cos_1_cast_fp16)[name = string("op_1020_cast_fp16")];
+            tensor<int32, [4]> var_1025_begin_0 = const()[name = string("op_1025_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_1025_end_0 = const()[name = string("op_1025_end_0"), val = tensor<int32, [4]>([1, 8, 64, 1])];
+            tensor<bool, [4]> var_1025_end_mask_0 = const()[name = string("op_1025_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 8, 64, 1]> var_1025_cast_fp16 = slice_by_index(begin = var_1025_begin_0, end = var_1025_end_0, end_mask = var_1025_end_mask_0, x = mh_k_9_cast_fp16)[name = string("op_1025_cast_fp16")];
+            tensor<int32, [4]> var_1031_begin_0 = const()[name = string("op_1031_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_1031_end_0 = const()[name = string("op_1031_end_0"), val = tensor<int32, [4]>([1, 8, 128, 1])];
+            tensor<bool, [4]> var_1031_end_mask_0 = const()[name = string("op_1031_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 8, 64, 1]> var_1031_cast_fp16 = slice_by_index(begin = var_1031_begin_0, end = var_1031_end_0, end_mask = var_1031_end_mask_0, x = mh_k_9_cast_fp16)[name = string("op_1031_cast_fp16")];
+            fp16 const_66_promoted_to_fp16 = const()[name = string("const_66_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 64, 1]> var_1033_cast_fp16 = mul(x = var_1031_cast_fp16, y = const_66_promoted_to_fp16)[name = string("op_1033_cast_fp16")];
+            bool var_1035_interleave_0 = const()[name = string("op_1035_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 128, 1]> var_1035_cast_fp16 = concat(axis = var_895, interleave = var_1035_interleave_0, values = (var_1033_cast_fp16, var_1025_cast_fp16))[name = string("op_1035_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> var_1036_cast_fp16 = mul(x = var_1035_cast_fp16, y = sin_1_cast_fp16)[name = string("op_1036_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> mh_k_11_cast_fp16 = add(x = var_1020_cast_fp16, y = var_1036_cast_fp16)[name = string("mh_k_11_cast_fp16")];
+            tensor<int32, [4]> var_1040 = const()[name = string("op_1040"), val = tensor<int32, [4]>([1, 1024, 1, 1])];
+            tensor<fp16, [1, 1024, 1, 1]> current_key_11_cast_fp16 = reshape(shape = var_1040, x = mh_k_11_cast_fp16)[name = string("current_key_11_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 16]> var_1047_cast_fp16 = mul(x = var_96_cast_fp16_2, y = var_272_cast_fp16)[name = string("op_1047_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 16]> var_1048_cast_fp16 = mul(x = current_key_11_cast_fp16, y = var_270_cast_fp16)[name = string("op_1048_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 16]> key_15_cast_fp16 = add(x = var_1047_cast_fp16, y = var_1048_cast_fp16)[name = string("key_15_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 16]> var_1051_cast_fp16 = mul(x = var_104_cast_fp16_2, y = var_272_cast_fp16)[name = string("op_1051_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 16]> var_1052_cast_fp16 = mul(x = current_value_5_cast_fp16, y = var_270_cast_fp16)[name = string("op_1052_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 16]> value_9_cast_fp16 = add(x = var_1051_cast_fp16, y = var_1052_cast_fp16)[name = string("value_9_cast_fp16")];
+            tensor<int32, [4]> var_1056 = const()[name = string("op_1056"), val = tensor<int32, [4]>([1, 8, 128, 16])];
+            tensor<fp16, [1, 8, 128, 16]> key_heads_9_cast_fp16 = reshape(shape = var_1056, x = key_15_cast_fp16)[name = string("key_heads_9_cast_fp16")];
+            tensor<int32, [4]> var_1058 = const()[name = string("op_1058"), val = tensor<int32, [4]>([1, 8, 128, 16])];
+            tensor<fp16, [1, 8, 128, 16]> value_heads_9_cast_fp16 = reshape(shape = var_1058, x = value_9_cast_fp16)[name = string("value_heads_9_cast_fp16")];
+            tensor<int32, [4]> var_1061_begin_0 = const()[name = string("op_1061_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_1061_end_0 = const()[name = string("op_1061_end_0"), val = tensor<int32, [4]>([1, 1, 128, 16])];
+            tensor<bool, [4]> var_1061_end_mask_0 = const()[name = string("op_1061_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_1061_cast_fp16 = slice_by_index(begin = var_1061_begin_0, end = var_1061_end_0, end_mask = var_1061_end_mask_0, x = key_heads_9_cast_fp16)[name = string("op_1061_cast_fp16")];
+            tensor<int32, [4]> var_1065_begin_0 = const()[name = string("op_1065_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_1065_end_0 = const()[name = string("op_1065_end_0"), val = tensor<int32, [4]>([1, 1, 128, 16])];
+            tensor<bool, [4]> var_1065_end_mask_0 = const()[name = string("op_1065_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_1065_cast_fp16 = slice_by_index(begin = var_1065_begin_0, end = var_1065_end_0, end_mask = var_1065_end_mask_0, x = value_heads_9_cast_fp16)[name = string("op_1065_cast_fp16")];
+            tensor<int32, [4]> var_1077_begin_0 = const()[name = string("op_1077_begin_0"), val = tensor<int32, [4]>([0, 1, 0, 0])];
+            tensor<int32, [4]> var_1077_end_0 = const()[name = string("op_1077_end_0"), val = tensor<int32, [4]>([1, 2, 128, 16])];
+            tensor<bool, [4]> var_1077_end_mask_0 = const()[name = string("op_1077_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_1077_cast_fp16 = slice_by_index(begin = var_1077_begin_0, end = var_1077_end_0, end_mask = var_1077_end_mask_0, x = key_heads_9_cast_fp16)[name = string("op_1077_cast_fp16")];
+            tensor<int32, [4]> var_1081_begin_0 = const()[name = string("op_1081_begin_0"), val = tensor<int32, [4]>([0, 1, 0, 0])];
+            tensor<int32, [4]> var_1081_end_0 = const()[name = string("op_1081_end_0"), val = tensor<int32, [4]>([1, 2, 128, 16])];
+            tensor<bool, [4]> var_1081_end_mask_0 = const()[name = string("op_1081_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_1081_cast_fp16 = slice_by_index(begin = var_1081_begin_0, end = var_1081_end_0, end_mask = var_1081_end_mask_0, x = value_heads_9_cast_fp16)[name = string("op_1081_cast_fp16")];
+            tensor<int32, [4]> var_1093_begin_0 = const()[name = string("op_1093_begin_0"), val = tensor<int32, [4]>([0, 2, 0, 0])];
+            tensor<int32, [4]> var_1093_end_0 = const()[name = string("op_1093_end_0"), val = tensor<int32, [4]>([1, 3, 128, 16])];
+            tensor<bool, [4]> var_1093_end_mask_0 = const()[name = string("op_1093_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_1093_cast_fp16 = slice_by_index(begin = var_1093_begin_0, end = var_1093_end_0, end_mask = var_1093_end_mask_0, x = key_heads_9_cast_fp16)[name = string("op_1093_cast_fp16")];
+            tensor<int32, [4]> var_1097_begin_0 = const()[name = string("op_1097_begin_0"), val = tensor<int32, [4]>([0, 2, 0, 0])];
+            tensor<int32, [4]> var_1097_end_0 = const()[name = string("op_1097_end_0"), val = tensor<int32, [4]>([1, 3, 128, 16])];
+            tensor<bool, [4]> var_1097_end_mask_0 = const()[name = string("op_1097_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_1097_cast_fp16 = slice_by_index(begin = var_1097_begin_0, end = var_1097_end_0, end_mask = var_1097_end_mask_0, x = value_heads_9_cast_fp16)[name = string("op_1097_cast_fp16")];
+            tensor<int32, [4]> var_1109_begin_0 = const()[name = string("op_1109_begin_0"), val = tensor<int32, [4]>([0, 3, 0, 0])];
+            tensor<int32, [4]> var_1109_end_0 = const()[name = string("op_1109_end_0"), val = tensor<int32, [4]>([1, 4, 128, 16])];
+            tensor<bool, [4]> var_1109_end_mask_0 = const()[name = string("op_1109_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_1109_cast_fp16 = slice_by_index(begin = var_1109_begin_0, end = var_1109_end_0, end_mask = var_1109_end_mask_0, x = key_heads_9_cast_fp16)[name = string("op_1109_cast_fp16")];
+            tensor<int32, [4]> var_1113_begin_0 = const()[name = string("op_1113_begin_0"), val = tensor<int32, [4]>([0, 3, 0, 0])];
+            tensor<int32, [4]> var_1113_end_0 = const()[name = string("op_1113_end_0"), val = tensor<int32, [4]>([1, 4, 128, 16])];
+            tensor<bool, [4]> var_1113_end_mask_0 = const()[name = string("op_1113_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_1113_cast_fp16 = slice_by_index(begin = var_1113_begin_0, end = var_1113_end_0, end_mask = var_1113_end_mask_0, x = value_heads_9_cast_fp16)[name = string("op_1113_cast_fp16")];
+            tensor<int32, [4]> var_1125_begin_0 = const()[name = string("op_1125_begin_0"), val = tensor<int32, [4]>([0, 4, 0, 0])];
+            tensor<int32, [4]> var_1125_end_0 = const()[name = string("op_1125_end_0"), val = tensor<int32, [4]>([1, 5, 128, 16])];
+            tensor<bool, [4]> var_1125_end_mask_0 = const()[name = string("op_1125_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_1125_cast_fp16 = slice_by_index(begin = var_1125_begin_0, end = var_1125_end_0, end_mask = var_1125_end_mask_0, x = key_heads_9_cast_fp16)[name = string("op_1125_cast_fp16")];
+            tensor<int32, [4]> var_1129_begin_0 = const()[name = string("op_1129_begin_0"), val = tensor<int32, [4]>([0, 4, 0, 0])];
+            tensor<int32, [4]> var_1129_end_0 = const()[name = string("op_1129_end_0"), val = tensor<int32, [4]>([1, 5, 128, 16])];
+            tensor<bool, [4]> var_1129_end_mask_0 = const()[name = string("op_1129_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_1129_cast_fp16 = slice_by_index(begin = var_1129_begin_0, end = var_1129_end_0, end_mask = var_1129_end_mask_0, x = value_heads_9_cast_fp16)[name = string("op_1129_cast_fp16")];
+            tensor<int32, [4]> var_1141_begin_0 = const()[name = string("op_1141_begin_0"), val = tensor<int32, [4]>([0, 5, 0, 0])];
+            tensor<int32, [4]> var_1141_end_0 = const()[name = string("op_1141_end_0"), val = tensor<int32, [4]>([1, 6, 128, 16])];
+            tensor<bool, [4]> var_1141_end_mask_0 = const()[name = string("op_1141_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_1141_cast_fp16 = slice_by_index(begin = var_1141_begin_0, end = var_1141_end_0, end_mask = var_1141_end_mask_0, x = key_heads_9_cast_fp16)[name = string("op_1141_cast_fp16")];
+            tensor<int32, [4]> var_1145_begin_0 = const()[name = string("op_1145_begin_0"), val = tensor<int32, [4]>([0, 5, 0, 0])];
+            tensor<int32, [4]> var_1145_end_0 = const()[name = string("op_1145_end_0"), val = tensor<int32, [4]>([1, 6, 128, 16])];
+            tensor<bool, [4]> var_1145_end_mask_0 = const()[name = string("op_1145_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_1145_cast_fp16 = slice_by_index(begin = var_1145_begin_0, end = var_1145_end_0, end_mask = var_1145_end_mask_0, x = value_heads_9_cast_fp16)[name = string("op_1145_cast_fp16")];
+            tensor<int32, [4]> var_1157_begin_0 = const()[name = string("op_1157_begin_0"), val = tensor<int32, [4]>([0, 6, 0, 0])];
+            tensor<int32, [4]> var_1157_end_0 = const()[name = string("op_1157_end_0"), val = tensor<int32, [4]>([1, 7, 128, 16])];
+            tensor<bool, [4]> var_1157_end_mask_0 = const()[name = string("op_1157_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_1157_cast_fp16 = slice_by_index(begin = var_1157_begin_0, end = var_1157_end_0, end_mask = var_1157_end_mask_0, x = key_heads_9_cast_fp16)[name = string("op_1157_cast_fp16")];
+            tensor<int32, [4]> var_1161_begin_0 = const()[name = string("op_1161_begin_0"), val = tensor<int32, [4]>([0, 6, 0, 0])];
+            tensor<int32, [4]> var_1161_end_0 = const()[name = string("op_1161_end_0"), val = tensor<int32, [4]>([1, 7, 128, 16])];
+            tensor<bool, [4]> var_1161_end_mask_0 = const()[name = string("op_1161_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_1161_cast_fp16 = slice_by_index(begin = var_1161_begin_0, end = var_1161_end_0, end_mask = var_1161_end_mask_0, x = value_heads_9_cast_fp16)[name = string("op_1161_cast_fp16")];
+            tensor<int32, [4]> var_1173_begin_0 = const()[name = string("op_1173_begin_0"), val = tensor<int32, [4]>([0, 7, 0, 0])];
+            tensor<int32, [4]> var_1173_end_0 = const()[name = string("op_1173_end_0"), val = tensor<int32, [4]>([1, 1, 128, 16])];
+            tensor<bool, [4]> var_1173_end_mask_0 = const()[name = string("op_1173_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_1173_cast_fp16 = slice_by_index(begin = var_1173_begin_0, end = var_1173_end_0, end_mask = var_1173_end_mask_0, x = key_heads_9_cast_fp16)[name = string("op_1173_cast_fp16")];
+            tensor<int32, [4]> var_1177_begin_0 = const()[name = string("op_1177_begin_0"), val = tensor<int32, [4]>([0, 7, 0, 0])];
+            tensor<int32, [4]> var_1177_end_0 = const()[name = string("op_1177_end_0"), val = tensor<int32, [4]>([1, 1, 128, 16])];
+            tensor<bool, [4]> var_1177_end_mask_0 = const()[name = string("op_1177_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_1177_cast_fp16 = slice_by_index(begin = var_1177_begin_0, end = var_1177_end_0, end_mask = var_1177_end_mask_0, x = value_heads_9_cast_fp16)[name = string("op_1177_cast_fp16")];
+            bool key_heads_11_interleave_0 = const()[name = string("key_heads_11_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 16]> key_heads_11_cast_fp16 = concat(axis = var_903, interleave = key_heads_11_interleave_0, values = (var_1061_cast_fp16, var_1061_cast_fp16, var_1077_cast_fp16, var_1077_cast_fp16, var_1093_cast_fp16, var_1093_cast_fp16, var_1109_cast_fp16, var_1109_cast_fp16, var_1125_cast_fp16, var_1125_cast_fp16, var_1141_cast_fp16, var_1141_cast_fp16, var_1157_cast_fp16, var_1157_cast_fp16, var_1173_cast_fp16, var_1173_cast_fp16))[name = string("key_heads_11_cast_fp16")];
+            bool value_heads_11_interleave_0 = const()[name = string("value_heads_11_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 16]> value_heads_11_cast_fp16 = concat(axis = var_903, interleave = value_heads_11_interleave_0, values = (var_1065_cast_fp16, var_1065_cast_fp16, var_1081_cast_fp16, var_1081_cast_fp16, var_1097_cast_fp16, var_1097_cast_fp16, var_1113_cast_fp16, var_1113_cast_fp16, var_1129_cast_fp16, var_1129_cast_fp16, var_1145_cast_fp16, var_1145_cast_fp16, var_1161_cast_fp16, var_1161_cast_fp16, var_1177_cast_fp16, var_1177_cast_fp16))[name = string("value_heads_11_cast_fp16")];
+            fp16 var_1200_to_fp16 = const()[name = string("op_1200_to_fp16"), val = fp16(0x1.6ap-4)];
+            tensor<fp16, [1, 16, 128, 1]> var_1201_cast_fp16 = mul(x = mh_q_15_cast_fp16, y = var_1200_to_fp16)[name = string("op_1201_cast_fp16")];
+            bool mh_w_9_transpose_x_0 = const()[name = string("mh_w_9_transpose_x_0"), val = bool(true)];
+            bool mh_w_9_transpose_y_0 = const()[name = string("mh_w_9_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 1, 16]> mh_w_9_cast_fp16 = matmul(transpose_x = mh_w_9_transpose_x_0, transpose_y = mh_w_9_transpose_y_0, x = var_1201_cast_fp16, y = key_heads_11_cast_fp16)[name = string("mh_w_9_cast_fp16")];
+            tensor<fp16, [1, 16, 1, 16]> mh_w_11_cast_fp16 = add(x = mh_w_9_cast_fp16, y = var_436_cast_fp16)[name = string("mh_w_11_cast_fp16")];
+            tensor<fp16, [1, 16, 1, 16]> var_1213_cast_fp16 = softmax(axis = var_885, x = mh_w_11_cast_fp16)[name = string("op_1213_cast_fp16")];
+            bool attn_5_transpose_x_0 = const()[name = string("attn_5_transpose_x_0"), val = bool(false)];
+            bool attn_5_transpose_y_0 = const()[name = string("attn_5_transpose_y_0"), val = bool(true)];
+            tensor<fp16, [1, 16, 128, 1]> attn_5_cast_fp16 = matmul(transpose_x = attn_5_transpose_x_0, transpose_y = attn_5_transpose_y_0, x = value_heads_11_cast_fp16, y = var_1213_cast_fp16)[name = string("attn_5_cast_fp16")];
+            tensor<int32, [4]> var_1218 = const()[name = string("op_1218"), val = tensor<int32, [4]>([1, -1, 1, 1])];
+            tensor<fp16, [1, 2048, 1, 1]> input_17_cast_fp16 = reshape(shape = var_1218, x = attn_5_cast_fp16)[name = string("input_17_cast_fp16")];
+            string obj_27_pad_type_0 = const()[name = string("obj_27_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> obj_27_strides_0 = const()[name = string("obj_27_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> obj_27_pad_0 = const()[name = string("obj_27_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> obj_27_dilations_0 = const()[name = string("obj_27_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 obj_27_groups_0 = const()[name = string("obj_27_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 2048, 1, 1]> layers_2_self_attn_o_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(37789504))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(39886720))))[name = string("layers_2_self_attn_o_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> obj_27_cast_fp16 = conv(bias = layers_0_self_attn_v_proj_bias_to_fp16, dilations = obj_27_dilations_0, groups = obj_27_groups_0, pad = obj_27_pad_0, pad_type = obj_27_pad_type_0, strides = obj_27_strides_0, weight = layers_2_self_attn_o_proj_weight_to_fp16_palettized, x = input_17_cast_fp16)[name = string("obj_27_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> inputs_23_cast_fp16 = add(x = inputs_17_cast_fp16, y = obj_27_cast_fp16)[name = string("inputs_23_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> inputs_sq_23_cast_fp16 = mul(x = inputs_23_cast_fp16, y = inputs_23_cast_fp16)[name = string("inputs_sq_23_cast_fp16")];
+            tensor<int32, [1]> variance_23_axes_0 = const()[name = string("variance_23_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_23_keep_dims_0 = const()[name = string("variance_23_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 1, 1, 1]> variance_23_cast_fp16 = reduce_mean(axes = variance_23_axes_0, keep_dims = variance_23_keep_dims_0, x = inputs_sq_23_cast_fp16)[name = string("variance_23_cast_fp16")];
+            fp16 var_1236_to_fp16 = const()[name = string("op_1236_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 1, 1]> var_1237_cast_fp16 = add(x = variance_23_cast_fp16, y = var_1236_to_fp16)[name = string("op_1237_cast_fp16")];
+            fp32 var_1238_epsilon_0 = const()[name = string("op_1238_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 1, 1, 1]> var_1238_cast_fp16 = rsqrt(epsilon = var_1238_epsilon_0, x = var_1237_cast_fp16)[name = string("op_1238_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> hidden_states_27_cast_fp16 = mul(x = inputs_23_cast_fp16, y = var_1238_cast_fp16)[name = string("hidden_states_27_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> w_23_to_fp16 = const()[name = string("w_23_to_fp16"), val = tensor<fp16, [1, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(39887296)))];
+            tensor<fp16, [1, 1024, 1, 1]> input_19_cast_fp16 = mul(x = w_23_to_fp16, y = hidden_states_27_cast_fp16)[name = string("input_19_cast_fp16")];
+            string input_21_pad_type_0 = const()[name = string("input_21_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> input_21_strides_0 = const()[name = string("input_21_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> input_21_pad_0 = const()[name = string("input_21_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> input_21_dilations_0 = const()[name = string("input_21_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 input_21_groups_0 = const()[name = string("input_21_groups_0"), val = int32(1)];
+            tensor<fp16, [3072, 1024, 1, 1]> layers_2_mlp_gate_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [3072, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(39889408))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(43035200))))[name = string("layers_2_mlp_gate_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 3072, 1, 1]> input_21_cast_fp16 = conv(dilations = input_21_dilations_0, groups = input_21_groups_0, pad = input_21_pad_0, pad_type = input_21_pad_type_0, strides = input_21_strides_0, weight = layers_2_mlp_gate_proj_weight_to_fp16_palettized, x = input_19_cast_fp16)[name = string("input_21_cast_fp16")];
+            tensor<fp16, [1, 3072, 1, 1]> var_1252_cast_fp16 = silu(x = input_21_cast_fp16)[name = string("op_1252_cast_fp16")];
+            string var_1258_pad_type_0 = const()[name = string("op_1258_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_1258_strides_0 = const()[name = string("op_1258_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_1258_pad_0 = const()[name = string("op_1258_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_1258_dilations_0 = const()[name = string("op_1258_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_1258_groups_0 = const()[name = string("op_1258_groups_0"), val = int32(1)];
+            tensor<fp16, [3072, 1024, 1, 1]> layers_2_mlp_up_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [3072, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(43035776))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(46181568))))[name = string("layers_2_mlp_up_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 3072, 1, 1]> var_1258_cast_fp16 = conv(dilations = var_1258_dilations_0, groups = var_1258_groups_0, pad = var_1258_pad_0, pad_type = var_1258_pad_type_0, strides = var_1258_strides_0, weight = layers_2_mlp_up_proj_weight_to_fp16_palettized, x = input_19_cast_fp16)[name = string("op_1258_cast_fp16")];
+            tensor<fp16, [1, 3072, 1, 1]> input_23_cast_fp16 = mul(x = var_1252_cast_fp16, y = var_1258_cast_fp16)[name = string("input_23_cast_fp16")];
+            string hidden_states_29_pad_type_0 = const()[name = string("hidden_states_29_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> hidden_states_29_strides_0 = const()[name = string("hidden_states_29_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> hidden_states_29_pad_0 = const()[name = string("hidden_states_29_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> hidden_states_29_dilations_0 = const()[name = string("hidden_states_29_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 hidden_states_29_groups_0 = const()[name = string("hidden_states_29_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 3072, 1, 1]> layers_2_mlp_down_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 3072, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(46182144))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(49327936))))[name = string("layers_2_mlp_down_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> hidden_states_29_cast_fp16 = conv(dilations = hidden_states_29_dilations_0, groups = hidden_states_29_groups_0, pad = hidden_states_29_pad_0, pad_type = hidden_states_29_pad_type_0, strides = hidden_states_29_strides_0, weight = layers_2_mlp_down_proj_weight_to_fp16_palettized, x = input_23_cast_fp16)[name = string("hidden_states_29_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> inputs_25_cast_fp16 = add(x = inputs_23_cast_fp16, y = hidden_states_29_cast_fp16)[name = string("inputs_25_cast_fp16")];
+            int32 var_1272 = const()[name = string("op_1272"), val = int32(3)];
+            int32 var_1282 = const()[name = string("op_1282"), val = int32(-2)];
+            int32 var_1290 = const()[name = string("op_1290"), val = int32(1)];
+            tensor<fp16, [1, 1024, 1, 1]> inputs_sq_25_cast_fp16 = mul(x = inputs_25_cast_fp16, y = inputs_25_cast_fp16)[name = string("inputs_sq_25_cast_fp16")];
+            tensor<int32, [1]> variance_25_axes_0 = const()[name = string("variance_25_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_25_keep_dims_0 = const()[name = string("variance_25_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 1, 1, 1]> variance_25_cast_fp16 = reduce_mean(axes = variance_25_axes_0, keep_dims = variance_25_keep_dims_0, x = inputs_sq_25_cast_fp16)[name = string("variance_25_cast_fp16")];
+            fp16 var_1302_to_fp16 = const()[name = string("op_1302_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 1, 1]> var_1303_cast_fp16 = add(x = variance_25_cast_fp16, y = var_1302_to_fp16)[name = string("op_1303_cast_fp16")];
+            fp32 var_1304_epsilon_0 = const()[name = string("op_1304_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 1, 1, 1]> var_1304_cast_fp16 = rsqrt(epsilon = var_1304_epsilon_0, x = var_1303_cast_fp16)[name = string("op_1304_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> hidden_states_31_cast_fp16 = mul(x = inputs_25_cast_fp16, y = var_1304_cast_fp16)[name = string("hidden_states_31_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> w_25_to_fp16 = const()[name = string("w_25_to_fp16"), val = tensor<fp16, [1, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(49328512)))];
+            tensor<fp16, [1, 1024, 1, 1]> obj_29_cast_fp16 = mul(x = w_25_to_fp16, y = hidden_states_31_cast_fp16)[name = string("obj_29_cast_fp16")];
+            string query_19_pad_type_0 = const()[name = string("query_19_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> query_19_strides_0 = const()[name = string("query_19_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> query_19_pad_0 = const()[name = string("query_19_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> query_19_dilations_0 = const()[name = string("query_19_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 query_19_groups_0 = const()[name = string("query_19_groups_0"), val = int32(1)];
+            tensor<fp16, [2048, 1024, 1, 1]> layers_3_self_attn_q_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [2048, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(49330624))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(51427840))))[name = string("layers_3_self_attn_q_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1, 1]> query_19_cast_fp16 = conv(bias = layers_0_self_attn_q_proj_bias_to_fp16, dilations = query_19_dilations_0, groups = query_19_groups_0, pad = query_19_pad_0, pad_type = query_19_pad_type_0, strides = query_19_strides_0, weight = layers_3_self_attn_q_proj_weight_to_fp16_palettized, x = obj_29_cast_fp16)[name = string("query_19_cast_fp16")];
+            string current_key_13_pad_type_0 = const()[name = string("current_key_13_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> current_key_13_strides_0 = const()[name = string("current_key_13_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> current_key_13_pad_0 = const()[name = string("current_key_13_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> current_key_13_dilations_0 = const()[name = string("current_key_13_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 current_key_13_groups_0 = const()[name = string("current_key_13_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_3_self_attn_k_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(51428416))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(52477056))))[name = string("layers_3_self_attn_k_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> current_key_13_cast_fp16 = conv(dilations = current_key_13_dilations_0, groups = current_key_13_groups_0, pad = current_key_13_pad_0, pad_type = current_key_13_pad_type_0, strides = current_key_13_strides_0, weight = layers_3_self_attn_k_proj_weight_to_fp16_palettized, x = obj_29_cast_fp16)[name = string("current_key_13_cast_fp16")];
+            string current_value_7_pad_type_0 = const()[name = string("current_value_7_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> current_value_7_strides_0 = const()[name = string("current_value_7_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> current_value_7_pad_0 = const()[name = string("current_value_7_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> current_value_7_dilations_0 = const()[name = string("current_value_7_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 current_value_7_groups_0 = const()[name = string("current_value_7_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_3_self_attn_v_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(52477632))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(53526272))))[name = string("layers_3_self_attn_v_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> current_value_7_cast_fp16 = conv(bias = layers_0_self_attn_v_proj_bias_to_fp16, dilations = current_value_7_dilations_0, groups = current_value_7_groups_0, pad = current_value_7_pad_0, pad_type = current_value_7_pad_type_0, strides = current_value_7_strides_0, weight = layers_3_self_attn_v_proj_weight_to_fp16_palettized, x = obj_29_cast_fp16)[name = string("current_value_7_cast_fp16")];
+            tensor<int32, [4]> var_1341 = const()[name = string("op_1341"), val = tensor<int32, [4]>([16, 128, 1, 1])];
+            tensor<fp16, [16, 128, 1, 1]> inputs_27_cast_fp16 = reshape(shape = var_1341, x = query_19_cast_fp16)[name = string("inputs_27_cast_fp16")];
+            tensor<fp16, [16, 128, 1, 1]> inputs_sq_27_cast_fp16 = mul(x = inputs_27_cast_fp16, y = inputs_27_cast_fp16)[name = string("inputs_sq_27_cast_fp16")];
+            tensor<int32, [1]> variance_27_axes_0 = const()[name = string("variance_27_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_27_keep_dims_0 = const()[name = string("variance_27_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [16, 1, 1, 1]> variance_27_cast_fp16 = reduce_mean(axes = variance_27_axes_0, keep_dims = variance_27_keep_dims_0, x = inputs_sq_27_cast_fp16)[name = string("variance_27_cast_fp16")];
+            fp16 var_1347_to_fp16 = const()[name = string("op_1347_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [16, 1, 1, 1]> var_1348_cast_fp16 = add(x = variance_27_cast_fp16, y = var_1347_to_fp16)[name = string("op_1348_cast_fp16")];
+            fp32 var_1349_epsilon_0 = const()[name = string("op_1349_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [16, 1, 1, 1]> var_1349_cast_fp16 = rsqrt(epsilon = var_1349_epsilon_0, x = var_1348_cast_fp16)[name = string("op_1349_cast_fp16")];
+            tensor<fp16, [16, 128, 1, 1]> hidden_states_33_cast_fp16 = mul(x = inputs_27_cast_fp16, y = var_1349_cast_fp16)[name = string("hidden_states_33_cast_fp16")];
+            tensor<fp16, [1, 128, 1, 1]> w_27_to_fp16 = const()[name = string("w_27_to_fp16"), val = tensor<fp16, [1, 128, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(53526848)))];
+            tensor<fp16, [16, 128, 1, 1]> query_normed_7_cast_fp16 = mul(x = w_27_to_fp16, y = hidden_states_33_cast_fp16)[name = string("query_normed_7_cast_fp16")];
+            tensor<int32, [4]> var_1357 = const()[name = string("op_1357"), val = tensor<int32, [4]>([8, 128, 1, 1])];
+            tensor<fp16, [8, 128, 1, 1]> inputs_29_cast_fp16 = reshape(shape = var_1357, x = current_key_13_cast_fp16)[name = string("inputs_29_cast_fp16")];
+            tensor<fp16, [8, 128, 1, 1]> inputs_sq_29_cast_fp16 = mul(x = inputs_29_cast_fp16, y = inputs_29_cast_fp16)[name = string("inputs_sq_29_cast_fp16")];
+            tensor<int32, [1]> variance_29_axes_0 = const()[name = string("variance_29_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_29_keep_dims_0 = const()[name = string("variance_29_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [8, 1, 1, 1]> variance_29_cast_fp16 = reduce_mean(axes = variance_29_axes_0, keep_dims = variance_29_keep_dims_0, x = inputs_sq_29_cast_fp16)[name = string("variance_29_cast_fp16")];
+            fp16 var_1363_to_fp16 = const()[name = string("op_1363_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [8, 1, 1, 1]> var_1364_cast_fp16 = add(x = variance_29_cast_fp16, y = var_1363_to_fp16)[name = string("op_1364_cast_fp16")];
+            fp32 var_1365_epsilon_0 = const()[name = string("op_1365_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [8, 1, 1, 1]> var_1365_cast_fp16 = rsqrt(epsilon = var_1365_epsilon_0, x = var_1364_cast_fp16)[name = string("op_1365_cast_fp16")];
+            tensor<fp16, [8, 128, 1, 1]> hidden_states_35_cast_fp16 = mul(x = inputs_29_cast_fp16, y = var_1365_cast_fp16)[name = string("hidden_states_35_cast_fp16")];
+            tensor<fp16, [1, 128, 1, 1]> w_29_to_fp16 = const()[name = string("w_29_to_fp16"), val = tensor<fp16, [1, 128, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(53527168)))];
+            tensor<fp16, [8, 128, 1, 1]> current_key_normed_7_cast_fp16 = mul(x = w_29_to_fp16, y = hidden_states_35_cast_fp16)[name = string("current_key_normed_7_cast_fp16")];
+            tensor<int32, [4]> var_1383 = const()[name = string("op_1383"), val = tensor<int32, [4]>([1, 16, 128, -1])];
+            tensor<fp16, [1, 16, 128, 1]> mh_q_19_cast_fp16 = reshape(shape = var_1383, x = query_normed_7_cast_fp16)[name = string("mh_q_19_cast_fp16")];
+            tensor<int32, [4]> var_1385 = const()[name = string("op_1385"), val = tensor<int32, [4]>([1, 8, 128, -1])];
+            tensor<fp16, [1, 8, 128, 1]> mh_k_13_cast_fp16 = reshape(shape = var_1385, x = current_key_normed_7_cast_fp16)[name = string("mh_k_13_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> var_1389_cast_fp16 = mul(x = mh_q_19_cast_fp16, y = cos_1_cast_fp16)[name = string("op_1389_cast_fp16")];
+            tensor<int32, [4]> var_1394_begin_0 = const()[name = string("op_1394_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_1394_end_0 = const()[name = string("op_1394_end_0"), val = tensor<int32, [4]>([1, 16, 64, 1])];
+            tensor<bool, [4]> var_1394_end_mask_0 = const()[name = string("op_1394_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 16, 64, 1]> var_1394_cast_fp16 = slice_by_index(begin = var_1394_begin_0, end = var_1394_end_0, end_mask = var_1394_end_mask_0, x = mh_q_19_cast_fp16)[name = string("op_1394_cast_fp16")];
+            tensor<int32, [4]> var_1400_begin_0 = const()[name = string("op_1400_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_1400_end_0 = const()[name = string("op_1400_end_0"), val = tensor<int32, [4]>([1, 16, 128, 1])];
+            tensor<bool, [4]> var_1400_end_mask_0 = const()[name = string("op_1400_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 16, 64, 1]> var_1400_cast_fp16 = slice_by_index(begin = var_1400_begin_0, end = var_1400_end_0, end_mask = var_1400_end_mask_0, x = mh_q_19_cast_fp16)[name = string("op_1400_cast_fp16")];
+            fp16 const_86_promoted_to_fp16 = const()[name = string("const_86_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 16, 64, 1]> var_1402_cast_fp16 = mul(x = var_1400_cast_fp16, y = const_86_promoted_to_fp16)[name = string("op_1402_cast_fp16")];
+            bool var_1404_interleave_0 = const()[name = string("op_1404_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 1]> var_1404_cast_fp16 = concat(axis = var_1282, interleave = var_1404_interleave_0, values = (var_1402_cast_fp16, var_1394_cast_fp16))[name = string("op_1404_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> var_1405_cast_fp16 = mul(x = var_1404_cast_fp16, y = sin_1_cast_fp16)[name = string("op_1405_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> mh_q_21_cast_fp16 = add(x = var_1389_cast_fp16, y = var_1405_cast_fp16)[name = string("mh_q_21_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> var_1407_cast_fp16 = mul(x = mh_k_13_cast_fp16, y = cos_1_cast_fp16)[name = string("op_1407_cast_fp16")];
+            tensor<int32, [4]> var_1412_begin_0 = const()[name = string("op_1412_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_1412_end_0 = const()[name = string("op_1412_end_0"), val = tensor<int32, [4]>([1, 8, 64, 1])];
+            tensor<bool, [4]> var_1412_end_mask_0 = const()[name = string("op_1412_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 8, 64, 1]> var_1412_cast_fp16 = slice_by_index(begin = var_1412_begin_0, end = var_1412_end_0, end_mask = var_1412_end_mask_0, x = mh_k_13_cast_fp16)[name = string("op_1412_cast_fp16")];
+            tensor<int32, [4]> var_1418_begin_0 = const()[name = string("op_1418_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_1418_end_0 = const()[name = string("op_1418_end_0"), val = tensor<int32, [4]>([1, 8, 128, 1])];
+            tensor<bool, [4]> var_1418_end_mask_0 = const()[name = string("op_1418_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 8, 64, 1]> var_1418_cast_fp16 = slice_by_index(begin = var_1418_begin_0, end = var_1418_end_0, end_mask = var_1418_end_mask_0, x = mh_k_13_cast_fp16)[name = string("op_1418_cast_fp16")];
+            fp16 const_89_promoted_to_fp16 = const()[name = string("const_89_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 64, 1]> var_1420_cast_fp16 = mul(x = var_1418_cast_fp16, y = const_89_promoted_to_fp16)[name = string("op_1420_cast_fp16")];
+            bool var_1422_interleave_0 = const()[name = string("op_1422_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 128, 1]> var_1422_cast_fp16 = concat(axis = var_1282, interleave = var_1422_interleave_0, values = (var_1420_cast_fp16, var_1412_cast_fp16))[name = string("op_1422_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> var_1423_cast_fp16 = mul(x = var_1422_cast_fp16, y = sin_1_cast_fp16)[name = string("op_1423_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> mh_k_15_cast_fp16 = add(x = var_1407_cast_fp16, y = var_1423_cast_fp16)[name = string("mh_k_15_cast_fp16")];
+            tensor<int32, [4]> var_1427 = const()[name = string("op_1427"), val = tensor<int32, [4]>([1, 1024, 1, 1])];
+            tensor<fp16, [1, 1024, 1, 1]> current_key_15_cast_fp16 = reshape(shape = var_1427, x = mh_k_15_cast_fp16)[name = string("current_key_15_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 16]> var_1434_cast_fp16 = mul(x = var_96_cast_fp16_3, y = var_272_cast_fp16)[name = string("op_1434_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 16]> var_1435_cast_fp16 = mul(x = current_key_15_cast_fp16, y = var_270_cast_fp16)[name = string("op_1435_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 16]> key_21_cast_fp16 = add(x = var_1434_cast_fp16, y = var_1435_cast_fp16)[name = string("key_21_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 16]> var_1438_cast_fp16 = mul(x = var_104_cast_fp16_3, y = var_272_cast_fp16)[name = string("op_1438_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 16]> var_1439_cast_fp16 = mul(x = current_value_7_cast_fp16, y = var_270_cast_fp16)[name = string("op_1439_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 16]> value_13_cast_fp16 = add(x = var_1438_cast_fp16, y = var_1439_cast_fp16)[name = string("value_13_cast_fp16")];
+            tensor<int32, [4]> var_1443 = const()[name = string("op_1443"), val = tensor<int32, [4]>([1, 8, 128, 16])];
+            tensor<fp16, [1, 8, 128, 16]> key_heads_13_cast_fp16 = reshape(shape = var_1443, x = key_21_cast_fp16)[name = string("key_heads_13_cast_fp16")];
+            tensor<int32, [4]> var_1445 = const()[name = string("op_1445"), val = tensor<int32, [4]>([1, 8, 128, 16])];
+            tensor<fp16, [1, 8, 128, 16]> value_heads_13_cast_fp16 = reshape(shape = var_1445, x = value_13_cast_fp16)[name = string("value_heads_13_cast_fp16")];
+            tensor<int32, [4]> var_1448_begin_0 = const()[name = string("op_1448_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_1448_end_0 = const()[name = string("op_1448_end_0"), val = tensor<int32, [4]>([1, 1, 128, 16])];
+            tensor<bool, [4]> var_1448_end_mask_0 = const()[name = string("op_1448_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_1448_cast_fp16 = slice_by_index(begin = var_1448_begin_0, end = var_1448_end_0, end_mask = var_1448_end_mask_0, x = key_heads_13_cast_fp16)[name = string("op_1448_cast_fp16")];
+            tensor<int32, [4]> var_1452_begin_0 = const()[name = string("op_1452_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_1452_end_0 = const()[name = string("op_1452_end_0"), val = tensor<int32, [4]>([1, 1, 128, 16])];
+            tensor<bool, [4]> var_1452_end_mask_0 = const()[name = string("op_1452_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_1452_cast_fp16 = slice_by_index(begin = var_1452_begin_0, end = var_1452_end_0, end_mask = var_1452_end_mask_0, x = value_heads_13_cast_fp16)[name = string("op_1452_cast_fp16")];
+            tensor<int32, [4]> var_1464_begin_0 = const()[name = string("op_1464_begin_0"), val = tensor<int32, [4]>([0, 1, 0, 0])];
+            tensor<int32, [4]> var_1464_end_0 = const()[name = string("op_1464_end_0"), val = tensor<int32, [4]>([1, 2, 128, 16])];
+            tensor<bool, [4]> var_1464_end_mask_0 = const()[name = string("op_1464_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_1464_cast_fp16 = slice_by_index(begin = var_1464_begin_0, end = var_1464_end_0, end_mask = var_1464_end_mask_0, x = key_heads_13_cast_fp16)[name = string("op_1464_cast_fp16")];
+            tensor<int32, [4]> var_1468_begin_0 = const()[name = string("op_1468_begin_0"), val = tensor<int32, [4]>([0, 1, 0, 0])];
+            tensor<int32, [4]> var_1468_end_0 = const()[name = string("op_1468_end_0"), val = tensor<int32, [4]>([1, 2, 128, 16])];
+            tensor<bool, [4]> var_1468_end_mask_0 = const()[name = string("op_1468_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_1468_cast_fp16 = slice_by_index(begin = var_1468_begin_0, end = var_1468_end_0, end_mask = var_1468_end_mask_0, x = value_heads_13_cast_fp16)[name = string("op_1468_cast_fp16")];
+            tensor<int32, [4]> var_1480_begin_0 = const()[name = string("op_1480_begin_0"), val = tensor<int32, [4]>([0, 2, 0, 0])];
+            tensor<int32, [4]> var_1480_end_0 = const()[name = string("op_1480_end_0"), val = tensor<int32, [4]>([1, 3, 128, 16])];
+            tensor<bool, [4]> var_1480_end_mask_0 = const()[name = string("op_1480_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_1480_cast_fp16 = slice_by_index(begin = var_1480_begin_0, end = var_1480_end_0, end_mask = var_1480_end_mask_0, x = key_heads_13_cast_fp16)[name = string("op_1480_cast_fp16")];
+            tensor<int32, [4]> var_1484_begin_0 = const()[name = string("op_1484_begin_0"), val = tensor<int32, [4]>([0, 2, 0, 0])];
+            tensor<int32, [4]> var_1484_end_0 = const()[name = string("op_1484_end_0"), val = tensor<int32, [4]>([1, 3, 128, 16])];
+            tensor<bool, [4]> var_1484_end_mask_0 = const()[name = string("op_1484_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_1484_cast_fp16 = slice_by_index(begin = var_1484_begin_0, end = var_1484_end_0, end_mask = var_1484_end_mask_0, x = value_heads_13_cast_fp16)[name = string("op_1484_cast_fp16")];
+            tensor<int32, [4]> var_1496_begin_0 = const()[name = string("op_1496_begin_0"), val = tensor<int32, [4]>([0, 3, 0, 0])];
+            tensor<int32, [4]> var_1496_end_0 = const()[name = string("op_1496_end_0"), val = tensor<int32, [4]>([1, 4, 128, 16])];
+            tensor<bool, [4]> var_1496_end_mask_0 = const()[name = string("op_1496_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_1496_cast_fp16 = slice_by_index(begin = var_1496_begin_0, end = var_1496_end_0, end_mask = var_1496_end_mask_0, x = key_heads_13_cast_fp16)[name = string("op_1496_cast_fp16")];
+            tensor<int32, [4]> var_1500_begin_0 = const()[name = string("op_1500_begin_0"), val = tensor<int32, [4]>([0, 3, 0, 0])];
+            tensor<int32, [4]> var_1500_end_0 = const()[name = string("op_1500_end_0"), val = tensor<int32, [4]>([1, 4, 128, 16])];
+            tensor<bool, [4]> var_1500_end_mask_0 = const()[name = string("op_1500_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_1500_cast_fp16 = slice_by_index(begin = var_1500_begin_0, end = var_1500_end_0, end_mask = var_1500_end_mask_0, x = value_heads_13_cast_fp16)[name = string("op_1500_cast_fp16")];
+            tensor<int32, [4]> var_1512_begin_0 = const()[name = string("op_1512_begin_0"), val = tensor<int32, [4]>([0, 4, 0, 0])];
+            tensor<int32, [4]> var_1512_end_0 = const()[name = string("op_1512_end_0"), val = tensor<int32, [4]>([1, 5, 128, 16])];
+            tensor<bool, [4]> var_1512_end_mask_0 = const()[name = string("op_1512_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_1512_cast_fp16 = slice_by_index(begin = var_1512_begin_0, end = var_1512_end_0, end_mask = var_1512_end_mask_0, x = key_heads_13_cast_fp16)[name = string("op_1512_cast_fp16")];
+            tensor<int32, [4]> var_1516_begin_0 = const()[name = string("op_1516_begin_0"), val = tensor<int32, [4]>([0, 4, 0, 0])];
+            tensor<int32, [4]> var_1516_end_0 = const()[name = string("op_1516_end_0"), val = tensor<int32, [4]>([1, 5, 128, 16])];
+            tensor<bool, [4]> var_1516_end_mask_0 = const()[name = string("op_1516_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_1516_cast_fp16 = slice_by_index(begin = var_1516_begin_0, end = var_1516_end_0, end_mask = var_1516_end_mask_0, x = value_heads_13_cast_fp16)[name = string("op_1516_cast_fp16")];
+            tensor<int32, [4]> var_1528_begin_0 = const()[name = string("op_1528_begin_0"), val = tensor<int32, [4]>([0, 5, 0, 0])];
+            tensor<int32, [4]> var_1528_end_0 = const()[name = string("op_1528_end_0"), val = tensor<int32, [4]>([1, 6, 128, 16])];
+            tensor<bool, [4]> var_1528_end_mask_0 = const()[name = string("op_1528_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_1528_cast_fp16 = slice_by_index(begin = var_1528_begin_0, end = var_1528_end_0, end_mask = var_1528_end_mask_0, x = key_heads_13_cast_fp16)[name = string("op_1528_cast_fp16")];
+            tensor<int32, [4]> var_1532_begin_0 = const()[name = string("op_1532_begin_0"), val = tensor<int32, [4]>([0, 5, 0, 0])];
+            tensor<int32, [4]> var_1532_end_0 = const()[name = string("op_1532_end_0"), val = tensor<int32, [4]>([1, 6, 128, 16])];
+            tensor<bool, [4]> var_1532_end_mask_0 = const()[name = string("op_1532_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_1532_cast_fp16 = slice_by_index(begin = var_1532_begin_0, end = var_1532_end_0, end_mask = var_1532_end_mask_0, x = value_heads_13_cast_fp16)[name = string("op_1532_cast_fp16")];
+            tensor<int32, [4]> var_1544_begin_0 = const()[name = string("op_1544_begin_0"), val = tensor<int32, [4]>([0, 6, 0, 0])];
+            tensor<int32, [4]> var_1544_end_0 = const()[name = string("op_1544_end_0"), val = tensor<int32, [4]>([1, 7, 128, 16])];
+            tensor<bool, [4]> var_1544_end_mask_0 = const()[name = string("op_1544_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_1544_cast_fp16 = slice_by_index(begin = var_1544_begin_0, end = var_1544_end_0, end_mask = var_1544_end_mask_0, x = key_heads_13_cast_fp16)[name = string("op_1544_cast_fp16")];
+            tensor<int32, [4]> var_1548_begin_0 = const()[name = string("op_1548_begin_0"), val = tensor<int32, [4]>([0, 6, 0, 0])];
+            tensor<int32, [4]> var_1548_end_0 = const()[name = string("op_1548_end_0"), val = tensor<int32, [4]>([1, 7, 128, 16])];
+            tensor<bool, [4]> var_1548_end_mask_0 = const()[name = string("op_1548_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_1548_cast_fp16 = slice_by_index(begin = var_1548_begin_0, end = var_1548_end_0, end_mask = var_1548_end_mask_0, x = value_heads_13_cast_fp16)[name = string("op_1548_cast_fp16")];
+            tensor<int32, [4]> var_1560_begin_0 = const()[name = string("op_1560_begin_0"), val = tensor<int32, [4]>([0, 7, 0, 0])];
+            tensor<int32, [4]> var_1560_end_0 = const()[name = string("op_1560_end_0"), val = tensor<int32, [4]>([1, 1, 128, 16])];
+            tensor<bool, [4]> var_1560_end_mask_0 = const()[name = string("op_1560_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_1560_cast_fp16 = slice_by_index(begin = var_1560_begin_0, end = var_1560_end_0, end_mask = var_1560_end_mask_0, x = key_heads_13_cast_fp16)[name = string("op_1560_cast_fp16")];
+            tensor<int32, [4]> var_1564_begin_0 = const()[name = string("op_1564_begin_0"), val = tensor<int32, [4]>([0, 7, 0, 0])];
+            tensor<int32, [4]> var_1564_end_0 = const()[name = string("op_1564_end_0"), val = tensor<int32, [4]>([1, 1, 128, 16])];
+            tensor<bool, [4]> var_1564_end_mask_0 = const()[name = string("op_1564_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_1564_cast_fp16 = slice_by_index(begin = var_1564_begin_0, end = var_1564_end_0, end_mask = var_1564_end_mask_0, x = value_heads_13_cast_fp16)[name = string("op_1564_cast_fp16")];
+            bool key_heads_15_interleave_0 = const()[name = string("key_heads_15_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 16]> key_heads_15_cast_fp16 = concat(axis = var_1290, interleave = key_heads_15_interleave_0, values = (var_1448_cast_fp16, var_1448_cast_fp16, var_1464_cast_fp16, var_1464_cast_fp16, var_1480_cast_fp16, var_1480_cast_fp16, var_1496_cast_fp16, var_1496_cast_fp16, var_1512_cast_fp16, var_1512_cast_fp16, var_1528_cast_fp16, var_1528_cast_fp16, var_1544_cast_fp16, var_1544_cast_fp16, var_1560_cast_fp16, var_1560_cast_fp16))[name = string("key_heads_15_cast_fp16")];
+            bool value_heads_15_interleave_0 = const()[name = string("value_heads_15_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 16]> value_heads_15_cast_fp16 = concat(axis = var_1290, interleave = value_heads_15_interleave_0, values = (var_1452_cast_fp16, var_1452_cast_fp16, var_1468_cast_fp16, var_1468_cast_fp16, var_1484_cast_fp16, var_1484_cast_fp16, var_1500_cast_fp16, var_1500_cast_fp16, var_1516_cast_fp16, var_1516_cast_fp16, var_1532_cast_fp16, var_1532_cast_fp16, var_1548_cast_fp16, var_1548_cast_fp16, var_1564_cast_fp16, var_1564_cast_fp16))[name = string("value_heads_15_cast_fp16")];
+            fp16 var_1587_to_fp16 = const()[name = string("op_1587_to_fp16"), val = fp16(0x1.6ap-4)];
+            tensor<fp16, [1, 16, 128, 1]> var_1588_cast_fp16 = mul(x = mh_q_21_cast_fp16, y = var_1587_to_fp16)[name = string("op_1588_cast_fp16")];
+            bool mh_w_13_transpose_x_0 = const()[name = string("mh_w_13_transpose_x_0"), val = bool(true)];
+            bool mh_w_13_transpose_y_0 = const()[name = string("mh_w_13_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 1, 16]> mh_w_13_cast_fp16 = matmul(transpose_x = mh_w_13_transpose_x_0, transpose_y = mh_w_13_transpose_y_0, x = var_1588_cast_fp16, y = key_heads_15_cast_fp16)[name = string("mh_w_13_cast_fp16")];
+            tensor<fp16, [1, 16, 1, 16]> mh_w_15_cast_fp16 = add(x = mh_w_13_cast_fp16, y = var_436_cast_fp16)[name = string("mh_w_15_cast_fp16")];
+            tensor<fp16, [1, 16, 1, 16]> var_1600_cast_fp16 = softmax(axis = var_1272, x = mh_w_15_cast_fp16)[name = string("op_1600_cast_fp16")];
+            bool attn_7_transpose_x_0 = const()[name = string("attn_7_transpose_x_0"), val = bool(false)];
+            bool attn_7_transpose_y_0 = const()[name = string("attn_7_transpose_y_0"), val = bool(true)];
+            tensor<fp16, [1, 16, 128, 1]> attn_7_cast_fp16 = matmul(transpose_x = attn_7_transpose_x_0, transpose_y = attn_7_transpose_y_0, x = value_heads_15_cast_fp16, y = var_1600_cast_fp16)[name = string("attn_7_cast_fp16")];
+            tensor<int32, [4]> var_1605 = const()[name = string("op_1605"), val = tensor<int32, [4]>([1, -1, 1, 1])];
+            tensor<fp16, [1, 2048, 1, 1]> input_25_cast_fp16 = reshape(shape = var_1605, x = attn_7_cast_fp16)[name = string("input_25_cast_fp16")];
+            string obj_35_pad_type_0 = const()[name = string("obj_35_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> obj_35_strides_0 = const()[name = string("obj_35_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> obj_35_pad_0 = const()[name = string("obj_35_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> obj_35_dilations_0 = const()[name = string("obj_35_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 obj_35_groups_0 = const()[name = string("obj_35_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 2048, 1, 1]> layers_3_self_attn_o_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(53527488))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(55624704))))[name = string("layers_3_self_attn_o_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> obj_35_cast_fp16 = conv(bias = layers_0_self_attn_v_proj_bias_to_fp16, dilations = obj_35_dilations_0, groups = obj_35_groups_0, pad = obj_35_pad_0, pad_type = obj_35_pad_type_0, strides = obj_35_strides_0, weight = layers_3_self_attn_o_proj_weight_to_fp16_palettized, x = input_25_cast_fp16)[name = string("obj_35_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> inputs_31_cast_fp16 = add(x = inputs_25_cast_fp16, y = obj_35_cast_fp16)[name = string("inputs_31_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> inputs_sq_31_cast_fp16 = mul(x = inputs_31_cast_fp16, y = inputs_31_cast_fp16)[name = string("inputs_sq_31_cast_fp16")];
+            tensor<int32, [1]> variance_31_axes_0 = const()[name = string("variance_31_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_31_keep_dims_0 = const()[name = string("variance_31_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 1, 1, 1]> variance_31_cast_fp16 = reduce_mean(axes = variance_31_axes_0, keep_dims = variance_31_keep_dims_0, x = inputs_sq_31_cast_fp16)[name = string("variance_31_cast_fp16")];
+            fp16 var_1623_to_fp16 = const()[name = string("op_1623_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 1, 1]> var_1624_cast_fp16 = add(x = variance_31_cast_fp16, y = var_1623_to_fp16)[name = string("op_1624_cast_fp16")];
+            fp32 var_1625_epsilon_0 = const()[name = string("op_1625_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 1, 1, 1]> var_1625_cast_fp16 = rsqrt(epsilon = var_1625_epsilon_0, x = var_1624_cast_fp16)[name = string("op_1625_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> hidden_states_37_cast_fp16 = mul(x = inputs_31_cast_fp16, y = var_1625_cast_fp16)[name = string("hidden_states_37_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> w_31_to_fp16 = const()[name = string("w_31_to_fp16"), val = tensor<fp16, [1, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(55625280)))];
+            tensor<fp16, [1, 1024, 1, 1]> input_27_cast_fp16 = mul(x = w_31_to_fp16, y = hidden_states_37_cast_fp16)[name = string("input_27_cast_fp16")];
+            string input_29_pad_type_0 = const()[name = string("input_29_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> input_29_strides_0 = const()[name = string("input_29_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> input_29_pad_0 = const()[name = string("input_29_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> input_29_dilations_0 = const()[name = string("input_29_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 input_29_groups_0 = const()[name = string("input_29_groups_0"), val = int32(1)];
+            tensor<fp16, [3072, 1024, 1, 1]> layers_3_mlp_gate_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [3072, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(55627392))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(58773184))))[name = string("layers_3_mlp_gate_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 3072, 1, 1]> input_29_cast_fp16 = conv(dilations = input_29_dilations_0, groups = input_29_groups_0, pad = input_29_pad_0, pad_type = input_29_pad_type_0, strides = input_29_strides_0, weight = layers_3_mlp_gate_proj_weight_to_fp16_palettized, x = input_27_cast_fp16)[name = string("input_29_cast_fp16")];
+            tensor<fp16, [1, 3072, 1, 1]> var_1639_cast_fp16 = silu(x = input_29_cast_fp16)[name = string("op_1639_cast_fp16")];
+            string var_1645_pad_type_0 = const()[name = string("op_1645_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_1645_strides_0 = const()[name = string("op_1645_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_1645_pad_0 = const()[name = string("op_1645_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_1645_dilations_0 = const()[name = string("op_1645_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_1645_groups_0 = const()[name = string("op_1645_groups_0"), val = int32(1)];
+            tensor<fp16, [3072, 1024, 1, 1]> layers_3_mlp_up_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [3072, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(58773760))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(61919552))))[name = string("layers_3_mlp_up_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 3072, 1, 1]> var_1645_cast_fp16 = conv(dilations = var_1645_dilations_0, groups = var_1645_groups_0, pad = var_1645_pad_0, pad_type = var_1645_pad_type_0, strides = var_1645_strides_0, weight = layers_3_mlp_up_proj_weight_to_fp16_palettized, x = input_27_cast_fp16)[name = string("op_1645_cast_fp16")];
+            tensor<fp16, [1, 3072, 1, 1]> input_31_cast_fp16 = mul(x = var_1639_cast_fp16, y = var_1645_cast_fp16)[name = string("input_31_cast_fp16")];
+            string hidden_states_39_pad_type_0 = const()[name = string("hidden_states_39_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> hidden_states_39_strides_0 = const()[name = string("hidden_states_39_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> hidden_states_39_pad_0 = const()[name = string("hidden_states_39_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> hidden_states_39_dilations_0 = const()[name = string("hidden_states_39_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 hidden_states_39_groups_0 = const()[name = string("hidden_states_39_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 3072, 1, 1]> layers_3_mlp_down_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 3072, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(61920128))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(65065920))))[name = string("layers_3_mlp_down_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> hidden_states_39_cast_fp16 = conv(dilations = hidden_states_39_dilations_0, groups = hidden_states_39_groups_0, pad = hidden_states_39_pad_0, pad_type = hidden_states_39_pad_type_0, strides = hidden_states_39_strides_0, weight = layers_3_mlp_down_proj_weight_to_fp16_palettized, x = input_31_cast_fp16)[name = string("hidden_states_39_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> inputs_33_cast_fp16 = add(x = inputs_31_cast_fp16, y = hidden_states_39_cast_fp16)[name = string("inputs_33_cast_fp16")];
+            int32 var_1659 = const()[name = string("op_1659"), val = int32(3)];
+            int32 var_1669 = const()[name = string("op_1669"), val = int32(-2)];
+            int32 var_1677 = const()[name = string("op_1677"), val = int32(1)];
+            tensor<fp16, [1, 1024, 1, 1]> inputs_sq_33_cast_fp16 = mul(x = inputs_33_cast_fp16, y = inputs_33_cast_fp16)[name = string("inputs_sq_33_cast_fp16")];
+            tensor<int32, [1]> variance_33_axes_0 = const()[name = string("variance_33_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_33_keep_dims_0 = const()[name = string("variance_33_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 1, 1, 1]> variance_33_cast_fp16 = reduce_mean(axes = variance_33_axes_0, keep_dims = variance_33_keep_dims_0, x = inputs_sq_33_cast_fp16)[name = string("variance_33_cast_fp16")];
+            fp16 var_1689_to_fp16 = const()[name = string("op_1689_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 1, 1]> var_1690_cast_fp16 = add(x = variance_33_cast_fp16, y = var_1689_to_fp16)[name = string("op_1690_cast_fp16")];
+            fp32 var_1691_epsilon_0 = const()[name = string("op_1691_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 1, 1, 1]> var_1691_cast_fp16 = rsqrt(epsilon = var_1691_epsilon_0, x = var_1690_cast_fp16)[name = string("op_1691_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> hidden_states_41_cast_fp16 = mul(x = inputs_33_cast_fp16, y = var_1691_cast_fp16)[name = string("hidden_states_41_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> w_33_to_fp16 = const()[name = string("w_33_to_fp16"), val = tensor<fp16, [1, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(65066496)))];
+            tensor<fp16, [1, 1024, 1, 1]> obj_37_cast_fp16 = mul(x = w_33_to_fp16, y = hidden_states_41_cast_fp16)[name = string("obj_37_cast_fp16")];
+            string query_25_pad_type_0 = const()[name = string("query_25_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> query_25_strides_0 = const()[name = string("query_25_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> query_25_pad_0 = const()[name = string("query_25_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> query_25_dilations_0 = const()[name = string("query_25_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 query_25_groups_0 = const()[name = string("query_25_groups_0"), val = int32(1)];
+            tensor<fp16, [2048, 1024, 1, 1]> layers_4_self_attn_q_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [2048, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(65068608))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(67165824))))[name = string("layers_4_self_attn_q_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1, 1]> query_25_cast_fp16 = conv(bias = layers_0_self_attn_q_proj_bias_to_fp16, dilations = query_25_dilations_0, groups = query_25_groups_0, pad = query_25_pad_0, pad_type = query_25_pad_type_0, strides = query_25_strides_0, weight = layers_4_self_attn_q_proj_weight_to_fp16_palettized, x = obj_37_cast_fp16)[name = string("query_25_cast_fp16")];
+            string current_key_17_pad_type_0 = const()[name = string("current_key_17_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> current_key_17_strides_0 = const()[name = string("current_key_17_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> current_key_17_pad_0 = const()[name = string("current_key_17_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> current_key_17_dilations_0 = const()[name = string("current_key_17_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 current_key_17_groups_0 = const()[name = string("current_key_17_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_4_self_attn_k_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(67166400))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(68215040))))[name = string("layers_4_self_attn_k_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> current_key_17_cast_fp16 = conv(dilations = current_key_17_dilations_0, groups = current_key_17_groups_0, pad = current_key_17_pad_0, pad_type = current_key_17_pad_type_0, strides = current_key_17_strides_0, weight = layers_4_self_attn_k_proj_weight_to_fp16_palettized, x = obj_37_cast_fp16)[name = string("current_key_17_cast_fp16")];
+            string current_value_pad_type_0 = const()[name = string("current_value_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> current_value_strides_0 = const()[name = string("current_value_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> current_value_pad_0 = const()[name = string("current_value_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> current_value_dilations_0 = const()[name = string("current_value_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 current_value_groups_0 = const()[name = string("current_value_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_4_self_attn_v_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(68215616))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(69264256))))[name = string("layers_4_self_attn_v_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> current_value_cast_fp16 = conv(bias = layers_0_self_attn_v_proj_bias_to_fp16, dilations = current_value_dilations_0, groups = current_value_groups_0, pad = current_value_pad_0, pad_type = current_value_pad_type_0, strides = current_value_strides_0, weight = layers_4_self_attn_v_proj_weight_to_fp16_palettized, x = obj_37_cast_fp16)[name = string("current_value_cast_fp16")];
+            tensor<int32, [4]> var_1728 = const()[name = string("op_1728"), val = tensor<int32, [4]>([16, 128, 1, 1])];
+            tensor<fp16, [16, 128, 1, 1]> inputs_35_cast_fp16 = reshape(shape = var_1728, x = query_25_cast_fp16)[name = string("inputs_35_cast_fp16")];
+            tensor<fp16, [16, 128, 1, 1]> inputs_sq_35_cast_fp16 = mul(x = inputs_35_cast_fp16, y = inputs_35_cast_fp16)[name = string("inputs_sq_35_cast_fp16")];
+            tensor<int32, [1]> variance_35_axes_0 = const()[name = string("variance_35_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_35_keep_dims_0 = const()[name = string("variance_35_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [16, 1, 1, 1]> variance_35_cast_fp16 = reduce_mean(axes = variance_35_axes_0, keep_dims = variance_35_keep_dims_0, x = inputs_sq_35_cast_fp16)[name = string("variance_35_cast_fp16")];
+            fp16 var_1734_to_fp16 = const()[name = string("op_1734_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [16, 1, 1, 1]> var_1735_cast_fp16 = add(x = variance_35_cast_fp16, y = var_1734_to_fp16)[name = string("op_1735_cast_fp16")];
+            fp32 var_1736_epsilon_0 = const()[name = string("op_1736_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [16, 1, 1, 1]> var_1736_cast_fp16 = rsqrt(epsilon = var_1736_epsilon_0, x = var_1735_cast_fp16)[name = string("op_1736_cast_fp16")];
+            tensor<fp16, [16, 128, 1, 1]> hidden_states_43_cast_fp16 = mul(x = inputs_35_cast_fp16, y = var_1736_cast_fp16)[name = string("hidden_states_43_cast_fp16")];
+            tensor<fp16, [1, 128, 1, 1]> w_35_to_fp16 = const()[name = string("w_35_to_fp16"), val = tensor<fp16, [1, 128, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(69264832)))];
+            tensor<fp16, [16, 128, 1, 1]> query_normed_cast_fp16 = mul(x = w_35_to_fp16, y = hidden_states_43_cast_fp16)[name = string("query_normed_cast_fp16")];
+            tensor<int32, [4]> var_1744 = const()[name = string("op_1744"), val = tensor<int32, [4]>([8, 128, 1, 1])];
+            tensor<fp16, [8, 128, 1, 1]> inputs_37_cast_fp16 = reshape(shape = var_1744, x = current_key_17_cast_fp16)[name = string("inputs_37_cast_fp16")];
+            tensor<fp16, [8, 128, 1, 1]> inputs_sq_37_cast_fp16 = mul(x = inputs_37_cast_fp16, y = inputs_37_cast_fp16)[name = string("inputs_sq_37_cast_fp16")];
+            tensor<int32, [1]> variance_37_axes_0 = const()[name = string("variance_37_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_37_keep_dims_0 = const()[name = string("variance_37_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [8, 1, 1, 1]> variance_37_cast_fp16 = reduce_mean(axes = variance_37_axes_0, keep_dims = variance_37_keep_dims_0, x = inputs_sq_37_cast_fp16)[name = string("variance_37_cast_fp16")];
+            fp16 var_1750_to_fp16 = const()[name = string("op_1750_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [8, 1, 1, 1]> var_1751_cast_fp16 = add(x = variance_37_cast_fp16, y = var_1750_to_fp16)[name = string("op_1751_cast_fp16")];
+            fp32 var_1752_epsilon_0 = const()[name = string("op_1752_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [8, 1, 1, 1]> var_1752_cast_fp16 = rsqrt(epsilon = var_1752_epsilon_0, x = var_1751_cast_fp16)[name = string("op_1752_cast_fp16")];
+            tensor<fp16, [8, 128, 1, 1]> hidden_states_45_cast_fp16 = mul(x = inputs_37_cast_fp16, y = var_1752_cast_fp16)[name = string("hidden_states_45_cast_fp16")];
+            tensor<fp16, [1, 128, 1, 1]> w_37_to_fp16 = const()[name = string("w_37_to_fp16"), val = tensor<fp16, [1, 128, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(69265152)))];
+            tensor<fp16, [8, 128, 1, 1]> current_key_normed_cast_fp16 = mul(x = w_37_to_fp16, y = hidden_states_45_cast_fp16)[name = string("current_key_normed_cast_fp16")];
+            tensor<int32, [4]> var_1770 = const()[name = string("op_1770"), val = tensor<int32, [4]>([1, 16, 128, -1])];
+            tensor<fp16, [1, 16, 128, 1]> mh_q_25_cast_fp16 = reshape(shape = var_1770, x = query_normed_cast_fp16)[name = string("mh_q_25_cast_fp16")];
+            tensor<int32, [4]> var_1772 = const()[name = string("op_1772"), val = tensor<int32, [4]>([1, 8, 128, -1])];
+            tensor<fp16, [1, 8, 128, 1]> mh_k_17_cast_fp16 = reshape(shape = var_1772, x = current_key_normed_cast_fp16)[name = string("mh_k_17_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> var_1776_cast_fp16 = mul(x = mh_q_25_cast_fp16, y = cos_1_cast_fp16)[name = string("op_1776_cast_fp16")];
+            tensor<int32, [4]> var_1781_begin_0 = const()[name = string("op_1781_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_1781_end_0 = const()[name = string("op_1781_end_0"), val = tensor<int32, [4]>([1, 16, 64, 1])];
+            tensor<bool, [4]> var_1781_end_mask_0 = const()[name = string("op_1781_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 16, 64, 1]> var_1781_cast_fp16 = slice_by_index(begin = var_1781_begin_0, end = var_1781_end_0, end_mask = var_1781_end_mask_0, x = mh_q_25_cast_fp16)[name = string("op_1781_cast_fp16")];
+            tensor<int32, [4]> var_1787_begin_0 = const()[name = string("op_1787_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_1787_end_0 = const()[name = string("op_1787_end_0"), val = tensor<int32, [4]>([1, 16, 128, 1])];
+            tensor<bool, [4]> var_1787_end_mask_0 = const()[name = string("op_1787_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 16, 64, 1]> var_1787_cast_fp16 = slice_by_index(begin = var_1787_begin_0, end = var_1787_end_0, end_mask = var_1787_end_mask_0, x = mh_q_25_cast_fp16)[name = string("op_1787_cast_fp16")];
+            fp16 const_109_promoted_to_fp16 = const()[name = string("const_109_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 16, 64, 1]> var_1789_cast_fp16 = mul(x = var_1787_cast_fp16, y = const_109_promoted_to_fp16)[name = string("op_1789_cast_fp16")];
+            bool var_1791_interleave_0 = const()[name = string("op_1791_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 1]> var_1791_cast_fp16 = concat(axis = var_1669, interleave = var_1791_interleave_0, values = (var_1789_cast_fp16, var_1781_cast_fp16))[name = string("op_1791_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> var_1792_cast_fp16 = mul(x = var_1791_cast_fp16, y = sin_1_cast_fp16)[name = string("op_1792_cast_fp16")];
+            tensor<fp16, [1, 16, 128, 1]> mh_q_27_cast_fp16 = add(x = var_1776_cast_fp16, y = var_1792_cast_fp16)[name = string("mh_q_27_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> var_1794_cast_fp16 = mul(x = mh_k_17_cast_fp16, y = cos_1_cast_fp16)[name = string("op_1794_cast_fp16")];
+            tensor<int32, [4]> var_1799_begin_0 = const()[name = string("op_1799_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_1799_end_0 = const()[name = string("op_1799_end_0"), val = tensor<int32, [4]>([1, 8, 64, 1])];
+            tensor<bool, [4]> var_1799_end_mask_0 = const()[name = string("op_1799_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 8, 64, 1]> var_1799_cast_fp16 = slice_by_index(begin = var_1799_begin_0, end = var_1799_end_0, end_mask = var_1799_end_mask_0, x = mh_k_17_cast_fp16)[name = string("op_1799_cast_fp16")];
+            tensor<int32, [4]> var_1805_begin_0 = const()[name = string("op_1805_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_1805_end_0 = const()[name = string("op_1805_end_0"), val = tensor<int32, [4]>([1, 8, 128, 1])];
+            tensor<bool, [4]> var_1805_end_mask_0 = const()[name = string("op_1805_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 8, 64, 1]> var_1805_cast_fp16 = slice_by_index(begin = var_1805_begin_0, end = var_1805_end_0, end_mask = var_1805_end_mask_0, x = mh_k_17_cast_fp16)[name = string("op_1805_cast_fp16")];
+            fp16 const_112_promoted_to_fp16 = const()[name = string("const_112_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 64, 1]> var_1807_cast_fp16 = mul(x = var_1805_cast_fp16, y = const_112_promoted_to_fp16)[name = string("op_1807_cast_fp16")];
+            bool var_1809_interleave_0 = const()[name = string("op_1809_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 128, 1]> var_1809_cast_fp16 = concat(axis = var_1669, interleave = var_1809_interleave_0, values = (var_1807_cast_fp16, var_1799_cast_fp16))[name = string("op_1809_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> var_1810_cast_fp16 = mul(x = var_1809_cast_fp16, y = sin_1_cast_fp16)[name = string("op_1810_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 1]> mh_k_cast_fp16 = add(x = var_1794_cast_fp16, y = var_1810_cast_fp16)[name = string("mh_k_cast_fp16")];
+            tensor<int32, [4]> var_1814 = const()[name = string("op_1814"), val = tensor<int32, [4]>([1, 1024, 1, 1])];
+            tensor<fp16, [1, 1024, 1, 1]> current_key_cast_fp16 = reshape(shape = var_1814, x = mh_k_cast_fp16)[name = string("current_key_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 16]> var_1821_cast_fp16 = mul(x = var_96_cast_fp16_4, y = var_272_cast_fp16)[name = string("op_1821_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 16]> var_1822_cast_fp16 = mul(x = current_key_cast_fp16, y = var_270_cast_fp16)[name = string("op_1822_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 16]> key_27_cast_fp16 = add(x = var_1821_cast_fp16, y = var_1822_cast_fp16)[name = string("key_27_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 16]> var_1825_cast_fp16 = mul(x = var_104_cast_fp16_4, y = var_272_cast_fp16)[name = string("op_1825_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 16]> var_1826_cast_fp16 = mul(x = current_value_cast_fp16, y = var_270_cast_fp16)[name = string("op_1826_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 16]> value_17_cast_fp16 = add(x = var_1825_cast_fp16, y = var_1826_cast_fp16)[name = string("value_17_cast_fp16")];
+            tensor<int32, [4]> var_1830 = const()[name = string("op_1830"), val = tensor<int32, [4]>([1, 8, 128, 16])];
+            tensor<fp16, [1, 8, 128, 16]> key_heads_17_cast_fp16 = reshape(shape = var_1830, x = key_27_cast_fp16)[name = string("key_heads_17_cast_fp16")];
+            tensor<int32, [4]> var_1832 = const()[name = string("op_1832"), val = tensor<int32, [4]>([1, 8, 128, 16])];
+            tensor<fp16, [1, 8, 128, 16]> value_heads_17_cast_fp16 = reshape(shape = var_1832, x = value_17_cast_fp16)[name = string("value_heads_17_cast_fp16")];
+            tensor<int32, [4]> var_1835_begin_0 = const()[name = string("op_1835_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_1835_end_0 = const()[name = string("op_1835_end_0"), val = tensor<int32, [4]>([1, 1, 128, 16])];
+            tensor<bool, [4]> var_1835_end_mask_0 = const()[name = string("op_1835_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_1835_cast_fp16 = slice_by_index(begin = var_1835_begin_0, end = var_1835_end_0, end_mask = var_1835_end_mask_0, x = key_heads_17_cast_fp16)[name = string("op_1835_cast_fp16")];
+            tensor<int32, [4]> var_1839_begin_0 = const()[name = string("op_1839_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_1839_end_0 = const()[name = string("op_1839_end_0"), val = tensor<int32, [4]>([1, 1, 128, 16])];
+            tensor<bool, [4]> var_1839_end_mask_0 = const()[name = string("op_1839_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_1839_cast_fp16 = slice_by_index(begin = var_1839_begin_0, end = var_1839_end_0, end_mask = var_1839_end_mask_0, x = value_heads_17_cast_fp16)[name = string("op_1839_cast_fp16")];
+            tensor<int32, [4]> var_1851_begin_0 = const()[name = string("op_1851_begin_0"), val = tensor<int32, [4]>([0, 1, 0, 0])];
+            tensor<int32, [4]> var_1851_end_0 = const()[name = string("op_1851_end_0"), val = tensor<int32, [4]>([1, 2, 128, 16])];
+            tensor<bool, [4]> var_1851_end_mask_0 = const()[name = string("op_1851_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_1851_cast_fp16 = slice_by_index(begin = var_1851_begin_0, end = var_1851_end_0, end_mask = var_1851_end_mask_0, x = key_heads_17_cast_fp16)[name = string("op_1851_cast_fp16")];
+            tensor<int32, [4]> var_1855_begin_0 = const()[name = string("op_1855_begin_0"), val = tensor<int32, [4]>([0, 1, 0, 0])];
+            tensor<int32, [4]> var_1855_end_0 = const()[name = string("op_1855_end_0"), val = tensor<int32, [4]>([1, 2, 128, 16])];
+            tensor<bool, [4]> var_1855_end_mask_0 = const()[name = string("op_1855_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_1855_cast_fp16 = slice_by_index(begin = var_1855_begin_0, end = var_1855_end_0, end_mask = var_1855_end_mask_0, x = value_heads_17_cast_fp16)[name = string("op_1855_cast_fp16")];
+            tensor<int32, [4]> var_1867_begin_0 = const()[name = string("op_1867_begin_0"), val = tensor<int32, [4]>([0, 2, 0, 0])];
+            tensor<int32, [4]> var_1867_end_0 = const()[name = string("op_1867_end_0"), val = tensor<int32, [4]>([1, 3, 128, 16])];
+            tensor<bool, [4]> var_1867_end_mask_0 = const()[name = string("op_1867_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_1867_cast_fp16 = slice_by_index(begin = var_1867_begin_0, end = var_1867_end_0, end_mask = var_1867_end_mask_0, x = key_heads_17_cast_fp16)[name = string("op_1867_cast_fp16")];
+            tensor<int32, [4]> var_1871_begin_0 = const()[name = string("op_1871_begin_0"), val = tensor<int32, [4]>([0, 2, 0, 0])];
+            tensor<int32, [4]> var_1871_end_0 = const()[name = string("op_1871_end_0"), val = tensor<int32, [4]>([1, 3, 128, 16])];
+            tensor<bool, [4]> var_1871_end_mask_0 = const()[name = string("op_1871_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_1871_cast_fp16 = slice_by_index(begin = var_1871_begin_0, end = var_1871_end_0, end_mask = var_1871_end_mask_0, x = value_heads_17_cast_fp16)[name = string("op_1871_cast_fp16")];
+            tensor<int32, [4]> var_1883_begin_0 = const()[name = string("op_1883_begin_0"), val = tensor<int32, [4]>([0, 3, 0, 0])];
+            tensor<int32, [4]> var_1883_end_0 = const()[name = string("op_1883_end_0"), val = tensor<int32, [4]>([1, 4, 128, 16])];
+            tensor<bool, [4]> var_1883_end_mask_0 = const()[name = string("op_1883_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_1883_cast_fp16 = slice_by_index(begin = var_1883_begin_0, end = var_1883_end_0, end_mask = var_1883_end_mask_0, x = key_heads_17_cast_fp16)[name = string("op_1883_cast_fp16")];
+            tensor<int32, [4]> var_1887_begin_0 = const()[name = string("op_1887_begin_0"), val = tensor<int32, [4]>([0, 3, 0, 0])];
+            tensor<int32, [4]> var_1887_end_0 = const()[name = string("op_1887_end_0"), val = tensor<int32, [4]>([1, 4, 128, 16])];
+            tensor<bool, [4]> var_1887_end_mask_0 = const()[name = string("op_1887_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_1887_cast_fp16 = slice_by_index(begin = var_1887_begin_0, end = var_1887_end_0, end_mask = var_1887_end_mask_0, x = value_heads_17_cast_fp16)[name = string("op_1887_cast_fp16")];
+            tensor<int32, [4]> var_1899_begin_0 = const()[name = string("op_1899_begin_0"), val = tensor<int32, [4]>([0, 4, 0, 0])];
+            tensor<int32, [4]> var_1899_end_0 = const()[name = string("op_1899_end_0"), val = tensor<int32, [4]>([1, 5, 128, 16])];
+            tensor<bool, [4]> var_1899_end_mask_0 = const()[name = string("op_1899_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_1899_cast_fp16 = slice_by_index(begin = var_1899_begin_0, end = var_1899_end_0, end_mask = var_1899_end_mask_0, x = key_heads_17_cast_fp16)[name = string("op_1899_cast_fp16")];
+            tensor<int32, [4]> var_1903_begin_0 = const()[name = string("op_1903_begin_0"), val = tensor<int32, [4]>([0, 4, 0, 0])];
+            tensor<int32, [4]> var_1903_end_0 = const()[name = string("op_1903_end_0"), val = tensor<int32, [4]>([1, 5, 128, 16])];
+            tensor<bool, [4]> var_1903_end_mask_0 = const()[name = string("op_1903_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_1903_cast_fp16 = slice_by_index(begin = var_1903_begin_0, end = var_1903_end_0, end_mask = var_1903_end_mask_0, x = value_heads_17_cast_fp16)[name = string("op_1903_cast_fp16")];
+            tensor<int32, [4]> var_1915_begin_0 = const()[name = string("op_1915_begin_0"), val = tensor<int32, [4]>([0, 5, 0, 0])];
+            tensor<int32, [4]> var_1915_end_0 = const()[name = string("op_1915_end_0"), val = tensor<int32, [4]>([1, 6, 128, 16])];
+            tensor<bool, [4]> var_1915_end_mask_0 = const()[name = string("op_1915_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_1915_cast_fp16 = slice_by_index(begin = var_1915_begin_0, end = var_1915_end_0, end_mask = var_1915_end_mask_0, x = key_heads_17_cast_fp16)[name = string("op_1915_cast_fp16")];
+            tensor<int32, [4]> var_1919_begin_0 = const()[name = string("op_1919_begin_0"), val = tensor<int32, [4]>([0, 5, 0, 0])];
+            tensor<int32, [4]> var_1919_end_0 = const()[name = string("op_1919_end_0"), val = tensor<int32, [4]>([1, 6, 128, 16])];
+            tensor<bool, [4]> var_1919_end_mask_0 = const()[name = string("op_1919_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_1919_cast_fp16 = slice_by_index(begin = var_1919_begin_0, end = var_1919_end_0, end_mask = var_1919_end_mask_0, x = value_heads_17_cast_fp16)[name = string("op_1919_cast_fp16")];
+            tensor<int32, [4]> var_1931_begin_0 = const()[name = string("op_1931_begin_0"), val = tensor<int32, [4]>([0, 6, 0, 0])];
+            tensor<int32, [4]> var_1931_end_0 = const()[name = string("op_1931_end_0"), val = tensor<int32, [4]>([1, 7, 128, 16])];
+            tensor<bool, [4]> var_1931_end_mask_0 = const()[name = string("op_1931_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_1931_cast_fp16 = slice_by_index(begin = var_1931_begin_0, end = var_1931_end_0, end_mask = var_1931_end_mask_0, x = key_heads_17_cast_fp16)[name = string("op_1931_cast_fp16")];
+            tensor<int32, [4]> var_1935_begin_0 = const()[name = string("op_1935_begin_0"), val = tensor<int32, [4]>([0, 6, 0, 0])];
+            tensor<int32, [4]> var_1935_end_0 = const()[name = string("op_1935_end_0"), val = tensor<int32, [4]>([1, 7, 128, 16])];
+            tensor<bool, [4]> var_1935_end_mask_0 = const()[name = string("op_1935_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_1935_cast_fp16 = slice_by_index(begin = var_1935_begin_0, end = var_1935_end_0, end_mask = var_1935_end_mask_0, x = value_heads_17_cast_fp16)[name = string("op_1935_cast_fp16")];
+            tensor<int32, [4]> var_1947_begin_0 = const()[name = string("op_1947_begin_0"), val = tensor<int32, [4]>([0, 7, 0, 0])];
+            tensor<int32, [4]> var_1947_end_0 = const()[name = string("op_1947_end_0"), val = tensor<int32, [4]>([1, 1, 128, 16])];
+            tensor<bool, [4]> var_1947_end_mask_0 = const()[name = string("op_1947_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_1947_cast_fp16 = slice_by_index(begin = var_1947_begin_0, end = var_1947_end_0, end_mask = var_1947_end_mask_0, x = key_heads_17_cast_fp16)[name = string("op_1947_cast_fp16")];
+            tensor<int32, [4]> var_1951_begin_0 = const()[name = string("op_1951_begin_0"), val = tensor<int32, [4]>([0, 7, 0, 0])];
+            tensor<int32, [4]> var_1951_end_0 = const()[name = string("op_1951_end_0"), val = tensor<int32, [4]>([1, 1, 128, 16])];
+            tensor<bool, [4]> var_1951_end_mask_0 = const()[name = string("op_1951_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 1, 128, 16]> var_1951_cast_fp16 = slice_by_index(begin = var_1951_begin_0, end = var_1951_end_0, end_mask = var_1951_end_mask_0, x = value_heads_17_cast_fp16)[name = string("op_1951_cast_fp16")];
+            bool key_heads_interleave_0 = const()[name = string("key_heads_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 16]> key_heads_cast_fp16 = concat(axis = var_1677, interleave = key_heads_interleave_0, values = (var_1835_cast_fp16, var_1835_cast_fp16, var_1851_cast_fp16, var_1851_cast_fp16, var_1867_cast_fp16, var_1867_cast_fp16, var_1883_cast_fp16, var_1883_cast_fp16, var_1899_cast_fp16, var_1899_cast_fp16, var_1915_cast_fp16, var_1915_cast_fp16, var_1931_cast_fp16, var_1931_cast_fp16, var_1947_cast_fp16, var_1947_cast_fp16))[name = string("key_heads_cast_fp16")];
+            bool value_heads_interleave_0 = const()[name = string("value_heads_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 128, 16]> value_heads_cast_fp16 = concat(axis = var_1677, interleave = value_heads_interleave_0, values = (var_1839_cast_fp16, var_1839_cast_fp16, var_1855_cast_fp16, var_1855_cast_fp16, var_1871_cast_fp16, var_1871_cast_fp16, var_1887_cast_fp16, var_1887_cast_fp16, var_1903_cast_fp16, var_1903_cast_fp16, var_1919_cast_fp16, var_1919_cast_fp16, var_1935_cast_fp16, var_1935_cast_fp16, var_1951_cast_fp16, var_1951_cast_fp16))[name = string("value_heads_cast_fp16")];
+            fp16 var_1974_to_fp16 = const()[name = string("op_1974_to_fp16"), val = fp16(0x1.6ap-4)];
+            tensor<fp16, [1, 16, 128, 1]> var_1975_cast_fp16 = mul(x = mh_q_27_cast_fp16, y = var_1974_to_fp16)[name = string("op_1975_cast_fp16")];
+            bool mh_w_17_transpose_x_0 = const()[name = string("mh_w_17_transpose_x_0"), val = bool(true)];
+            bool mh_w_17_transpose_y_0 = const()[name = string("mh_w_17_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 1, 16]> mh_w_17_cast_fp16 = matmul(transpose_x = mh_w_17_transpose_x_0, transpose_y = mh_w_17_transpose_y_0, x = var_1975_cast_fp16, y = key_heads_cast_fp16)[name = string("mh_w_17_cast_fp16")];
+            tensor<fp16, [1, 16, 1, 16]> mh_w_cast_fp16 = add(x = mh_w_17_cast_fp16, y = var_436_cast_fp16)[name = string("mh_w_cast_fp16")];
+            tensor<fp16, [1, 16, 1, 16]> var_1987_cast_fp16 = softmax(axis = var_1659, x = mh_w_cast_fp16)[name = string("op_1987_cast_fp16")];
+            bool attn_transpose_x_0 = const()[name = string("attn_transpose_x_0"), val = bool(false)];
+            bool attn_transpose_y_0 = const()[name = string("attn_transpose_y_0"), val = bool(true)];
+            tensor<fp16, [1, 16, 128, 1]> attn_cast_fp16 = matmul(transpose_x = attn_transpose_x_0, transpose_y = attn_transpose_y_0, x = value_heads_cast_fp16, y = var_1987_cast_fp16)[name = string("attn_cast_fp16")];
+            tensor<int32, [4]> var_1992 = const()[name = string("op_1992"), val = tensor<int32, [4]>([1, -1, 1, 1])];
+            tensor<fp16, [1, 2048, 1, 1]> input_33_cast_fp16 = reshape(shape = var_1992, x = attn_cast_fp16)[name = string("input_33_cast_fp16")];
+            string obj_pad_type_0 = const()[name = string("obj_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> obj_strides_0 = const()[name = string("obj_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> obj_pad_0 = const()[name = string("obj_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> obj_dilations_0 = const()[name = string("obj_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 obj_groups_0 = const()[name = string("obj_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 2048, 1, 1]> layers_4_self_attn_o_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(69265472))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(71362688))))[name = string("layers_4_self_attn_o_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> obj_cast_fp16 = conv(bias = layers_0_self_attn_v_proj_bias_to_fp16, dilations = obj_dilations_0, groups = obj_groups_0, pad = obj_pad_0, pad_type = obj_pad_type_0, strides = obj_strides_0, weight = layers_4_self_attn_o_proj_weight_to_fp16_palettized, x = input_33_cast_fp16)[name = string("obj_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> inputs_39_cast_fp16 = add(x = inputs_33_cast_fp16, y = obj_cast_fp16)[name = string("inputs_39_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> inputs_sq_39_cast_fp16 = mul(x = inputs_39_cast_fp16, y = inputs_39_cast_fp16)[name = string("inputs_sq_39_cast_fp16")];
+            tensor<int32, [1]> variance_39_axes_0 = const()[name = string("variance_39_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_39_keep_dims_0 = const()[name = string("variance_39_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 1, 1, 1]> variance_39_cast_fp16 = reduce_mean(axes = variance_39_axes_0, keep_dims = variance_39_keep_dims_0, x = inputs_sq_39_cast_fp16)[name = string("variance_39_cast_fp16")];
+            fp16 var_2010_to_fp16 = const()[name = string("op_2010_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 1, 1]> var_2011_cast_fp16 = add(x = variance_39_cast_fp16, y = var_2010_to_fp16)[name = string("op_2011_cast_fp16")];
+            fp32 var_2012_epsilon_0 = const()[name = string("op_2012_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 1, 1, 1]> var_2012_cast_fp16 = rsqrt(epsilon = var_2012_epsilon_0, x = var_2011_cast_fp16)[name = string("op_2012_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> hidden_states_47_cast_fp16 = mul(x = inputs_39_cast_fp16, y = var_2012_cast_fp16)[name = string("hidden_states_47_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> w_39_to_fp16 = const()[name = string("w_39_to_fp16"), val = tensor<fp16, [1, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(71363264)))];
+            tensor<fp16, [1, 1024, 1, 1]> input_35_cast_fp16 = mul(x = w_39_to_fp16, y = hidden_states_47_cast_fp16)[name = string("input_35_cast_fp16")];
+            string input_37_pad_type_0 = const()[name = string("input_37_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> input_37_strides_0 = const()[name = string("input_37_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> input_37_pad_0 = const()[name = string("input_37_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> input_37_dilations_0 = const()[name = string("input_37_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 input_37_groups_0 = const()[name = string("input_37_groups_0"), val = int32(1)];
+            tensor<fp16, [3072, 1024, 1, 1]> layers_4_mlp_gate_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [3072, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(71365376))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(74511168))))[name = string("layers_4_mlp_gate_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 3072, 1, 1]> input_37_cast_fp16 = conv(dilations = input_37_dilations_0, groups = input_37_groups_0, pad = input_37_pad_0, pad_type = input_37_pad_type_0, strides = input_37_strides_0, weight = layers_4_mlp_gate_proj_weight_to_fp16_palettized, x = input_35_cast_fp16)[name = string("input_37_cast_fp16")];
+            tensor<fp16, [1, 3072, 1, 1]> var_2026_cast_fp16 = silu(x = input_37_cast_fp16)[name = string("op_2026_cast_fp16")];
+            string var_2032_pad_type_0 = const()[name = string("op_2032_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_2032_strides_0 = const()[name = string("op_2032_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_2032_pad_0 = const()[name = string("op_2032_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_2032_dilations_0 = const()[name = string("op_2032_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_2032_groups_0 = const()[name = string("op_2032_groups_0"), val = int32(1)];
+            tensor<fp16, [3072, 1024, 1, 1]> layers_4_mlp_up_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [3072, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(74511744))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(77657536))))[name = string("layers_4_mlp_up_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 3072, 1, 1]> var_2032_cast_fp16 = conv(dilations = var_2032_dilations_0, groups = var_2032_groups_0, pad = var_2032_pad_0, pad_type = var_2032_pad_type_0, strides = var_2032_strides_0, weight = layers_4_mlp_up_proj_weight_to_fp16_palettized, x = input_35_cast_fp16)[name = string("op_2032_cast_fp16")];
+            tensor<fp16, [1, 3072, 1, 1]> input_39_cast_fp16 = mul(x = var_2026_cast_fp16, y = var_2032_cast_fp16)[name = string("input_39_cast_fp16")];
+            string hidden_states_49_pad_type_0 = const()[name = string("hidden_states_49_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> hidden_states_49_strides_0 = const()[name = string("hidden_states_49_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> hidden_states_49_pad_0 = const()[name = string("hidden_states_49_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> hidden_states_49_dilations_0 = const()[name = string("hidden_states_49_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 hidden_states_49_groups_0 = const()[name = string("hidden_states_49_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 3072, 1, 1]> layers_4_mlp_down_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 3072, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(77658112))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(80803904))))[name = string("layers_4_mlp_down_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> hidden_states_49_cast_fp16 = conv(dilations = hidden_states_49_dilations_0, groups = hidden_states_49_groups_0, pad = hidden_states_49_pad_0, pad_type = hidden_states_49_pad_type_0, strides = hidden_states_49_strides_0, weight = layers_4_mlp_down_proj_weight_to_fp16_palettized, x = input_39_cast_fp16)[name = string("hidden_states_49_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> inputs_cast_fp16 = add(x = inputs_39_cast_fp16, y = hidden_states_49_cast_fp16)[name = string("inputs_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> inputs_sq_cast_fp16 = mul(x = inputs_cast_fp16, y = inputs_cast_fp16)[name = string("inputs_sq_cast_fp16")];
+            tensor<int32, [1]> variance_axes_0 = const()[name = string("variance_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_keep_dims_0 = const()[name = string("variance_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 1, 1, 1]> variance_cast_fp16 = reduce_mean(axes = variance_axes_0, keep_dims = variance_keep_dims_0, x = inputs_sq_cast_fp16)[name = string("variance_cast_fp16")];
+            fp16 var_2053_to_fp16 = const()[name = string("op_2053_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 1, 1]> var_2054_cast_fp16 = add(x = variance_cast_fp16, y = var_2053_to_fp16)[name = string("op_2054_cast_fp16")];
+            fp32 var_2055_epsilon_0 = const()[name = string("op_2055_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 1, 1, 1]> var_2055_cast_fp16 = rsqrt(epsilon = var_2055_epsilon_0, x = var_2054_cast_fp16)[name = string("op_2055_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> hidden_states_cast_fp16 = mul(x = inputs_cast_fp16, y = var_2055_cast_fp16)[name = string("hidden_states_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> w_to_fp16 = const()[name = string("w_to_fp16"), val = tensor<fp16, [1, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(80804480)))];
+            tensor<fp16, [1, 1024, 1, 1]> input_cast_fp16 = mul(x = w_to_fp16, y = hidden_states_cast_fp16)[name = string("input_cast_fp16")];
+            string logits_1_pad_type_0 = const()[name = string("logits_1_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> logits_1_strides_0 = const()[name = string("logits_1_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> logits_1_pad_0 = const()[name = string("logits_1_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> logits_1_dilations_0 = const()[name = string("logits_1_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 logits_1_groups_0 = const()[name = string("logits_1_groups_0"), val = int32(1)];
+            tensor<fp16, [2048, 1024, 1, 1]> lm_heads_0_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [2048, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(80806592))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(82903808))))[name = string("lm_heads_0_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1, 1]> logits_1_cast_fp16 = conv(dilations = logits_1_dilations_0, groups = logits_1_groups_0, pad = logits_1_pad_0, pad_type = logits_1_pad_type_0, strides = logits_1_strides_0, weight = lm_heads_0_weight_to_fp16_palettized, x = input_cast_fp16)[name = string("logits_1_cast_fp16")];
+            tensor<int32, [1]> var_2072_axes_0 = const()[name = string("op_2072_axes_0"), val = tensor<int32, [1]>([3])];
+            tensor<fp16, [1, 2048, 1]> var_2072_cast_fp16 = squeeze(axes = var_2072_axes_0, x = logits_1_cast_fp16)[name = string("op_2072_cast_fp16")];
+            string logits_3_pad_type_0 = const()[name = string("logits_3_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> logits_3_strides_0 = const()[name = string("logits_3_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> logits_3_pad_0 = const()[name = string("logits_3_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> logits_3_dilations_0 = const()[name = string("logits_3_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 logits_3_groups_0 = const()[name = string("logits_3_groups_0"), val = int32(1)];
+            tensor<fp16, [2048, 1024, 1, 1]> lm_heads_1_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [2048, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(82904384))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(85001600))))[name = string("lm_heads_1_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1, 1]> logits_3_cast_fp16 = conv(dilations = logits_3_dilations_0, groups = logits_3_groups_0, pad = logits_3_pad_0, pad_type = logits_3_pad_type_0, strides = logits_3_strides_0, weight = lm_heads_1_weight_to_fp16_palettized, x = input_cast_fp16)[name = string("logits_3_cast_fp16")];
+            tensor<int32, [1]> var_2088_axes_0 = const()[name = string("op_2088_axes_0"), val = tensor<int32, [1]>([3])];
+            tensor<fp16, [1, 2048, 1]> var_2088_cast_fp16 = squeeze(axes = var_2088_axes_0, x = logits_3_cast_fp16)[name = string("op_2088_cast_fp16")];
+            string logits_5_pad_type_0 = const()[name = string("logits_5_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> logits_5_strides_0 = const()[name = string("logits_5_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> logits_5_pad_0 = const()[name = string("logits_5_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> logits_5_dilations_0 = const()[name = string("logits_5_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 logits_5_groups_0 = const()[name = string("logits_5_groups_0"), val = int32(1)];
+            tensor<fp16, [2048, 1024, 1, 1]> lm_heads_2_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [2048, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(85002176))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(87099392))))[name = string("lm_heads_2_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1, 1]> logits_5_cast_fp16 = conv(dilations = logits_5_dilations_0, groups = logits_5_groups_0, pad = logits_5_pad_0, pad_type = logits_5_pad_type_0, strides = logits_5_strides_0, weight = lm_heads_2_weight_to_fp16_palettized, x = input_cast_fp16)[name = string("logits_5_cast_fp16")];
+            tensor<int32, [1]> var_2104_axes_0 = const()[name = string("op_2104_axes_0"), val = tensor<int32, [1]>([3])];
+            tensor<fp16, [1, 2048, 1]> var_2104_cast_fp16 = squeeze(axes = var_2104_axes_0, x = logits_5_cast_fp16)[name = string("op_2104_cast_fp16")];
+            string logits_7_pad_type_0 = const()[name = string("logits_7_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> logits_7_strides_0 = const()[name = string("logits_7_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> logits_7_pad_0 = const()[name = string("logits_7_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> logits_7_dilations_0 = const()[name = string("logits_7_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 logits_7_groups_0 = const()[name = string("logits_7_groups_0"), val = int32(1)];
+            tensor<fp16, [2048, 1024, 1, 1]> lm_heads_3_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [2048, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(87099968))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(89197184))))[name = string("lm_heads_3_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1, 1]> logits_7_cast_fp16 = conv(dilations = logits_7_dilations_0, groups = logits_7_groups_0, pad = logits_7_pad_0, pad_type = logits_7_pad_type_0, strides = logits_7_strides_0, weight = lm_heads_3_weight_to_fp16_palettized, x = input_cast_fp16)[name = string("logits_7_cast_fp16")];
+            tensor<int32, [1]> var_2120_axes_0 = const()[name = string("op_2120_axes_0"), val = tensor<int32, [1]>([3])];
+            tensor<fp16, [1, 2048, 1]> var_2120_cast_fp16 = squeeze(axes = var_2120_axes_0, x = logits_7_cast_fp16)[name = string("op_2120_cast_fp16")];
+            string logits_9_pad_type_0 = const()[name = string("logits_9_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> logits_9_strides_0 = const()[name = string("logits_9_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> logits_9_pad_0 = const()[name = string("logits_9_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> logits_9_dilations_0 = const()[name = string("logits_9_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 logits_9_groups_0 = const()[name = string("logits_9_groups_0"), val = int32(1)];
+            tensor<fp16, [2048, 1024, 1, 1]> lm_heads_4_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [2048, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(89197760))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(91294976))))[name = string("lm_heads_4_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1, 1]> logits_9_cast_fp16 = conv(dilations = logits_9_dilations_0, groups = logits_9_groups_0, pad = logits_9_pad_0, pad_type = logits_9_pad_type_0, strides = logits_9_strides_0, weight = lm_heads_4_weight_to_fp16_palettized, x = input_cast_fp16)[name = string("logits_9_cast_fp16")];
+            tensor<int32, [1]> var_2136_axes_0 = const()[name = string("op_2136_axes_0"), val = tensor<int32, [1]>([3])];
+            tensor<fp16, [1, 2048, 1]> var_2136_cast_fp16 = squeeze(axes = var_2136_axes_0, x = logits_9_cast_fp16)[name = string("op_2136_cast_fp16")];
+            string logits_11_pad_type_0 = const()[name = string("logits_11_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> logits_11_strides_0 = const()[name = string("logits_11_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> logits_11_pad_0 = const()[name = string("logits_11_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> logits_11_dilations_0 = const()[name = string("logits_11_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 logits_11_groups_0 = const()[name = string("logits_11_groups_0"), val = int32(1)];
+            tensor<fp16, [2048, 1024, 1, 1]> lm_heads_5_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [2048, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(91295552))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(93392768))))[name = string("lm_heads_5_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1, 1]> logits_11_cast_fp16 = conv(dilations = logits_11_dilations_0, groups = logits_11_groups_0, pad = logits_11_pad_0, pad_type = logits_11_pad_type_0, strides = logits_11_strides_0, weight = lm_heads_5_weight_to_fp16_palettized, x = input_cast_fp16)[name = string("logits_11_cast_fp16")];
+            tensor<int32, [1]> var_2152_axes_0 = const()[name = string("op_2152_axes_0"), val = tensor<int32, [1]>([3])];
+            tensor<fp16, [1, 2048, 1]> var_2152_cast_fp16 = squeeze(axes = var_2152_axes_0, x = logits_11_cast_fp16)[name = string("op_2152_cast_fp16")];
+            string logits_13_pad_type_0 = const()[name = string("logits_13_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> logits_13_strides_0 = const()[name = string("logits_13_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> logits_13_pad_0 = const()[name = string("logits_13_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> logits_13_dilations_0 = const()[name = string("logits_13_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 logits_13_groups_0 = const()[name = string("logits_13_groups_0"), val = int32(1)];
+            tensor<fp16, [2048, 1024, 1, 1]> lm_heads_6_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [2048, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(93393344))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(95490560))))[name = string("lm_heads_6_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1, 1]> logits_13_cast_fp16 = conv(dilations = logits_13_dilations_0, groups = logits_13_groups_0, pad = logits_13_pad_0, pad_type = logits_13_pad_type_0, strides = logits_13_strides_0, weight = lm_heads_6_weight_to_fp16_palettized, x = input_cast_fp16)[name = string("logits_13_cast_fp16")];
+            tensor<int32, [1]> var_2168_axes_0 = const()[name = string("op_2168_axes_0"), val = tensor<int32, [1]>([3])];
+            tensor<fp16, [1, 2048, 1]> var_2168_cast_fp16 = squeeze(axes = var_2168_axes_0, x = logits_13_cast_fp16)[name = string("op_2168_cast_fp16")];
+            string logits_15_pad_type_0 = const()[name = string("logits_15_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> logits_15_strides_0 = const()[name = string("logits_15_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> logits_15_pad_0 = const()[name = string("logits_15_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> logits_15_dilations_0 = const()[name = string("logits_15_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 logits_15_groups_0 = const()[name = string("logits_15_groups_0"), val = int32(1)];
+            tensor<fp16, [2048, 1024, 1, 1]> lm_heads_7_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [2048, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(95491136))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(97588352))))[name = string("lm_heads_7_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1, 1]> logits_15_cast_fp16 = conv(dilations = logits_15_dilations_0, groups = logits_15_groups_0, pad = logits_15_pad_0, pad_type = logits_15_pad_type_0, strides = logits_15_strides_0, weight = lm_heads_7_weight_to_fp16_palettized, x = input_cast_fp16)[name = string("logits_15_cast_fp16")];
+            tensor<int32, [1]> var_2184_axes_0 = const()[name = string("op_2184_axes_0"), val = tensor<int32, [1]>([3])];
+            tensor<fp16, [1, 2048, 1]> var_2184_cast_fp16 = squeeze(axes = var_2184_axes_0, x = logits_15_cast_fp16)[name = string("op_2184_cast_fp16")];
+            string logits_17_pad_type_0 = const()[name = string("logits_17_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> logits_17_strides_0 = const()[name = string("logits_17_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> logits_17_pad_0 = const()[name = string("logits_17_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> logits_17_dilations_0 = const()[name = string("logits_17_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 logits_17_groups_0 = const()[name = string("logits_17_groups_0"), val = int32(1)];
+            tensor<fp16, [2048, 1024, 1, 1]> lm_heads_8_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [2048, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(97588928))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(99686144))))[name = string("lm_heads_8_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1, 1]> logits_17_cast_fp16 = conv(dilations = logits_17_dilations_0, groups = logits_17_groups_0, pad = logits_17_pad_0, pad_type = logits_17_pad_type_0, strides = logits_17_strides_0, weight = lm_heads_8_weight_to_fp16_palettized, x = input_cast_fp16)[name = string("logits_17_cast_fp16")];
+            tensor<int32, [1]> var_2200_axes_0 = const()[name = string("op_2200_axes_0"), val = tensor<int32, [1]>([3])];
+            tensor<fp16, [1, 2048, 1]> var_2200_cast_fp16 = squeeze(axes = var_2200_axes_0, x = logits_17_cast_fp16)[name = string("op_2200_cast_fp16")];
+            string logits_19_pad_type_0 = const()[name = string("logits_19_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> logits_19_strides_0 = const()[name = string("logits_19_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> logits_19_pad_0 = const()[name = string("logits_19_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> logits_19_dilations_0 = const()[name = string("logits_19_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 logits_19_groups_0 = const()[name = string("logits_19_groups_0"), val = int32(1)];
+            tensor<fp16, [2048, 1024, 1, 1]> lm_heads_9_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [2048, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(99686720))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(101783936))))[name = string("lm_heads_9_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1, 1]> logits_19_cast_fp16 = conv(dilations = logits_19_dilations_0, groups = logits_19_groups_0, pad = logits_19_pad_0, pad_type = logits_19_pad_type_0, strides = logits_19_strides_0, weight = lm_heads_9_weight_to_fp16_palettized, x = input_cast_fp16)[name = string("logits_19_cast_fp16")];
+            tensor<int32, [1]> var_2216_axes_0 = const()[name = string("op_2216_axes_0"), val = tensor<int32, [1]>([3])];
+            tensor<fp16, [1, 2048, 1]> var_2216_cast_fp16 = squeeze(axes = var_2216_axes_0, x = logits_19_cast_fp16)[name = string("op_2216_cast_fp16")];
+            string logits_21_pad_type_0 = const()[name = string("logits_21_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> logits_21_strides_0 = const()[name = string("logits_21_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> logits_21_pad_0 = const()[name = string("logits_21_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> logits_21_dilations_0 = const()[name = string("logits_21_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 logits_21_groups_0 = const()[name = string("logits_21_groups_0"), val = int32(1)];
+            tensor<fp16, [2048, 1024, 1, 1]> lm_heads_10_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [2048, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(101784512))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(103881728))))[name = string("lm_heads_10_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1, 1]> logits_21_cast_fp16 = conv(dilations = logits_21_dilations_0, groups = logits_21_groups_0, pad = logits_21_pad_0, pad_type = logits_21_pad_type_0, strides = logits_21_strides_0, weight = lm_heads_10_weight_to_fp16_palettized, x = input_cast_fp16)[name = string("logits_21_cast_fp16")];
+            tensor<int32, [1]> var_2232_axes_0 = const()[name = string("op_2232_axes_0"), val = tensor<int32, [1]>([3])];
+            tensor<fp16, [1, 2048, 1]> var_2232_cast_fp16 = squeeze(axes = var_2232_axes_0, x = logits_21_cast_fp16)[name = string("op_2232_cast_fp16")];
+            string logits_23_pad_type_0 = const()[name = string("logits_23_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> logits_23_strides_0 = const()[name = string("logits_23_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> logits_23_pad_0 = const()[name = string("logits_23_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> logits_23_dilations_0 = const()[name = string("logits_23_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 logits_23_groups_0 = const()[name = string("logits_23_groups_0"), val = int32(1)];
+            tensor<fp16, [2048, 1024, 1, 1]> lm_heads_11_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [2048, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(103882304))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(105979520))))[name = string("lm_heads_11_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1, 1]> logits_23_cast_fp16 = conv(dilations = logits_23_dilations_0, groups = logits_23_groups_0, pad = logits_23_pad_0, pad_type = logits_23_pad_type_0, strides = logits_23_strides_0, weight = lm_heads_11_weight_to_fp16_palettized, x = input_cast_fp16)[name = string("logits_23_cast_fp16")];
+            tensor<int32, [1]> var_2248_axes_0 = const()[name = string("op_2248_axes_0"), val = tensor<int32, [1]>([3])];
+            tensor<fp16, [1, 2048, 1]> var_2248_cast_fp16 = squeeze(axes = var_2248_axes_0, x = logits_23_cast_fp16)[name = string("op_2248_cast_fp16")];
+            string logits_25_pad_type_0 = const()[name = string("logits_25_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> logits_25_strides_0 = const()[name = string("logits_25_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> logits_25_pad_0 = const()[name = string("logits_25_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> logits_25_dilations_0 = const()[name = string("logits_25_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 logits_25_groups_0 = const()[name = string("logits_25_groups_0"), val = int32(1)];
+            tensor<fp16, [2048, 1024, 1, 1]> lm_heads_12_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [2048, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(105980096))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(108077312))))[name = string("lm_heads_12_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1, 1]> logits_25_cast_fp16 = conv(dilations = logits_25_dilations_0, groups = logits_25_groups_0, pad = logits_25_pad_0, pad_type = logits_25_pad_type_0, strides = logits_25_strides_0, weight = lm_heads_12_weight_to_fp16_palettized, x = input_cast_fp16)[name = string("logits_25_cast_fp16")];
+            tensor<int32, [1]> var_2264_axes_0 = const()[name = string("op_2264_axes_0"), val = tensor<int32, [1]>([3])];
+            tensor<fp16, [1, 2048, 1]> var_2264_cast_fp16 = squeeze(axes = var_2264_axes_0, x = logits_25_cast_fp16)[name = string("op_2264_cast_fp16")];
+            string logits_27_pad_type_0 = const()[name = string("logits_27_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> logits_27_strides_0 = const()[name = string("logits_27_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> logits_27_pad_0 = const()[name = string("logits_27_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> logits_27_dilations_0 = const()[name = string("logits_27_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 logits_27_groups_0 = const()[name = string("logits_27_groups_0"), val = int32(1)];
+            tensor<fp16, [2048, 1024, 1, 1]> lm_heads_13_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [2048, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(108077888))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(110175104))))[name = string("lm_heads_13_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1, 1]> logits_27_cast_fp16 = conv(dilations = logits_27_dilations_0, groups = logits_27_groups_0, pad = logits_27_pad_0, pad_type = logits_27_pad_type_0, strides = logits_27_strides_0, weight = lm_heads_13_weight_to_fp16_palettized, x = input_cast_fp16)[name = string("logits_27_cast_fp16")];
+            tensor<int32, [1]> var_2280_axes_0 = const()[name = string("op_2280_axes_0"), val = tensor<int32, [1]>([3])];
+            tensor<fp16, [1, 2048, 1]> var_2280_cast_fp16 = squeeze(axes = var_2280_axes_0, x = logits_27_cast_fp16)[name = string("op_2280_cast_fp16")];
+            string logits_29_pad_type_0 = const()[name = string("logits_29_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> logits_29_strides_0 = const()[name = string("logits_29_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> logits_29_pad_0 = const()[name = string("logits_29_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> logits_29_dilations_0 = const()[name = string("logits_29_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 logits_29_groups_0 = const()[name = string("logits_29_groups_0"), val = int32(1)];
+            tensor<fp16, [2048, 1024, 1, 1]> lm_heads_14_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [2048, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(110175680))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(112272896))))[name = string("lm_heads_14_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1, 1]> logits_29_cast_fp16 = conv(dilations = logits_29_dilations_0, groups = logits_29_groups_0, pad = logits_29_pad_0, pad_type = logits_29_pad_type_0, strides = logits_29_strides_0, weight = lm_heads_14_weight_to_fp16_palettized, x = input_cast_fp16)[name = string("logits_29_cast_fp16")];
+            tensor<int32, [1]> var_2296_axes_0 = const()[name = string("op_2296_axes_0"), val = tensor<int32, [1]>([3])];
+            tensor<fp16, [1, 2048, 1]> var_2296_cast_fp16 = squeeze(axes = var_2296_axes_0, x = logits_29_cast_fp16)[name = string("op_2296_cast_fp16")];
+            bool var_2302_interleave_0 = const()[name = string("op_2302_interleave_0"), val = bool(false)];
+            int32 const_119 = const()[name = string("const_119"), val = int32(2)];
+            tensor<fp16, [1, 2048, 15]> var_2302_cast_fp16 = concat(axis = const_119, interleave = var_2302_interleave_0, values = (var_2072_cast_fp16, var_2088_cast_fp16, var_2104_cast_fp16, var_2120_cast_fp16, var_2136_cast_fp16, var_2152_cast_fp16, var_2168_cast_fp16, var_2184_cast_fp16, var_2200_cast_fp16, var_2216_cast_fp16, var_2232_cast_fp16, var_2248_cast_fp16, var_2264_cast_fp16, var_2280_cast_fp16, var_2296_cast_fp16))[name = string("op_2302_cast_fp16")];
+            int32 var_2304 = const()[name = string("op_2304"), val = int32(1)];
+            bool var_2305_interleave_0 = const()[name = string("op_2305_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 5120, 1, 1]> key_cache_updates = concat(axis = var_2304, interleave = var_2305_interleave_0, values = (current_key_3_cast_fp16, current_key_7_cast_fp16, current_key_11_cast_fp16, current_key_15_cast_fp16, current_key_cast_fp16))[name = string("op_2305_cast_fp16")];
+            int32 var_2307 = const()[name = string("op_2307"), val = int32(1)];
+            bool var_2308_interleave_0 = const()[name = string("op_2308_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 5120, 1, 1]> value_cache_updates = concat(axis = var_2307, interleave = var_2308_interleave_0, values = (current_value_1_cast_fp16, current_value_3_cast_fp16, current_value_5_cast_fp16, current_value_7_cast_fp16, current_value_cast_fp16))[name = string("op_2308_cast_fp16")];
+            tensor<int32, [3]> transpose_0_perm_0 = const()[name = string("transpose_0_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<fp16, [1, 15, 2048]> all_logits = transpose(perm = transpose_0_perm_0, x = var_2302_cast_fp16)[name = string("transpose_0")];
+        } -> (all_logits, key_cache_updates, value_cache_updates);
+}
\ No newline at end of file
diff --git a/qwen3_tts/multi_code_decoder/12hz-1.7b-customvoice/W8A16/MultiCodeDecoder.mlmodelc/weights/weight.bin b/qwen3_tts/multi_code_decoder/12hz-1.7b-customvoice/W8A16/MultiCodeDecoder.mlmodelc/weights/weight.bin
new file mode 100644
index 0000000000000000000000000000000000000000..bbc922773a9a5f79771d0ff97ee225732af672b2
--- /dev/null
+++ b/qwen3_tts/multi_code_decoder/12hz-1.7b-customvoice/W8A16/MultiCodeDecoder.mlmodelc/weights/weight.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3c3d559c2413b8bffbd5f1d4e1a3cab6b084e66c369c8d02f8d1e5cee509cc85
+size 112273472
diff --git a/qwen3_tts/multi_code_embedder/12hz-0.6b-customvoice/W16A16/MultiCodeEmbedder.mlmodelc/analytics/coremldata.bin b/qwen3_tts/multi_code_embedder/12hz-0.6b-customvoice/W16A16/MultiCodeEmbedder.mlmodelc/analytics/coremldata.bin
new file mode 100644
index 0000000000000000000000000000000000000000..caeb08a2f587d99cb61511680330a7a0d42459f3
--- /dev/null
+++ b/qwen3_tts/multi_code_embedder/12hz-0.6b-customvoice/W16A16/MultiCodeEmbedder.mlmodelc/analytics/coremldata.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:18324dfd306ef038fcb0e0a279809ca8f8ea24e92d6bac7c15fbd81df6102220
+size 243
diff --git a/qwen3_tts/multi_code_embedder/12hz-0.6b-customvoice/W16A16/MultiCodeEmbedder.mlmodelc/coremldata.bin b/qwen3_tts/multi_code_embedder/12hz-0.6b-customvoice/W16A16/MultiCodeEmbedder.mlmodelc/coremldata.bin
new file mode 100644
index 0000000000000000000000000000000000000000..05693ac9510271ad0cdd1f3460bf3524fe7d2779
--- /dev/null
+++ b/qwen3_tts/multi_code_embedder/12hz-0.6b-customvoice/W16A16/MultiCodeEmbedder.mlmodelc/coremldata.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:08f3c2e364b7d50fd16600725143bdfe80127688d9dd05a9391a90fdcf30d29c
+size 380
diff --git a/qwen3_tts/multi_code_embedder/12hz-0.6b-customvoice/W16A16/MultiCodeEmbedder.mlmodelc/metadata.json b/qwen3_tts/multi_code_embedder/12hz-0.6b-customvoice/W16A16/MultiCodeEmbedder.mlmodelc/metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..2e2458e703d7af73d16e2278732f2df14b9b7cb9
--- /dev/null
+++ b/qwen3_tts/multi_code_embedder/12hz-0.6b-customvoice/W16A16/MultiCodeEmbedder.mlmodelc/metadata.json
@@ -0,0 +1,66 @@
+[
+  {
+    "metadataOutputVersion" : "3.0",
+    "storagePrecision" : "Float16",
+    "outputSchema" : [
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 1024 × 1 × 1)",
+        "shortDescription" : "",
+        "shape" : "[1, 1024, 1, 1]",
+        "name" : "input_embeds",
+        "type" : "MultiArray"
+      }
+    ],
+    "modelParameters" : [
+
+    ],
+    "specificationVersion" : 9,
+    "mlProgramOperationTypeHistogram" : {
+      "Ios18.greaterEqual" : 1,
+      "Ios18.add" : 1,
+      "Ios18.cast" : 3,
+      "Select" : 1,
+      "Ios18.gather" : 1,
+      "Ios18.expandDims" : 2
+    },
+    "computePrecision" : "Mixed (Float16, Int16, Int32)",
+    "isUpdatable" : "0",
+    "stateSchema" : [
+
+    ],
+    "availability" : {
+      "macOS" : "15.0",
+      "tvOS" : "18.0",
+      "visionOS" : "2.0",
+      "watchOS" : "11.0",
+      "iOS" : "18.0",
+      "macCatalyst" : "18.0"
+    },
+    "modelType" : {
+      "name" : "MLModelType_mlProgram"
+    },
+    "userDefinedMetadata" : {
+      "com.github.apple.coremltools.conversion_date" : "2026-02-06",
+      "com.github.apple.coremltools.source" : "torch==2.8.0",
+      "com.github.apple.coremltools.version" : "9.0",
+      "com.github.apple.coremltools.source_dialect" : "TorchScript"
+    },
+    "inputSchema" : [
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Int32",
+        "formattedType" : "MultiArray (Int32 1)",
+        "shortDescription" : "",
+        "shape" : "[1]",
+        "name" : "input_ids",
+        "type" : "MultiArray"
+      }
+    ],
+    "generatedClassName" : "MultiCodeEmbedder",
+    "method" : "predict"
+  }
+]
\ No newline at end of file
diff --git a/qwen3_tts/multi_code_embedder/12hz-0.6b-customvoice/W16A16/MultiCodeEmbedder.mlmodelc/model.mil b/qwen3_tts/multi_code_embedder/12hz-0.6b-customvoice/W16A16/MultiCodeEmbedder.mlmodelc/model.mil
new file mode 100644
index 0000000000000000000000000000000000000000..3940a2d3c86c5e933dda2c05ddeb568a9dbc3773
--- /dev/null
+++ b/qwen3_tts/multi_code_embedder/12hz-0.6b-customvoice/W16A16/MultiCodeEmbedder.mlmodelc/model.mil
@@ -0,0 +1,26 @@
+program(1.3)
+[buildInfo = dict<string, string>({{"coremlc-component-MIL", "3510.2.1"}, {"coremlc-version", "3500.32.1"}, {"coremltools-component-torch", "2.8.0"}, {"coremltools-source-dialect", "TorchScript"}, {"coremltools-version", "9.0"}})]
+{
+    func main<ios18>(tensor<int32, [1]> input_ids) {
+            int32 embeddings_batch_dims_0 = const()[name = string("embeddings_batch_dims_0"), val = int32(0)];
+            bool embeddings_validate_indices_0 = const()[name = string("embeddings_validate_indices_0"), val = bool(false)];
+            tensor<fp16, [30720, 1024]> codec_embedding_weight_to_fp16 = const()[name = string("codec_embedding_weight_to_fp16"), val = tensor<fp16, [30720, 1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(64)))];
+            string input_ids_to_int16_dtype_0 = const()[name = string("input_ids_to_int16_dtype_0"), val = string("int16")];
+            string cast_2_dtype_0 = const()[name = string("cast_2_dtype_0"), val = string("int32")];
+            int32 greater_equal_0_y_0 = const()[name = string("greater_equal_0_y_0"), val = int32(0)];
+            tensor<int16, [1]> input_ids_to_int16 = cast(dtype = input_ids_to_int16_dtype_0, x = input_ids)[name = string("cast_5")];
+            tensor<int32, [1]> cast_2 = cast(dtype = cast_2_dtype_0, x = input_ids_to_int16)[name = string("cast_4")];
+            tensor<bool, [1]> greater_equal_0 = greater_equal(x = cast_2, y = greater_equal_0_y_0)[name = string("greater_equal_0")];
+            int32 slice_by_index_0 = const()[name = string("slice_by_index_0"), val = int32(30720)];
+            tensor<int32, [1]> add_0 = add(x = cast_2, y = slice_by_index_0)[name = string("add_0")];
+            tensor<int32, [1]> select_0 = select(a = cast_2, b = add_0, cond = greater_equal_0)[name = string("select_0")];
+            int32 embeddings_cast_fp16_cast_uint16_axis_0 = const()[name = string("embeddings_cast_fp16_cast_uint16_axis_0"), val = int32(0)];
+            string select_0_to_int16_dtype_0 = const()[name = string("select_0_to_int16_dtype_0"), val = string("int16")];
+            tensor<int16, [1]> select_0_to_int16 = cast(dtype = select_0_to_int16_dtype_0, x = select_0)[name = string("cast_3")];
+            tensor<fp16, [1, 1024]> embeddings_cast_fp16_cast_uint16_cast_uint16 = gather(axis = embeddings_cast_fp16_cast_uint16_axis_0, batch_dims = embeddings_batch_dims_0, indices = select_0_to_int16, validate_indices = embeddings_validate_indices_0, x = codec_embedding_weight_to_fp16)[name = string("embeddings_cast_fp16_cast_uint16_cast_uint16")];
+            tensor<int32, [1]> var_18_axes_0 = const()[name = string("op_18_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 1024, 1]> var_18_cast_fp16 = expand_dims(axes = var_18_axes_0, x = embeddings_cast_fp16_cast_uint16_cast_uint16)[name = string("op_18_cast_fp16")];
+            tensor<int32, [1]> var_20_axes_0 = const()[name = string("op_20_axes_0"), val = tensor<int32, [1]>([3])];
+            tensor<fp16, [1, 1024, 1, 1]> input_embeds = expand_dims(axes = var_20_axes_0, x = var_18_cast_fp16)[name = string("op_20_cast_fp16")];
+        } -> (input_embeds);
+}
\ No newline at end of file
diff --git a/qwen3_tts/multi_code_embedder/12hz-0.6b-customvoice/W16A16/MultiCodeEmbedder.mlmodelc/weights/weight.bin b/qwen3_tts/multi_code_embedder/12hz-0.6b-customvoice/W16A16/MultiCodeEmbedder.mlmodelc/weights/weight.bin
new file mode 100644
index 0000000000000000000000000000000000000000..b9f3d33cde2e0df6efbd131929e17a5c783d627c
--- /dev/null
+++ b/qwen3_tts/multi_code_embedder/12hz-0.6b-customvoice/W16A16/MultiCodeEmbedder.mlmodelc/weights/weight.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f318d23d7db53b299b3aa472844a721d2b6245550fb57558b0d2c5ebbcd3fd82
+size 62914688
diff --git a/qwen3_tts/multi_code_embedder/12hz-1.7b-customvoice/W16A16/MultiCodeEmbedder.mlmodelc/analytics/coremldata.bin b/qwen3_tts/multi_code_embedder/12hz-1.7b-customvoice/W16A16/MultiCodeEmbedder.mlmodelc/analytics/coremldata.bin
new file mode 100644
index 0000000000000000000000000000000000000000..1e7c68afde44cc24aac397be10f6a350881b2451
--- /dev/null
+++ b/qwen3_tts/multi_code_embedder/12hz-1.7b-customvoice/W16A16/MultiCodeEmbedder.mlmodelc/analytics/coremldata.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:59e7994cafd7a43d8c1af61044f330357ba05ee9817d11a3e51bcc3000d598f0
+size 243
diff --git a/qwen3_tts/multi_code_embedder/12hz-1.7b-customvoice/W16A16/MultiCodeEmbedder.mlmodelc/coremldata.bin b/qwen3_tts/multi_code_embedder/12hz-1.7b-customvoice/W16A16/MultiCodeEmbedder.mlmodelc/coremldata.bin
new file mode 100644
index 0000000000000000000000000000000000000000..37e55680daa5800f8540e0aa647b760e669a39de
--- /dev/null
+++ b/qwen3_tts/multi_code_embedder/12hz-1.7b-customvoice/W16A16/MultiCodeEmbedder.mlmodelc/coremldata.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d20121b9a714012ce61f8257ebd047be23c2994a7ef0a1c40225fdbdc72c2dd8
+size 380
diff --git a/qwen3_tts/multi_code_embedder/12hz-1.7b-customvoice/W16A16/MultiCodeEmbedder.mlmodelc/metadata.json b/qwen3_tts/multi_code_embedder/12hz-1.7b-customvoice/W16A16/MultiCodeEmbedder.mlmodelc/metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..5cae6752b6825b0f3176a40284a9421badde0709
--- /dev/null
+++ b/qwen3_tts/multi_code_embedder/12hz-1.7b-customvoice/W16A16/MultiCodeEmbedder.mlmodelc/metadata.json
@@ -0,0 +1,66 @@
+[
+  {
+    "metadataOutputVersion" : "3.0",
+    "storagePrecision" : "Float16",
+    "outputSchema" : [
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 2048 × 1 × 1)",
+        "shortDescription" : "",
+        "shape" : "[1, 2048, 1, 1]",
+        "name" : "input_embeds",
+        "type" : "MultiArray"
+      }
+    ],
+    "modelParameters" : [
+
+    ],
+    "specificationVersion" : 9,
+    "mlProgramOperationTypeHistogram" : {
+      "Ios18.greaterEqual" : 1,
+      "Ios18.add" : 1,
+      "Ios18.cast" : 3,
+      "Select" : 1,
+      "Ios18.gather" : 1,
+      "Ios18.expandDims" : 2
+    },
+    "computePrecision" : "Mixed (Float16, Int16, Int32)",
+    "isUpdatable" : "0",
+    "stateSchema" : [
+
+    ],
+    "availability" : {
+      "macOS" : "15.0",
+      "tvOS" : "18.0",
+      "visionOS" : "2.0",
+      "watchOS" : "11.0",
+      "iOS" : "18.0",
+      "macCatalyst" : "18.0"
+    },
+    "modelType" : {
+      "name" : "MLModelType_mlProgram"
+    },
+    "userDefinedMetadata" : {
+      "com.github.apple.coremltools.conversion_date" : "2026-02-12",
+      "com.github.apple.coremltools.source" : "torch==2.8.0",
+      "com.github.apple.coremltools.version" : "9.0",
+      "com.github.apple.coremltools.source_dialect" : "TorchScript"
+    },
+    "inputSchema" : [
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Int32",
+        "formattedType" : "MultiArray (Int32 1)",
+        "shortDescription" : "",
+        "shape" : "[1]",
+        "name" : "input_ids",
+        "type" : "MultiArray"
+      }
+    ],
+    "generatedClassName" : "MultiCodeEmbedder",
+    "method" : "predict"
+  }
+]
\ No newline at end of file
diff --git a/qwen3_tts/multi_code_embedder/12hz-1.7b-customvoice/W16A16/MultiCodeEmbedder.mlmodelc/model.mil b/qwen3_tts/multi_code_embedder/12hz-1.7b-customvoice/W16A16/MultiCodeEmbedder.mlmodelc/model.mil
new file mode 100644
index 0000000000000000000000000000000000000000..25841be9b52cb3cd6dcd94d9aba70d125f97e0e6
--- /dev/null
+++ b/qwen3_tts/multi_code_embedder/12hz-1.7b-customvoice/W16A16/MultiCodeEmbedder.mlmodelc/model.mil
@@ -0,0 +1,26 @@
+program(1.3)
+[buildInfo = dict<string, string>({{"coremlc-component-MIL", "3510.2.1"}, {"coremlc-version", "3500.32.1"}, {"coremltools-component-torch", "2.8.0"}, {"coremltools-source-dialect", "TorchScript"}, {"coremltools-version", "9.0"}})]
+{
+    func main<ios18>(tensor<int32, [1]> input_ids) {
+            int32 embeddings_batch_dims_0 = const()[name = string("embeddings_batch_dims_0"), val = int32(0)];
+            bool embeddings_validate_indices_0 = const()[name = string("embeddings_validate_indices_0"), val = bool(false)];
+            tensor<fp16, [30720, 2048]> codec_embedding_weight_to_fp16 = const()[name = string("codec_embedding_weight_to_fp16"), val = tensor<fp16, [30720, 2048]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(64)))];
+            string input_ids_to_int16_dtype_0 = const()[name = string("input_ids_to_int16_dtype_0"), val = string("int16")];
+            string cast_2_dtype_0 = const()[name = string("cast_2_dtype_0"), val = string("int32")];
+            int32 greater_equal_0_y_0 = const()[name = string("greater_equal_0_y_0"), val = int32(0)];
+            tensor<int16, [1]> input_ids_to_int16 = cast(dtype = input_ids_to_int16_dtype_0, x = input_ids)[name = string("cast_5")];
+            tensor<int32, [1]> cast_2 = cast(dtype = cast_2_dtype_0, x = input_ids_to_int16)[name = string("cast_4")];
+            tensor<bool, [1]> greater_equal_0 = greater_equal(x = cast_2, y = greater_equal_0_y_0)[name = string("greater_equal_0")];
+            int32 slice_by_index_0 = const()[name = string("slice_by_index_0"), val = int32(30720)];
+            tensor<int32, [1]> add_0 = add(x = cast_2, y = slice_by_index_0)[name = string("add_0")];
+            tensor<int32, [1]> select_0 = select(a = cast_2, b = add_0, cond = greater_equal_0)[name = string("select_0")];
+            int32 embeddings_cast_fp16_cast_uint16_axis_0 = const()[name = string("embeddings_cast_fp16_cast_uint16_axis_0"), val = int32(0)];
+            string select_0_to_int16_dtype_0 = const()[name = string("select_0_to_int16_dtype_0"), val = string("int16")];
+            tensor<int16, [1]> select_0_to_int16 = cast(dtype = select_0_to_int16_dtype_0, x = select_0)[name = string("cast_3")];
+            tensor<fp16, [1, 2048]> embeddings_cast_fp16_cast_uint16_cast_uint16 = gather(axis = embeddings_cast_fp16_cast_uint16_axis_0, batch_dims = embeddings_batch_dims_0, indices = select_0_to_int16, validate_indices = embeddings_validate_indices_0, x = codec_embedding_weight_to_fp16)[name = string("embeddings_cast_fp16_cast_uint16_cast_uint16")];
+            tensor<int32, [1]> var_18_axes_0 = const()[name = string("op_18_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2048, 1]> var_18_cast_fp16 = expand_dims(axes = var_18_axes_0, x = embeddings_cast_fp16_cast_uint16_cast_uint16)[name = string("op_18_cast_fp16")];
+            tensor<int32, [1]> var_20_axes_0 = const()[name = string("op_20_axes_0"), val = tensor<int32, [1]>([3])];
+            tensor<fp16, [1, 2048, 1, 1]> input_embeds = expand_dims(axes = var_20_axes_0, x = var_18_cast_fp16)[name = string("op_20_cast_fp16")];
+        } -> (input_embeds);
+}
\ No newline at end of file
diff --git a/qwen3_tts/multi_code_embedder/12hz-1.7b-customvoice/W16A16/MultiCodeEmbedder.mlmodelc/weights/weight.bin b/qwen3_tts/multi_code_embedder/12hz-1.7b-customvoice/W16A16/MultiCodeEmbedder.mlmodelc/weights/weight.bin
new file mode 100644
index 0000000000000000000000000000000000000000..863f3ab4f942f20325a2f39e8e7e0450b70b53cb
--- /dev/null
+++ b/qwen3_tts/multi_code_embedder/12hz-1.7b-customvoice/W16A16/MultiCodeEmbedder.mlmodelc/weights/weight.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:46f8ac7940915bf46cb15cd16ac06c1cf306044aebf6ade6253831539e61cb88
+size 125829248
diff --git a/qwen3_tts/speech_decoder/12hz-0.6b-customvoice/W8A16/SpeechDecoder.mlmodelc/analytics/coremldata.bin b/qwen3_tts/speech_decoder/12hz-0.6b-customvoice/W8A16/SpeechDecoder.mlmodelc/analytics/coremldata.bin
new file mode 100644
index 0000000000000000000000000000000000000000..a677ab12269e686b93b07e6bd557c291f3f5f492
--- /dev/null
+++ b/qwen3_tts/speech_decoder/12hz-0.6b-customvoice/W8A16/SpeechDecoder.mlmodelc/analytics/coremldata.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6e5662e1e7fb3aa28cbbf01e35c3dd4cc11c795ce151f05068ec220092157eb8
+size 243
diff --git a/qwen3_tts/speech_decoder/12hz-0.6b-customvoice/W8A16/SpeechDecoder.mlmodelc/coremldata.bin b/qwen3_tts/speech_decoder/12hz-0.6b-customvoice/W8A16/SpeechDecoder.mlmodelc/coremldata.bin
new file mode 100644
index 0000000000000000000000000000000000000000..20bd7894e16a48d9dc2a761c13cb4d858be291e1
--- /dev/null
+++ b/qwen3_tts/speech_decoder/12hz-0.6b-customvoice/W8A16/SpeechDecoder.mlmodelc/coremldata.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:608c8c933a512f362e79193430195402b64781ee57731099225c2e83ab9dbb8d
+size 681
diff --git a/qwen3_tts/speech_decoder/12hz-0.6b-customvoice/W8A16/SpeechDecoder.mlmodelc/metadata.json b/qwen3_tts/speech_decoder/12hz-0.6b-customvoice/W8A16/SpeechDecoder.mlmodelc/metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..183d143151889360ad6452b7111c6f8aa47ace3e
--- /dev/null
+++ b/qwen3_tts/speech_decoder/12hz-0.6b-customvoice/W8A16/SpeechDecoder.mlmodelc/metadata.json
@@ -0,0 +1,178 @@
+[
+  {
+    "metadataOutputVersion" : "3.0",
+    "storagePrecision" : "Mixed (Float16, Int32, Palettized (8 bits), UInt8)",
+    "outputSchema" : [
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 1 × 1 × 1920)",
+        "shortDescription" : "",
+        "shape" : "[1, 1, 1, 1920]",
+        "name" : "audio",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 8192 × 1 × 1)",
+        "shortDescription" : "",
+        "shape" : "[1, 8192, 1, 1]",
+        "name" : "key_cache_updates",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 8192 × 1 × 1)",
+        "shortDescription" : "",
+        "shape" : "[1, 8192, 1, 1]",
+        "name" : "value_cache_updates",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 1024 × 1 × 1)",
+        "shortDescription" : "",
+        "shape" : "[1, 1024, 1, 1]",
+        "name" : "hidden_context_update",
+        "type" : "MultiArray"
+      }
+    ],
+    "modelParameters" : [
+
+    ],
+    "specificationVersion" : 9,
+    "mlProgramOperationTypeHistogram" : {
+      "Ios18.expandDims" : 26,
+      "Ios18.mul" : 234,
+      "Ios18.softmax" : 8,
+      "Ios18.rsqrt" : 17,
+      "Ios18.matmul" : 16,
+      "Ios16.reduceMean" : 17,
+      "Split" : 3,
+      "Ios18.greaterEqual" : 1,
+      "Select" : 1,
+      "Ios18.sin" : 29,
+      "Tile" : 2,
+      "Ios18.gather" : 18,
+      "Ios18.add" : 132,
+      "Ios18.layerNorm" : 2,
+      "Ios18.reshape" : 56,
+      "Pad" : 17,
+      "Ios18.constexprLutToDense" : 116,
+      "Ios18.conv" : 93,
+      "Ios18.concat" : 19,
+      "Ios18.transpose" : 30,
+      "Ios18.sub" : 1,
+      "Ios18.cast" : 19,
+      "Ios18.silu" : 8,
+      "Ios18.gelu" : 2,
+      "Ios18.clip" : 1,
+      "Ios18.convTranspose" : 6,
+      "Ios18.sliceByIndex" : 39,
+      "Ios18.squeeze" : 18
+    },
+    "computePrecision" : "Mixed (Float16, Float32, Int16, Int32, UInt16)",
+    "isUpdatable" : "0",
+    "stateSchema" : [
+
+    ],
+    "availability" : {
+      "macOS" : "15.0",
+      "tvOS" : "18.0",
+      "visionOS" : "2.0",
+      "watchOS" : "11.0",
+      "iOS" : "18.0",
+      "macCatalyst" : "18.0"
+    },
+    "modelType" : {
+      "name" : "MLModelType_mlProgram"
+    },
+    "userDefinedMetadata" : {
+      "com.github.apple.coremltools.conversion_date" : "2026-02-09",
+      "com.github.apple.coremltools.source" : "torch==2.8.0",
+      "com.github.apple.coremltools.version" : "9.0",
+      "com.github.apple.coremltools.source_dialect" : "TorchScript"
+    },
+    "inputSchema" : [
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Int32",
+        "formattedType" : "MultiArray (Int32 1 × 16 × 1)",
+        "shortDescription" : "",
+        "shape" : "[1, 16, 1]",
+        "name" : "audio_codes",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Int32",
+        "formattedType" : "MultiArray (Int32 1)",
+        "shortDescription" : "",
+        "shape" : "[1]",
+        "name" : "cache_length",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 8192 × 1 × 256)",
+        "shortDescription" : "",
+        "shape" : "[1, 8192, 1, 256]",
+        "name" : "key_cache",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 8192 × 1 × 256)",
+        "shortDescription" : "",
+        "shape" : "[1, 8192, 1, 256]",
+        "name" : "value_cache",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 256)",
+        "shortDescription" : "",
+        "shape" : "[1, 256]",
+        "name" : "kv_cache_update_mask",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 256)",
+        "shortDescription" : "",
+        "shape" : "[1, 256]",
+        "name" : "key_padding_mask",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 1024 × 1 × 4)",
+        "shortDescription" : "",
+        "shape" : "[1, 1024, 1, 4]",
+        "name" : "hidden_context",
+        "type" : "MultiArray"
+      }
+    ],
+    "generatedClassName" : "SpeechDecoder_8_bit",
+    "method" : "predict"
+  }
+]
\ No newline at end of file
diff --git a/qwen3_tts/speech_decoder/12hz-0.6b-customvoice/W8A16/SpeechDecoder.mlmodelc/model.mil b/qwen3_tts/speech_decoder/12hz-0.6b-customvoice/W8A16/SpeechDecoder.mlmodelc/model.mil
new file mode 100644
index 0000000000000000000000000000000000000000..e544d8b0af597c6f6268bc5198865599b0a6b03c
--- /dev/null
+++ b/qwen3_tts/speech_decoder/12hz-0.6b-customvoice/W8A16/SpeechDecoder.mlmodelc/model.mil
@@ -0,0 +1,2109 @@
+program(1.3)
+[buildInfo = dict<string, string>({{"coremlc-component-MIL", "3510.2.1"}, {"coremlc-version", "3500.32.1"}})]
+{
+    func main<ios18>(tensor<int32, [1, 16, 1]> audio_codes, tensor<int32, [1]> cache_length, tensor<fp16, [1, 1024, 1, 4]> hidden_context, tensor<fp16, [1, 8192, 1, 256]> key_cache, tensor<fp16, [1, 256]> key_padding_mask, tensor<fp16, [1, 256]> kv_cache_update_mask, tensor<fp16, [1, 8192, 1, 256]> value_cache) {
+            int32 var_28 = const()[name = string("op_28"), val = int32(-1)];
+            int32 var_32 = const()[name = string("op_32"), val = int32(1)];
+            tensor<int32, [3]> codes_1_begin_0 = const()[name = string("codes_1_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
+            tensor<int32, [3]> codes_1_end_0 = const()[name = string("codes_1_end_0"), val = tensor<int32, [3]>([1, 1, 1])];
+            tensor<bool, [3]> codes_1_end_mask_0 = const()[name = string("codes_1_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
+            tensor<int32, [1, 1, 1]> codes_1 = slice_by_index(begin = codes_1_begin_0, end = codes_1_end_0, end_mask = codes_1_end_mask_0, x = audio_codes)[name = string("codes_1")];
+            tensor<int32, [3]> var_295 = const()[name = string("op_295"), val = tensor<int32, [3]>([1, 0, 2])];
+            tensor<int32, [1]> squeeze_0_axes_0 = const()[name = string("squeeze_0_axes_0"), val = tensor<int32, [1]>([0])];
+            string var_296_to_int16_dtype_0 = const()[name = string("op_296_to_int16_dtype_0"), val = string("int16")];
+            tensor<int32, [1, 1, 1]> var_296 = transpose(perm = var_295, x = codes_1)[name = string("transpose_29")];
+            tensor<int16, [1, 1, 1]> var_296_to_int16 = cast(dtype = var_296_to_int16_dtype_0, x = var_296)[name = string("cast_18")];
+            tensor<int16, [1, 1]> squeeze_0_cast_uint16 = squeeze(axes = squeeze_0_axes_0, x = var_296_to_int16)[name = string("squeeze_0_cast_uint16")];
+            int32 quantized_1_batch_dims_0 = const()[name = string("quantized_1_batch_dims_0"), val = int32(0)];
+            bool quantized_1_validate_indices_0 = const()[name = string("quantized_1_validate_indices_0"), val = bool(false)];
+            tensor<fp16, [2048, 256]> weight_1_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [2048, 256]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(64))), lut = tensor<fp16, [1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(524416))))[name = string("weight_1_to_fp16_palettized")];
+            string cast_216_dtype_0 = const()[name = string("cast_216_dtype_0"), val = string("int32")];
+            int32 greater_equal_0_y_0 = const()[name = string("greater_equal_0_y_0"), val = int32(0)];
+            tensor<int32, [1, 1]> cast_216 = cast(dtype = cast_216_dtype_0, x = squeeze_0_cast_uint16)[name = string("cast_17")];
+            tensor<bool, [1, 1]> greater_equal_0 = greater_equal(x = cast_216, y = greater_equal_0_y_0)[name = string("greater_equal_0")];
+            int32 slice_by_index_0 = const()[name = string("slice_by_index_0"), val = int32(2048)];
+            tensor<int32, [1, 1]> add_0 = add(x = cast_216, y = slice_by_index_0)[name = string("add_0")];
+            tensor<int32, [1, 1]> select_0 = select(a = cast_216, b = add_0, cond = greater_equal_0)[name = string("select_0")];
+            int32 quantized_1_cast_fp16_cast_uint16_axis_0 = const()[name = string("quantized_1_cast_fp16_cast_uint16_axis_0"), val = int32(0)];
+            string select_0_to_uint16_dtype_0 = const()[name = string("select_0_to_uint16_dtype_0"), val = string("uint16")];
+            tensor<uint16, [1, 1]> select_0_to_uint16 = cast(dtype = select_0_to_uint16_dtype_0, x = select_0)[name = string("cast_16")];
+            tensor<fp16, [1, 1, 256]> quantized_1_cast_fp16_cast_uint16_cast_uint16 = gather(axis = quantized_1_cast_fp16_cast_uint16_axis_0, batch_dims = quantized_1_batch_dims_0, indices = select_0_to_uint16, validate_indices = quantized_1_validate_indices_0, x = weight_1_to_fp16_palettized)[name = string("quantized_1_cast_fp16_cast_uint16_cast_uint16")];
+            tensor<int32, [3]> var_304 = const()[name = string("op_304"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_3_axes_0 = const()[name = string("input_3_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 256, 1]> var_305_cast_fp16 = transpose(perm = var_304, x = quantized_1_cast_fp16_cast_uint16_cast_uint16)[name = string("transpose_28")];
+            tensor<fp16, [1, 256, 1, 1]> input_3_cast_fp16 = expand_dims(axes = input_3_axes_0, x = var_305_cast_fp16)[name = string("input_3_cast_fp16")];
+            string quantized_pad_type_0 = const()[name = string("quantized_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> quantized_strides_0 = const()[name = string("quantized_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> quantized_pad_0 = const()[name = string("quantized_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> quantized_dilations_0 = const()[name = string("quantized_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 quantized_groups_0 = const()[name = string("quantized_groups_0"), val = int32(1)];
+            tensor<fp16, [512, 256, 1, 1]> decoder_quantizer_rvq_first_output_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [512, 256, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(524992))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(656128))))[name = string("decoder_quantizer_rvq_first_output_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 512, 1, 1]> quantized_cast_fp16 = conv(dilations = quantized_dilations_0, groups = quantized_groups_0, pad = quantized_pad_0, pad_type = quantized_pad_type_0, strides = quantized_strides_0, weight = decoder_quantizer_rvq_first_output_proj_weight_to_fp16_palettized, x = input_3_cast_fp16)[name = string("quantized_cast_fp16")];
+            tensor<int32, [3]> codes_begin_0 = const()[name = string("codes_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> codes_end_0 = const()[name = string("codes_end_0"), val = tensor<int32, [3]>([1, 16, 1])];
+            tensor<bool, [3]> codes_end_mask_0 = const()[name = string("codes_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<int32, [1, 15, 1]> codes = slice_by_index(begin = codes_begin_0, end = codes_end_0, end_mask = codes_end_mask_0, x = audio_codes)[name = string("codes")];
+            tensor<int32, [3]> var_315 = const()[name = string("op_315"), val = tensor<int32, [3]>([1, 0, 2])];
+            tensor<int32, [15]> var_317_split_sizes_0 = const()[name = string("op_317_split_sizes_0"), val = tensor<int32, [15]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(656704)))];
+            int32 var_317_axis_0 = const()[name = string("op_317_axis_0"), val = int32(0)];
+            tensor<int32, [15, 1, 1]> var_316 = transpose(perm = var_315, x = codes)[name = string("transpose_27")];
+            tensor<int32, [1, 1, 1]> var_317_0, tensor<int32, [1, 1, 1]> var_317_1, tensor<int32, [1, 1, 1]> var_317_2, tensor<int32, [1, 1, 1]> var_317_3, tensor<int32, [1, 1, 1]> var_317_4, tensor<int32, [1, 1, 1]> var_317_5, tensor<int32, [1, 1, 1]> var_317_6, tensor<int32, [1, 1, 1]> var_317_7, tensor<int32, [1, 1, 1]> var_317_8, tensor<int32, [1, 1, 1]> var_317_9, tensor<int32, [1, 1, 1]> var_317_10, tensor<int32, [1, 1, 1]> var_317_11, tensor<int32, [1, 1, 1]> var_317_12, tensor<int32, [1, 1, 1]> var_317_13, tensor<int32, [1, 1, 1]> var_317_14 = split(axis = var_317_axis_0, split_sizes = var_317_split_sizes_0, x = var_316)[name = string("op_317")];
+            tensor<int32, [1]> squeeze_1_axes_0 = const()[name = string("squeeze_1_axes_0"), val = tensor<int32, [1]>([0])];
+            string var_317_0_to_uint16_dtype_0 = const()[name = string("op_317_0_to_uint16_dtype_0"), val = string("uint16")];
+            tensor<uint16, [1, 1, 1]> var_317_0_to_uint16 = cast(dtype = var_317_0_to_uint16_dtype_0, x = var_317_0)[name = string("cast_15")];
+            tensor<uint16, [1, 1]> squeeze_1_cast_uint16 = squeeze(axes = squeeze_1_axes_0, x = var_317_0_to_uint16)[name = string("squeeze_1_cast_uint16")];
+            tensor<int32, [1]> squeeze_2_axes_0 = const()[name = string("squeeze_2_axes_0"), val = tensor<int32, [1]>([0])];
+            string var_317_1_to_uint16_dtype_0 = const()[name = string("op_317_1_to_uint16_dtype_0"), val = string("uint16")];
+            tensor<uint16, [1, 1, 1]> var_317_1_to_uint16 = cast(dtype = var_317_1_to_uint16_dtype_0, x = var_317_1)[name = string("cast_14")];
+            tensor<uint16, [1, 1]> squeeze_2_cast_uint16 = squeeze(axes = squeeze_2_axes_0, x = var_317_1_to_uint16)[name = string("squeeze_2_cast_uint16")];
+            tensor<int32, [1]> squeeze_3_axes_0 = const()[name = string("squeeze_3_axes_0"), val = tensor<int32, [1]>([0])];
+            string var_317_2_to_uint16_dtype_0 = const()[name = string("op_317_2_to_uint16_dtype_0"), val = string("uint16")];
+            tensor<uint16, [1, 1, 1]> var_317_2_to_uint16 = cast(dtype = var_317_2_to_uint16_dtype_0, x = var_317_2)[name = string("cast_13")];
+            tensor<uint16, [1, 1]> squeeze_3_cast_uint16 = squeeze(axes = squeeze_3_axes_0, x = var_317_2_to_uint16)[name = string("squeeze_3_cast_uint16")];
+            tensor<int32, [1]> squeeze_4_axes_0 = const()[name = string("squeeze_4_axes_0"), val = tensor<int32, [1]>([0])];
+            string var_317_3_to_uint16_dtype_0 = const()[name = string("op_317_3_to_uint16_dtype_0"), val = string("uint16")];
+            tensor<uint16, [1, 1, 1]> var_317_3_to_uint16 = cast(dtype = var_317_3_to_uint16_dtype_0, x = var_317_3)[name = string("cast_12")];
+            tensor<uint16, [1, 1]> squeeze_4_cast_uint16 = squeeze(axes = squeeze_4_axes_0, x = var_317_3_to_uint16)[name = string("squeeze_4_cast_uint16")];
+            tensor<int32, [1]> squeeze_5_axes_0 = const()[name = string("squeeze_5_axes_0"), val = tensor<int32, [1]>([0])];
+            string var_317_4_to_uint16_dtype_0 = const()[name = string("op_317_4_to_uint16_dtype_0"), val = string("uint16")];
+            tensor<uint16, [1, 1, 1]> var_317_4_to_uint16 = cast(dtype = var_317_4_to_uint16_dtype_0, x = var_317_4)[name = string("cast_11")];
+            tensor<uint16, [1, 1]> squeeze_5_cast_uint16 = squeeze(axes = squeeze_5_axes_0, x = var_317_4_to_uint16)[name = string("squeeze_5_cast_uint16")];
+            tensor<int32, [1]> squeeze_6_axes_0 = const()[name = string("squeeze_6_axes_0"), val = tensor<int32, [1]>([0])];
+            string var_317_5_to_uint16_dtype_0 = const()[name = string("op_317_5_to_uint16_dtype_0"), val = string("uint16")];
+            tensor<uint16, [1, 1, 1]> var_317_5_to_uint16 = cast(dtype = var_317_5_to_uint16_dtype_0, x = var_317_5)[name = string("cast_10")];
+            tensor<uint16, [1, 1]> squeeze_6_cast_uint16 = squeeze(axes = squeeze_6_axes_0, x = var_317_5_to_uint16)[name = string("squeeze_6_cast_uint16")];
+            tensor<int32, [1]> squeeze_7_axes_0 = const()[name = string("squeeze_7_axes_0"), val = tensor<int32, [1]>([0])];
+            string var_317_6_to_uint16_dtype_0 = const()[name = string("op_317_6_to_uint16_dtype_0"), val = string("uint16")];
+            tensor<uint16, [1, 1, 1]> var_317_6_to_uint16 = cast(dtype = var_317_6_to_uint16_dtype_0, x = var_317_6)[name = string("cast_9")];
+            tensor<uint16, [1, 1]> squeeze_7_cast_uint16 = squeeze(axes = squeeze_7_axes_0, x = var_317_6_to_uint16)[name = string("squeeze_7_cast_uint16")];
+            tensor<int32, [1]> squeeze_8_axes_0 = const()[name = string("squeeze_8_axes_0"), val = tensor<int32, [1]>([0])];
+            string var_317_7_to_uint16_dtype_0 = const()[name = string("op_317_7_to_uint16_dtype_0"), val = string("uint16")];
+            tensor<uint16, [1, 1, 1]> var_317_7_to_uint16 = cast(dtype = var_317_7_to_uint16_dtype_0, x = var_317_7)[name = string("cast_8")];
+            tensor<uint16, [1, 1]> squeeze_8_cast_uint16 = squeeze(axes = squeeze_8_axes_0, x = var_317_7_to_uint16)[name = string("squeeze_8_cast_uint16")];
+            tensor<int32, [1]> squeeze_9_axes_0 = const()[name = string("squeeze_9_axes_0"), val = tensor<int32, [1]>([0])];
+            string var_317_8_to_uint16_dtype_0 = const()[name = string("op_317_8_to_uint16_dtype_0"), val = string("uint16")];
+            tensor<uint16, [1, 1, 1]> var_317_8_to_uint16 = cast(dtype = var_317_8_to_uint16_dtype_0, x = var_317_8)[name = string("cast_7")];
+            tensor<uint16, [1, 1]> squeeze_9_cast_uint16 = squeeze(axes = squeeze_9_axes_0, x = var_317_8_to_uint16)[name = string("squeeze_9_cast_uint16")];
+            tensor<int32, [1]> squeeze_10_axes_0 = const()[name = string("squeeze_10_axes_0"), val = tensor<int32, [1]>([0])];
+            string var_317_9_to_uint16_dtype_0 = const()[name = string("op_317_9_to_uint16_dtype_0"), val = string("uint16")];
+            tensor<uint16, [1, 1, 1]> var_317_9_to_uint16 = cast(dtype = var_317_9_to_uint16_dtype_0, x = var_317_9)[name = string("cast_6")];
+            tensor<uint16, [1, 1]> squeeze_10_cast_uint16 = squeeze(axes = squeeze_10_axes_0, x = var_317_9_to_uint16)[name = string("squeeze_10_cast_uint16")];
+            tensor<int32, [1]> squeeze_11_axes_0 = const()[name = string("squeeze_11_axes_0"), val = tensor<int32, [1]>([0])];
+            string var_317_10_to_uint16_dtype_0 = const()[name = string("op_317_10_to_uint16_dtype_0"), val = string("uint16")];
+            tensor<uint16, [1, 1, 1]> var_317_10_to_uint16 = cast(dtype = var_317_10_to_uint16_dtype_0, x = var_317_10)[name = string("cast_5")];
+            tensor<uint16, [1, 1]> squeeze_11_cast_uint16 = squeeze(axes = squeeze_11_axes_0, x = var_317_10_to_uint16)[name = string("squeeze_11_cast_uint16")];
+            tensor<int32, [1]> squeeze_12_axes_0 = const()[name = string("squeeze_12_axes_0"), val = tensor<int32, [1]>([0])];
+            string var_317_11_to_uint16_dtype_0 = const()[name = string("op_317_11_to_uint16_dtype_0"), val = string("uint16")];
+            tensor<uint16, [1, 1, 1]> var_317_11_to_uint16 = cast(dtype = var_317_11_to_uint16_dtype_0, x = var_317_11)[name = string("cast_4")];
+            tensor<uint16, [1, 1]> squeeze_12_cast_uint16 = squeeze(axes = squeeze_12_axes_0, x = var_317_11_to_uint16)[name = string("squeeze_12_cast_uint16")];
+            tensor<int32, [1]> squeeze_13_axes_0 = const()[name = string("squeeze_13_axes_0"), val = tensor<int32, [1]>([0])];
+            string var_317_12_to_uint16_dtype_0 = const()[name = string("op_317_12_to_uint16_dtype_0"), val = string("uint16")];
+            tensor<uint16, [1, 1, 1]> var_317_12_to_uint16 = cast(dtype = var_317_12_to_uint16_dtype_0, x = var_317_12)[name = string("cast_3")];
+            tensor<uint16, [1, 1]> squeeze_13_cast_uint16 = squeeze(axes = squeeze_13_axes_0, x = var_317_12_to_uint16)[name = string("squeeze_13_cast_uint16")];
+            tensor<int32, [1]> squeeze_14_axes_0 = const()[name = string("squeeze_14_axes_0"), val = tensor<int32, [1]>([0])];
+            string var_317_13_to_uint16_dtype_0 = const()[name = string("op_317_13_to_uint16_dtype_0"), val = string("uint16")];
+            tensor<uint16, [1, 1, 1]> var_317_13_to_uint16 = cast(dtype = var_317_13_to_uint16_dtype_0, x = var_317_13)[name = string("cast_2")];
+            tensor<uint16, [1, 1]> squeeze_14_cast_uint16 = squeeze(axes = squeeze_14_axes_0, x = var_317_13_to_uint16)[name = string("squeeze_14_cast_uint16")];
+            tensor<int32, [1]> squeeze_15_axes_0 = const()[name = string("squeeze_15_axes_0"), val = tensor<int32, [1]>([0])];
+            string var_317_14_to_uint16_dtype_0 = const()[name = string("op_317_14_to_uint16_dtype_0"), val = string("uint16")];
+            tensor<uint16, [1, 1, 1]> var_317_14_to_uint16 = cast(dtype = var_317_14_to_uint16_dtype_0, x = var_317_14)[name = string("cast_1")];
+            tensor<uint16, [1, 1]> squeeze_15_cast_uint16 = squeeze(axes = squeeze_15_axes_0, x = var_317_14_to_uint16)[name = string("squeeze_15_cast_uint16")];
+            int32 quantized_3_axis_0 = const()[name = string("quantized_3_axis_0"), val = int32(0)];
+            int32 quantized_3_batch_dims_0 = const()[name = string("quantized_3_batch_dims_0"), val = int32(0)];
+            bool quantized_3_validate_indices_0 = const()[name = string("quantized_3_validate_indices_0"), val = bool(false)];
+            tensor<fp16, [2048, 256]> weight_5_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [2048, 256]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(656832))), lut = tensor<fp16, [1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(1181184))))[name = string("weight_5_to_fp16_palettized")];
+            tensor<fp16, [1, 1, 256]> quantized_3_cast_fp16_cast_uint16 = gather(axis = quantized_3_axis_0, batch_dims = quantized_3_batch_dims_0, indices = squeeze_1_cast_uint16, validate_indices = quantized_3_validate_indices_0, x = weight_5_to_fp16_palettized)[name = string("quantized_3_cast_fp16_cast_uint16")];
+            tensor<int32, [3]> var_338 = const()[name = string("op_338"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> quantized_7_axes_0 = const()[name = string("quantized_7_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 256, 1]> var_339_cast_fp16 = transpose(perm = var_338, x = quantized_3_cast_fp16_cast_uint16)[name = string("transpose_26")];
+            tensor<fp16, [1, 256, 1, 1]> quantized_7_cast_fp16 = expand_dims(axes = quantized_7_axes_0, x = var_339_cast_fp16)[name = string("quantized_7_cast_fp16")];
+            int32 quantized_5_axis_0 = const()[name = string("quantized_5_axis_0"), val = int32(0)];
+            int32 quantized_5_batch_dims_0 = const()[name = string("quantized_5_batch_dims_0"), val = int32(0)];
+            bool quantized_5_validate_indices_0 = const()[name = string("quantized_5_validate_indices_0"), val = bool(false)];
+            tensor<fp16, [2048, 256]> weight_7_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [2048, 256]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(1181760))), lut = tensor<fp16, [1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(1706112))))[name = string("weight_7_to_fp16_palettized")];
+            tensor<fp16, [1, 1, 256]> quantized_5_cast_fp16_cast_uint16 = gather(axis = quantized_5_axis_0, batch_dims = quantized_5_batch_dims_0, indices = squeeze_2_cast_uint16, validate_indices = quantized_5_validate_indices_0, x = weight_7_to_fp16_palettized)[name = string("quantized_5_cast_fp16_cast_uint16")];
+            tensor<int32, [3]> var_346 = const()[name = string("op_346"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> layer_out_1_axes_0 = const()[name = string("layer_out_1_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 256, 1]> var_347_cast_fp16 = transpose(perm = var_346, x = quantized_5_cast_fp16_cast_uint16)[name = string("transpose_25")];
+            tensor<fp16, [1, 256, 1, 1]> layer_out_1_cast_fp16 = expand_dims(axes = layer_out_1_axes_0, x = var_347_cast_fp16)[name = string("layer_out_1_cast_fp16")];
+            tensor<fp16, [1, 256, 1, 1]> quantized_11_cast_fp16 = add(x = quantized_7_cast_fp16, y = layer_out_1_cast_fp16)[name = string("quantized_11_cast_fp16")];
+            int32 quantized_9_axis_0 = const()[name = string("quantized_9_axis_0"), val = int32(0)];
+            int32 quantized_9_batch_dims_0 = const()[name = string("quantized_9_batch_dims_0"), val = int32(0)];
+            bool quantized_9_validate_indices_0 = const()[name = string("quantized_9_validate_indices_0"), val = bool(false)];
+            tensor<fp16, [2048, 256]> weight_9_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [2048, 256]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(1706688))), lut = tensor<fp16, [1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(2231040))))[name = string("weight_9_to_fp16_palettized")];
+            tensor<fp16, [1, 1, 256]> quantized_9_cast_fp16_cast_uint16 = gather(axis = quantized_9_axis_0, batch_dims = quantized_9_batch_dims_0, indices = squeeze_3_cast_uint16, validate_indices = quantized_9_validate_indices_0, x = weight_9_to_fp16_palettized)[name = string("quantized_9_cast_fp16_cast_uint16")];
+            tensor<int32, [3]> var_355 = const()[name = string("op_355"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> layer_out_3_axes_0 = const()[name = string("layer_out_3_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 256, 1]> var_356_cast_fp16 = transpose(perm = var_355, x = quantized_9_cast_fp16_cast_uint16)[name = string("transpose_24")];
+            tensor<fp16, [1, 256, 1, 1]> layer_out_3_cast_fp16 = expand_dims(axes = layer_out_3_axes_0, x = var_356_cast_fp16)[name = string("layer_out_3_cast_fp16")];
+            tensor<fp16, [1, 256, 1, 1]> quantized_15_cast_fp16 = add(x = quantized_11_cast_fp16, y = layer_out_3_cast_fp16)[name = string("quantized_15_cast_fp16")];
+            int32 quantized_13_axis_0 = const()[name = string("quantized_13_axis_0"), val = int32(0)];
+            int32 quantized_13_batch_dims_0 = const()[name = string("quantized_13_batch_dims_0"), val = int32(0)];
+            bool quantized_13_validate_indices_0 = const()[name = string("quantized_13_validate_indices_0"), val = bool(false)];
+            tensor<fp16, [2048, 256]> weight_11_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [2048, 256]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(2231616))), lut = tensor<fp16, [1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(2755968))))[name = string("weight_11_to_fp16_palettized")];
+            tensor<fp16, [1, 1, 256]> quantized_13_cast_fp16_cast_uint16 = gather(axis = quantized_13_axis_0, batch_dims = quantized_13_batch_dims_0, indices = squeeze_4_cast_uint16, validate_indices = quantized_13_validate_indices_0, x = weight_11_to_fp16_palettized)[name = string("quantized_13_cast_fp16_cast_uint16")];
+            tensor<int32, [3]> var_364 = const()[name = string("op_364"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> layer_out_5_axes_0 = const()[name = string("layer_out_5_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 256, 1]> var_365_cast_fp16 = transpose(perm = var_364, x = quantized_13_cast_fp16_cast_uint16)[name = string("transpose_23")];
+            tensor<fp16, [1, 256, 1, 1]> layer_out_5_cast_fp16 = expand_dims(axes = layer_out_5_axes_0, x = var_365_cast_fp16)[name = string("layer_out_5_cast_fp16")];
+            tensor<fp16, [1, 256, 1, 1]> quantized_19_cast_fp16 = add(x = quantized_15_cast_fp16, y = layer_out_5_cast_fp16)[name = string("quantized_19_cast_fp16")];
+            int32 quantized_17_axis_0 = const()[name = string("quantized_17_axis_0"), val = int32(0)];
+            int32 quantized_17_batch_dims_0 = const()[name = string("quantized_17_batch_dims_0"), val = int32(0)];
+            bool quantized_17_validate_indices_0 = const()[name = string("quantized_17_validate_indices_0"), val = bool(false)];
+            tensor<fp16, [2048, 256]> weight_13_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [2048, 256]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(2756544))), lut = tensor<fp16, [1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(3280896))))[name = string("weight_13_to_fp16_palettized")];
+            tensor<fp16, [1, 1, 256]> quantized_17_cast_fp16_cast_uint16 = gather(axis = quantized_17_axis_0, batch_dims = quantized_17_batch_dims_0, indices = squeeze_5_cast_uint16, validate_indices = quantized_17_validate_indices_0, x = weight_13_to_fp16_palettized)[name = string("quantized_17_cast_fp16_cast_uint16")];
+            tensor<int32, [3]> var_373 = const()[name = string("op_373"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> layer_out_7_axes_0 = const()[name = string("layer_out_7_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 256, 1]> var_374_cast_fp16 = transpose(perm = var_373, x = quantized_17_cast_fp16_cast_uint16)[name = string("transpose_22")];
+            tensor<fp16, [1, 256, 1, 1]> layer_out_7_cast_fp16 = expand_dims(axes = layer_out_7_axes_0, x = var_374_cast_fp16)[name = string("layer_out_7_cast_fp16")];
+            tensor<fp16, [1, 256, 1, 1]> quantized_23_cast_fp16 = add(x = quantized_19_cast_fp16, y = layer_out_7_cast_fp16)[name = string("quantized_23_cast_fp16")];
+            int32 quantized_21_axis_0 = const()[name = string("quantized_21_axis_0"), val = int32(0)];
+            int32 quantized_21_batch_dims_0 = const()[name = string("quantized_21_batch_dims_0"), val = int32(0)];
+            bool quantized_21_validate_indices_0 = const()[name = string("quantized_21_validate_indices_0"), val = bool(false)];
+            tensor<fp16, [2048, 256]> weight_15_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [2048, 256]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(3281472))), lut = tensor<fp16, [1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(3805824))))[name = string("weight_15_to_fp16_palettized")];
+            tensor<fp16, [1, 1, 256]> quantized_21_cast_fp16_cast_uint16 = gather(axis = quantized_21_axis_0, batch_dims = quantized_21_batch_dims_0, indices = squeeze_6_cast_uint16, validate_indices = quantized_21_validate_indices_0, x = weight_15_to_fp16_palettized)[name = string("quantized_21_cast_fp16_cast_uint16")];
+            tensor<int32, [3]> var_382 = const()[name = string("op_382"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> layer_out_9_axes_0 = const()[name = string("layer_out_9_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 256, 1]> var_383_cast_fp16 = transpose(perm = var_382, x = quantized_21_cast_fp16_cast_uint16)[name = string("transpose_21")];
+            tensor<fp16, [1, 256, 1, 1]> layer_out_9_cast_fp16 = expand_dims(axes = layer_out_9_axes_0, x = var_383_cast_fp16)[name = string("layer_out_9_cast_fp16")];
+            tensor<fp16, [1, 256, 1, 1]> quantized_27_cast_fp16 = add(x = quantized_23_cast_fp16, y = layer_out_9_cast_fp16)[name = string("quantized_27_cast_fp16")];
+            int32 quantized_25_axis_0 = const()[name = string("quantized_25_axis_0"), val = int32(0)];
+            int32 quantized_25_batch_dims_0 = const()[name = string("quantized_25_batch_dims_0"), val = int32(0)];
+            bool quantized_25_validate_indices_0 = const()[name = string("quantized_25_validate_indices_0"), val = bool(false)];
+            tensor<fp16, [2048, 256]> weight_17_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [2048, 256]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(3806400))), lut = tensor<fp16, [1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(4330752))))[name = string("weight_17_to_fp16_palettized")];
+            tensor<fp16, [1, 1, 256]> quantized_25_cast_fp16_cast_uint16 = gather(axis = quantized_25_axis_0, batch_dims = quantized_25_batch_dims_0, indices = squeeze_7_cast_uint16, validate_indices = quantized_25_validate_indices_0, x = weight_17_to_fp16_palettized)[name = string("quantized_25_cast_fp16_cast_uint16")];
+            tensor<int32, [3]> var_391 = const()[name = string("op_391"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> layer_out_11_axes_0 = const()[name = string("layer_out_11_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 256, 1]> var_392_cast_fp16 = transpose(perm = var_391, x = quantized_25_cast_fp16_cast_uint16)[name = string("transpose_20")];
+            tensor<fp16, [1, 256, 1, 1]> layer_out_11_cast_fp16 = expand_dims(axes = layer_out_11_axes_0, x = var_392_cast_fp16)[name = string("layer_out_11_cast_fp16")];
+            tensor<fp16, [1, 256, 1, 1]> quantized_31_cast_fp16 = add(x = quantized_27_cast_fp16, y = layer_out_11_cast_fp16)[name = string("quantized_31_cast_fp16")];
+            int32 quantized_29_axis_0 = const()[name = string("quantized_29_axis_0"), val = int32(0)];
+            int32 quantized_29_batch_dims_0 = const()[name = string("quantized_29_batch_dims_0"), val = int32(0)];
+            bool quantized_29_validate_indices_0 = const()[name = string("quantized_29_validate_indices_0"), val = bool(false)];
+            tensor<fp16, [2048, 256]> weight_19_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [2048, 256]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(4331328))), lut = tensor<fp16, [1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(4855680))))[name = string("weight_19_to_fp16_palettized")];
+            tensor<fp16, [1, 1, 256]> quantized_29_cast_fp16_cast_uint16 = gather(axis = quantized_29_axis_0, batch_dims = quantized_29_batch_dims_0, indices = squeeze_8_cast_uint16, validate_indices = quantized_29_validate_indices_0, x = weight_19_to_fp16_palettized)[name = string("quantized_29_cast_fp16_cast_uint16")];
+            tensor<int32, [3]> var_400 = const()[name = string("op_400"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> layer_out_13_axes_0 = const()[name = string("layer_out_13_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 256, 1]> var_401_cast_fp16 = transpose(perm = var_400, x = quantized_29_cast_fp16_cast_uint16)[name = string("transpose_19")];
+            tensor<fp16, [1, 256, 1, 1]> layer_out_13_cast_fp16 = expand_dims(axes = layer_out_13_axes_0, x = var_401_cast_fp16)[name = string("layer_out_13_cast_fp16")];
+            tensor<fp16, [1, 256, 1, 1]> quantized_35_cast_fp16 = add(x = quantized_31_cast_fp16, y = layer_out_13_cast_fp16)[name = string("quantized_35_cast_fp16")];
+            int32 quantized_33_axis_0 = const()[name = string("quantized_33_axis_0"), val = int32(0)];
+            int32 quantized_33_batch_dims_0 = const()[name = string("quantized_33_batch_dims_0"), val = int32(0)];
+            bool quantized_33_validate_indices_0 = const()[name = string("quantized_33_validate_indices_0"), val = bool(false)];
+            tensor<fp16, [2048, 256]> weight_21_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [2048, 256]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(4856256))), lut = tensor<fp16, [1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(5380608))))[name = string("weight_21_to_fp16_palettized")];
+            tensor<fp16, [1, 1, 256]> quantized_33_cast_fp16_cast_uint16 = gather(axis = quantized_33_axis_0, batch_dims = quantized_33_batch_dims_0, indices = squeeze_9_cast_uint16, validate_indices = quantized_33_validate_indices_0, x = weight_21_to_fp16_palettized)[name = string("quantized_33_cast_fp16_cast_uint16")];
+            tensor<int32, [3]> var_409 = const()[name = string("op_409"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> layer_out_15_axes_0 = const()[name = string("layer_out_15_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 256, 1]> var_410_cast_fp16 = transpose(perm = var_409, x = quantized_33_cast_fp16_cast_uint16)[name = string("transpose_18")];
+            tensor<fp16, [1, 256, 1, 1]> layer_out_15_cast_fp16 = expand_dims(axes = layer_out_15_axes_0, x = var_410_cast_fp16)[name = string("layer_out_15_cast_fp16")];
+            tensor<fp16, [1, 256, 1, 1]> quantized_39_cast_fp16 = add(x = quantized_35_cast_fp16, y = layer_out_15_cast_fp16)[name = string("quantized_39_cast_fp16")];
+            int32 quantized_37_axis_0 = const()[name = string("quantized_37_axis_0"), val = int32(0)];
+            int32 quantized_37_batch_dims_0 = const()[name = string("quantized_37_batch_dims_0"), val = int32(0)];
+            bool quantized_37_validate_indices_0 = const()[name = string("quantized_37_validate_indices_0"), val = bool(false)];
+            tensor<fp16, [2048, 256]> weight_23_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [2048, 256]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(5381184))), lut = tensor<fp16, [1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(5905536))))[name = string("weight_23_to_fp16_palettized")];
+            tensor<fp16, [1, 1, 256]> quantized_37_cast_fp16_cast_uint16 = gather(axis = quantized_37_axis_0, batch_dims = quantized_37_batch_dims_0, indices = squeeze_10_cast_uint16, validate_indices = quantized_37_validate_indices_0, x = weight_23_to_fp16_palettized)[name = string("quantized_37_cast_fp16_cast_uint16")];
+            tensor<int32, [3]> var_418 = const()[name = string("op_418"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> layer_out_17_axes_0 = const()[name = string("layer_out_17_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 256, 1]> var_419_cast_fp16 = transpose(perm = var_418, x = quantized_37_cast_fp16_cast_uint16)[name = string("transpose_17")];
+            tensor<fp16, [1, 256, 1, 1]> layer_out_17_cast_fp16 = expand_dims(axes = layer_out_17_axes_0, x = var_419_cast_fp16)[name = string("layer_out_17_cast_fp16")];
+            tensor<fp16, [1, 256, 1, 1]> quantized_43_cast_fp16 = add(x = quantized_39_cast_fp16, y = layer_out_17_cast_fp16)[name = string("quantized_43_cast_fp16")];
+            int32 quantized_41_axis_0 = const()[name = string("quantized_41_axis_0"), val = int32(0)];
+            int32 quantized_41_batch_dims_0 = const()[name = string("quantized_41_batch_dims_0"), val = int32(0)];
+            bool quantized_41_validate_indices_0 = const()[name = string("quantized_41_validate_indices_0"), val = bool(false)];
+            tensor<fp16, [2048, 256]> weight_25_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [2048, 256]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(5906112))), lut = tensor<fp16, [1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(6430464))))[name = string("weight_25_to_fp16_palettized")];
+            tensor<fp16, [1, 1, 256]> quantized_41_cast_fp16_cast_uint16 = gather(axis = quantized_41_axis_0, batch_dims = quantized_41_batch_dims_0, indices = squeeze_11_cast_uint16, validate_indices = quantized_41_validate_indices_0, x = weight_25_to_fp16_palettized)[name = string("quantized_41_cast_fp16_cast_uint16")];
+            tensor<int32, [3]> var_427 = const()[name = string("op_427"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> layer_out_19_axes_0 = const()[name = string("layer_out_19_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 256, 1]> var_428_cast_fp16 = transpose(perm = var_427, x = quantized_41_cast_fp16_cast_uint16)[name = string("transpose_16")];
+            tensor<fp16, [1, 256, 1, 1]> layer_out_19_cast_fp16 = expand_dims(axes = layer_out_19_axes_0, x = var_428_cast_fp16)[name = string("layer_out_19_cast_fp16")];
+            tensor<fp16, [1, 256, 1, 1]> quantized_47_cast_fp16 = add(x = quantized_43_cast_fp16, y = layer_out_19_cast_fp16)[name = string("quantized_47_cast_fp16")];
+            int32 quantized_45_axis_0 = const()[name = string("quantized_45_axis_0"), val = int32(0)];
+            int32 quantized_45_batch_dims_0 = const()[name = string("quantized_45_batch_dims_0"), val = int32(0)];
+            bool quantized_45_validate_indices_0 = const()[name = string("quantized_45_validate_indices_0"), val = bool(false)];
+            tensor<fp16, [2048, 256]> weight_27_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [2048, 256]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(6431040))), lut = tensor<fp16, [1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(6955392))))[name = string("weight_27_to_fp16_palettized")];
+            tensor<fp16, [1, 1, 256]> quantized_45_cast_fp16_cast_uint16 = gather(axis = quantized_45_axis_0, batch_dims = quantized_45_batch_dims_0, indices = squeeze_12_cast_uint16, validate_indices = quantized_45_validate_indices_0, x = weight_27_to_fp16_palettized)[name = string("quantized_45_cast_fp16_cast_uint16")];
+            tensor<int32, [3]> var_436 = const()[name = string("op_436"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> layer_out_21_axes_0 = const()[name = string("layer_out_21_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 256, 1]> var_437_cast_fp16 = transpose(perm = var_436, x = quantized_45_cast_fp16_cast_uint16)[name = string("transpose_15")];
+            tensor<fp16, [1, 256, 1, 1]> layer_out_21_cast_fp16 = expand_dims(axes = layer_out_21_axes_0, x = var_437_cast_fp16)[name = string("layer_out_21_cast_fp16")];
+            tensor<fp16, [1, 256, 1, 1]> quantized_51_cast_fp16 = add(x = quantized_47_cast_fp16, y = layer_out_21_cast_fp16)[name = string("quantized_51_cast_fp16")];
+            int32 quantized_49_axis_0 = const()[name = string("quantized_49_axis_0"), val = int32(0)];
+            int32 quantized_49_batch_dims_0 = const()[name = string("quantized_49_batch_dims_0"), val = int32(0)];
+            bool quantized_49_validate_indices_0 = const()[name = string("quantized_49_validate_indices_0"), val = bool(false)];
+            tensor<fp16, [2048, 256]> weight_29_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [2048, 256]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(6955968))), lut = tensor<fp16, [1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(7480320))))[name = string("weight_29_to_fp16_palettized")];
+            tensor<fp16, [1, 1, 256]> quantized_49_cast_fp16_cast_uint16 = gather(axis = quantized_49_axis_0, batch_dims = quantized_49_batch_dims_0, indices = squeeze_13_cast_uint16, validate_indices = quantized_49_validate_indices_0, x = weight_29_to_fp16_palettized)[name = string("quantized_49_cast_fp16_cast_uint16")];
+            tensor<int32, [3]> var_445 = const()[name = string("op_445"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> layer_out_23_axes_0 = const()[name = string("layer_out_23_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 256, 1]> var_446_cast_fp16 = transpose(perm = var_445, x = quantized_49_cast_fp16_cast_uint16)[name = string("transpose_14")];
+            tensor<fp16, [1, 256, 1, 1]> layer_out_23_cast_fp16 = expand_dims(axes = layer_out_23_axes_0, x = var_446_cast_fp16)[name = string("layer_out_23_cast_fp16")];
+            tensor<fp16, [1, 256, 1, 1]> quantized_55_cast_fp16 = add(x = quantized_51_cast_fp16, y = layer_out_23_cast_fp16)[name = string("quantized_55_cast_fp16")];
+            int32 quantized_53_axis_0 = const()[name = string("quantized_53_axis_0"), val = int32(0)];
+            int32 quantized_53_batch_dims_0 = const()[name = string("quantized_53_batch_dims_0"), val = int32(0)];
+            bool quantized_53_validate_indices_0 = const()[name = string("quantized_53_validate_indices_0"), val = bool(false)];
+            tensor<fp16, [2048, 256]> weight_31_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [2048, 256]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(7480896))), lut = tensor<fp16, [1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(8005248))))[name = string("weight_31_to_fp16_palettized")];
+            tensor<fp16, [1, 1, 256]> quantized_53_cast_fp16_cast_uint16 = gather(axis = quantized_53_axis_0, batch_dims = quantized_53_batch_dims_0, indices = squeeze_14_cast_uint16, validate_indices = quantized_53_validate_indices_0, x = weight_31_to_fp16_palettized)[name = string("quantized_53_cast_fp16_cast_uint16")];
+            tensor<int32, [3]> var_454 = const()[name = string("op_454"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> layer_out_25_axes_0 = const()[name = string("layer_out_25_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 256, 1]> var_455_cast_fp16 = transpose(perm = var_454, x = quantized_53_cast_fp16_cast_uint16)[name = string("transpose_13")];
+            tensor<fp16, [1, 256, 1, 1]> layer_out_25_cast_fp16 = expand_dims(axes = layer_out_25_axes_0, x = var_455_cast_fp16)[name = string("layer_out_25_cast_fp16")];
+            tensor<fp16, [1, 256, 1, 1]> quantized_59_cast_fp16 = add(x = quantized_55_cast_fp16, y = layer_out_25_cast_fp16)[name = string("quantized_59_cast_fp16")];
+            int32 quantized_57_axis_0 = const()[name = string("quantized_57_axis_0"), val = int32(0)];
+            int32 quantized_57_batch_dims_0 = const()[name = string("quantized_57_batch_dims_0"), val = int32(0)];
+            bool quantized_57_validate_indices_0 = const()[name = string("quantized_57_validate_indices_0"), val = bool(false)];
+            tensor<fp16, [2048, 256]> weight_33_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [2048, 256]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(8005824))), lut = tensor<fp16, [1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(8530176))))[name = string("weight_33_to_fp16_palettized")];
+            tensor<fp16, [1, 1, 256]> quantized_57_cast_fp16_cast_uint16 = gather(axis = quantized_57_axis_0, batch_dims = quantized_57_batch_dims_0, indices = squeeze_15_cast_uint16, validate_indices = quantized_57_validate_indices_0, x = weight_33_to_fp16_palettized)[name = string("quantized_57_cast_fp16_cast_uint16")];
+            tensor<int32, [3]> var_463 = const()[name = string("op_463"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> layer_out_axes_0 = const()[name = string("layer_out_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 256, 1]> var_464_cast_fp16 = transpose(perm = var_463, x = quantized_57_cast_fp16_cast_uint16)[name = string("transpose_12")];
+            tensor<fp16, [1, 256, 1, 1]> layer_out_cast_fp16 = expand_dims(axes = layer_out_axes_0, x = var_464_cast_fp16)[name = string("layer_out_cast_fp16")];
+            tensor<fp16, [1, 256, 1, 1]> input_35_cast_fp16 = add(x = quantized_59_cast_fp16, y = layer_out_cast_fp16)[name = string("input_35_cast_fp16")];
+            string var_472_pad_type_0 = const()[name = string("op_472_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_472_strides_0 = const()[name = string("op_472_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_472_pad_0 = const()[name = string("op_472_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_472_dilations_0 = const()[name = string("op_472_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_472_groups_0 = const()[name = string("op_472_groups_0"), val = int32(1)];
+            tensor<fp16, [512, 256, 1, 1]> decoder_quantizer_rvq_rest_output_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [512, 256, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(8530752))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(8661888))))[name = string("decoder_quantizer_rvq_rest_output_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 512, 1, 1]> var_472_cast_fp16 = conv(dilations = var_472_dilations_0, groups = var_472_groups_0, pad = var_472_pad_0, pad_type = var_472_pad_type_0, strides = var_472_strides_0, weight = decoder_quantizer_rvq_rest_output_proj_weight_to_fp16_palettized, x = input_35_cast_fp16)[name = string("op_472_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 1]> x_1_cast_fp16 = add(x = quantized_cast_fp16, y = var_472_cast_fp16)[name = string("x_1_cast_fp16")];
+            tensor<int32, [8]> input_37_pad_0 = const()[name = string("input_37_pad_0"), val = tensor<int32, [8]>([0, 0, 0, 0, 0, 0, 2, 0])];
+            string input_37_mode_0 = const()[name = string("input_37_mode_0"), val = string("constant")];
+            fp16 const_16_to_fp16 = const()[name = string("const_16_to_fp16"), val = fp16(0x0p+0)];
+            tensor<fp16, [1, 512, 1, 3]> input_37_cast_fp16 = pad(constant_val = const_16_to_fp16, mode = input_37_mode_0, pad = input_37_pad_0, x = x_1_cast_fp16)[name = string("input_37_cast_fp16")];
+            string input_39_pad_type_0 = const()[name = string("input_39_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> input_39_strides_0 = const()[name = string("input_39_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> input_39_pad_0 = const()[name = string("input_39_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> input_39_dilations_0 = const()[name = string("input_39_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 input_39_groups_0 = const()[name = string("input_39_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 512, 1, 3]> decoder_pre_conv_conv_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 512, 1, 3]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(8662464))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(10235392))))[name = string("decoder_pre_conv_conv_weight_to_fp16_palettized")];
+            tensor<fp16, [1024]> decoder_pre_conv_conv_bias_to_fp16 = const()[name = string("decoder_pre_conv_conv_bias_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(10235968)))];
+            tensor<fp16, [1, 1024, 1, 1]> input_39_cast_fp16 = conv(bias = decoder_pre_conv_conv_bias_to_fp16, dilations = input_39_dilations_0, groups = input_39_groups_0, pad = input_39_pad_0, pad_type = input_39_pad_type_0, strides = input_39_strides_0, weight = decoder_pre_conv_conv_weight_to_fp16_palettized, x = input_37_cast_fp16)[name = string("input_39_cast_fp16")];
+            string inputs_1_pad_type_0 = const()[name = string("inputs_1_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> inputs_1_strides_0 = const()[name = string("inputs_1_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> inputs_1_pad_0 = const()[name = string("inputs_1_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> inputs_1_dilations_0 = const()[name = string("inputs_1_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 inputs_1_groups_0 = const()[name = string("inputs_1_groups_0"), val = int32(1)];
+            tensor<fp16, [512, 1024, 1, 1]> decoder_pre_transformer_input_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [512, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(10238080))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(10762432))))[name = string("decoder_pre_transformer_input_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [512]> decoder_pre_transformer_input_proj_bias_to_fp16 = const()[name = string("decoder_pre_transformer_input_proj_bias_to_fp16"), val = tensor<fp16, [512]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(10763008)))];
+            tensor<fp16, [1, 512, 1, 1]> inputs_1_cast_fp16 = conv(bias = decoder_pre_transformer_input_proj_bias_to_fp16, dilations = inputs_1_dilations_0, groups = inputs_1_groups_0, pad = inputs_1_pad_0, pad_type = inputs_1_pad_type_0, strides = inputs_1_strides_0, weight = decoder_pre_transformer_input_proj_weight_to_fp16_palettized, x = input_39_cast_fp16)[name = string("inputs_1_cast_fp16")];
+            int32 pos_cos_1_axis_0 = const()[name = string("pos_cos_1_axis_0"), val = int32(0)];
+            int32 pos_cos_1_batch_dims_0 = const()[name = string("pos_cos_1_batch_dims_0"), val = int32(0)];
+            bool pos_cos_1_validate_indices_0 = const()[name = string("pos_cos_1_validate_indices_0"), val = bool(false)];
+            tensor<fp16, [256, 64]> decoder_pre_transformer_position_embeddings_cos_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [256, 64]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(10764096))), lut = tensor<fp16, [1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(10780544))))[name = string("decoder_pre_transformer_position_embeddings_cos_weight_to_fp16_palettized")];
+            string cache_length_to_uint16_dtype_0 = const()[name = string("cache_length_to_uint16_dtype_0"), val = string("uint16")];
+            tensor<uint16, [1]> cache_length_to_uint16 = cast(dtype = cache_length_to_uint16_dtype_0, x = cache_length)[name = string("cast_0")];
+            tensor<fp16, [1, 64]> pos_cos_1_cast_fp16_cast_uint16 = gather(axis = pos_cos_1_axis_0, batch_dims = pos_cos_1_batch_dims_0, indices = cache_length_to_uint16, validate_indices = pos_cos_1_validate_indices_0, x = decoder_pre_transformer_position_embeddings_cos_weight_to_fp16_palettized)[name = string("pos_cos_1_cast_fp16_cast_uint16")];
+            tensor<int32, [1]> pos_cos_axes_0 = const()[name = string("pos_cos_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 64, 1]> pos_cos_cast_fp16 = expand_dims(axes = pos_cos_axes_0, x = pos_cos_1_cast_fp16_cast_uint16)[name = string("pos_cos_cast_fp16")];
+            int32 pos_sin_1_axis_0 = const()[name = string("pos_sin_1_axis_0"), val = int32(0)];
+            int32 pos_sin_1_batch_dims_0 = const()[name = string("pos_sin_1_batch_dims_0"), val = int32(0)];
+            bool pos_sin_1_validate_indices_0 = const()[name = string("pos_sin_1_validate_indices_0"), val = bool(false)];
+            tensor<fp16, [256, 64]> decoder_pre_transformer_position_embeddings_sin_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [256, 64]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(10781120))), lut = tensor<fp16, [1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(10797568))))[name = string("decoder_pre_transformer_position_embeddings_sin_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 64]> pos_sin_1_cast_fp16_cast_uint16 = gather(axis = pos_sin_1_axis_0, batch_dims = pos_sin_1_batch_dims_0, indices = cache_length_to_uint16, validate_indices = pos_sin_1_validate_indices_0, x = decoder_pre_transformer_position_embeddings_sin_weight_to_fp16_palettized)[name = string("pos_sin_1_cast_fp16_cast_uint16")];
+            tensor<int32, [1]> pos_sin_axes_0 = const()[name = string("pos_sin_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 64, 1]> pos_sin_cast_fp16 = expand_dims(axes = pos_sin_axes_0, x = pos_sin_1_cast_fp16_cast_uint16)[name = string("pos_sin_cast_fp16")];
+            tensor<int32, [8]> tile_0 = const()[name = string("tile_0"), val = tensor<int32, [8]>([1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024])];
+            int32 var_522_axis_0 = const()[name = string("op_522_axis_0"), val = int32(1)];
+            tensor<fp16, [1, 1024, 1, 256]> var_522_cast_fp16_0, tensor<fp16, [1, 1024, 1, 256]> var_522_cast_fp16_1, tensor<fp16, [1, 1024, 1, 256]> var_522_cast_fp16_2, tensor<fp16, [1, 1024, 1, 256]> var_522_cast_fp16_3, tensor<fp16, [1, 1024, 1, 256]> var_522_cast_fp16_4, tensor<fp16, [1, 1024, 1, 256]> var_522_cast_fp16_5, tensor<fp16, [1, 1024, 1, 256]> var_522_cast_fp16_6, tensor<fp16, [1, 1024, 1, 256]> var_522_cast_fp16_7 = split(axis = var_522_axis_0, split_sizes = tile_0, x = key_cache)[name = string("op_522_cast_fp16")];
+            tensor<int32, [8]> tile_1 = const()[name = string("tile_1"), val = tensor<int32, [8]>([1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024])];
+            int32 var_531_axis_0 = const()[name = string("op_531_axis_0"), val = int32(1)];
+            tensor<fp16, [1, 1024, 1, 256]> var_531_cast_fp16_0, tensor<fp16, [1, 1024, 1, 256]> var_531_cast_fp16_1, tensor<fp16, [1, 1024, 1, 256]> var_531_cast_fp16_2, tensor<fp16, [1, 1024, 1, 256]> var_531_cast_fp16_3, tensor<fp16, [1, 1024, 1, 256]> var_531_cast_fp16_4, tensor<fp16, [1, 1024, 1, 256]> var_531_cast_fp16_5, tensor<fp16, [1, 1024, 1, 256]> var_531_cast_fp16_6, tensor<fp16, [1, 1024, 1, 256]> var_531_cast_fp16_7 = split(axis = var_531_axis_0, split_sizes = tile_1, x = value_cache)[name = string("op_531_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 1]> inputs_sq_1_cast_fp16 = mul(x = inputs_1_cast_fp16, y = inputs_1_cast_fp16)[name = string("inputs_sq_1_cast_fp16")];
+            tensor<int32, [1]> variance_1_axes_0 = const()[name = string("variance_1_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_1_keep_dims_0 = const()[name = string("variance_1_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 1, 1, 1]> variance_1_cast_fp16 = reduce_mean(axes = variance_1_axes_0, keep_dims = variance_1_keep_dims_0, x = inputs_sq_1_cast_fp16)[name = string("variance_1_cast_fp16")];
+            fp16 var_550_to_fp16 = const()[name = string("op_550_to_fp16"), val = fp16(0x1.5p-17)];
+            tensor<fp16, [1, 1, 1, 1]> var_551_cast_fp16 = add(x = variance_1_cast_fp16, y = var_550_to_fp16)[name = string("op_551_cast_fp16")];
+            fp32 var_552_epsilon_0 = const()[name = string("op_552_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 1, 1, 1]> var_552_cast_fp16 = rsqrt(epsilon = var_552_epsilon_0, x = var_551_cast_fp16)[name = string("op_552_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 1]> hidden_states_1_cast_fp16 = mul(x = inputs_1_cast_fp16, y = var_552_cast_fp16)[name = string("hidden_states_1_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 1]> w_1_to_fp16 = const()[name = string("w_1_to_fp16"), val = tensor<fp16, [1, 512, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(10798144)))];
+            tensor<fp16, [1, 512, 1, 1]> x_3_cast_fp16 = mul(x = w_1_to_fp16, y = hidden_states_1_cast_fp16)[name = string("x_3_cast_fp16")];
+            string q_1_pad_type_0 = const()[name = string("q_1_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> q_1_strides_0 = const()[name = string("q_1_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> q_1_pad_0 = const()[name = string("q_1_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> q_1_dilations_0 = const()[name = string("q_1_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 q_1_groups_0 = const()[name = string("q_1_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 512, 1, 1]> decoder_pre_transformer_layers_0_self_attn_q_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 512, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(10799232))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(11323584))))[name = string("decoder_pre_transformer_layers_0_self_attn_q_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> q_1_cast_fp16 = conv(dilations = q_1_dilations_0, groups = q_1_groups_0, pad = q_1_pad_0, pad_type = q_1_pad_type_0, strides = q_1_strides_0, weight = decoder_pre_transformer_layers_0_self_attn_q_proj_weight_to_fp16_palettized, x = x_3_cast_fp16)[name = string("q_1_cast_fp16")];
+            string k_1_pad_type_0 = const()[name = string("k_1_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> k_1_strides_0 = const()[name = string("k_1_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> k_1_pad_0 = const()[name = string("k_1_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> k_1_dilations_0 = const()[name = string("k_1_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 k_1_groups_0 = const()[name = string("k_1_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 512, 1, 1]> decoder_pre_transformer_layers_0_self_attn_k_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 512, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(11324160))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(11848512))))[name = string("decoder_pre_transformer_layers_0_self_attn_k_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> k_1_cast_fp16 = conv(dilations = k_1_dilations_0, groups = k_1_groups_0, pad = k_1_pad_0, pad_type = k_1_pad_type_0, strides = k_1_strides_0, weight = decoder_pre_transformer_layers_0_self_attn_k_proj_weight_to_fp16_palettized, x = x_3_cast_fp16)[name = string("k_1_cast_fp16")];
+            string v_1_pad_type_0 = const()[name = string("v_1_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> v_1_strides_0 = const()[name = string("v_1_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> v_1_pad_0 = const()[name = string("v_1_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> v_1_dilations_0 = const()[name = string("v_1_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 v_1_groups_0 = const()[name = string("v_1_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 512, 1, 1]> decoder_pre_transformer_layers_0_self_attn_v_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 512, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(11849088))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(12373440))))[name = string("decoder_pre_transformer_layers_0_self_attn_v_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> v_1_cast_fp16 = conv(dilations = v_1_dilations_0, groups = v_1_groups_0, pad = v_1_pad_0, pad_type = v_1_pad_type_0, strides = v_1_strides_0, weight = decoder_pre_transformer_layers_0_self_attn_v_proj_weight_to_fp16_palettized, x = x_3_cast_fp16)[name = string("v_1_cast_fp16")];
+            tensor<int32, [4]> var_584 = const()[name = string("op_584"), val = tensor<int32, [4]>([16, 64, 1, 1])];
+            tensor<fp16, [16, 64, 1, 1]> q_3_cast_fp16 = reshape(shape = var_584, x = q_1_cast_fp16)[name = string("q_3_cast_fp16")];
+            tensor<int32, [4]> var_589 = const()[name = string("op_589"), val = tensor<int32, [4]>([16, 64, 1, 1])];
+            tensor<fp16, [16, 64, 1, 1]> k_3_cast_fp16 = reshape(shape = var_589, x = k_1_cast_fp16)[name = string("k_3_cast_fp16")];
+            tensor<int32, [1]> cos_expanded_1_axes_0 = const()[name = string("cos_expanded_1_axes_0"), val = tensor<int32, [1]>([3])];
+            tensor<fp16, [1, 64, 1, 1]> cos_expanded_1_cast_fp16 = expand_dims(axes = cos_expanded_1_axes_0, x = pos_cos_cast_fp16)[name = string("cos_expanded_1_cast_fp16")];
+            tensor<int32, [1]> sin_expanded_1_axes_0 = const()[name = string("sin_expanded_1_axes_0"), val = tensor<int32, [1]>([3])];
+            tensor<fp16, [1, 64, 1, 1]> sin_expanded_1_cast_fp16 = expand_dims(axes = sin_expanded_1_axes_0, x = pos_sin_cast_fp16)[name = string("sin_expanded_1_cast_fp16")];
+            tensor<int32, [4]> var_593 = const()[name = string("op_593"), val = tensor<int32, [4]>([16, 1, 1, 1])];
+            tensor<fp16, [16, 64, 1, 1]> cos_1_cast_fp16 = tile(reps = var_593, x = cos_expanded_1_cast_fp16)[name = string("cos_1_cast_fp16")];
+            tensor<int32, [4]> var_595 = const()[name = string("op_595"), val = tensor<int32, [4]>([16, 1, 1, 1])];
+            tensor<fp16, [16, 64, 1, 1]> sin_1_cast_fp16 = tile(reps = var_595, x = sin_expanded_1_cast_fp16)[name = string("sin_1_cast_fp16")];
+            tensor<fp16, [16, 64, 1, 1]> var_601_cast_fp16 = mul(x = q_3_cast_fp16, y = cos_1_cast_fp16)[name = string("op_601_cast_fp16")];
+            tensor<int32, [4]> var_606_begin_0 = const()[name = string("op_606_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_606_end_0 = const()[name = string("op_606_end_0"), val = tensor<int32, [4]>([16, 32, 1, 1])];
+            tensor<bool, [4]> var_606_end_mask_0 = const()[name = string("op_606_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [16, 32, 1, 1]> var_606_cast_fp16 = slice_by_index(begin = var_606_begin_0, end = var_606_end_0, end_mask = var_606_end_mask_0, x = q_3_cast_fp16)[name = string("op_606_cast_fp16")];
+            tensor<int32, [4]> var_613_begin_0 = const()[name = string("op_613_begin_0"), val = tensor<int32, [4]>([0, 32, 0, 0])];
+            tensor<int32, [4]> var_613_end_0 = const()[name = string("op_613_end_0"), val = tensor<int32, [4]>([16, 64, 1, 1])];
+            tensor<bool, [4]> var_613_end_mask_0 = const()[name = string("op_613_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [16, 32, 1, 1]> var_613_cast_fp16 = slice_by_index(begin = var_613_begin_0, end = var_613_end_0, end_mask = var_613_end_mask_0, x = q_3_cast_fp16)[name = string("op_613_cast_fp16")];
+            fp16 const_24_promoted_to_fp16 = const()[name = string("const_24_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [16, 32, 1, 1]> var_616_cast_fp16 = mul(x = var_613_cast_fp16, y = const_24_promoted_to_fp16)[name = string("op_616_cast_fp16")];
+            bool var_618_interleave_0 = const()[name = string("op_618_interleave_0"), val = bool(false)];
+            tensor<fp16, [16, 64, 1, 1]> var_618_cast_fp16 = concat(axis = var_32, interleave = var_618_interleave_0, values = (var_616_cast_fp16, var_606_cast_fp16))[name = string("op_618_cast_fp16")];
+            tensor<fp16, [16, 64, 1, 1]> var_619_cast_fp16 = mul(x = var_618_cast_fp16, y = sin_1_cast_fp16)[name = string("op_619_cast_fp16")];
+            tensor<fp16, [16, 64, 1, 1]> q_rotated_1_cast_fp16 = add(x = var_601_cast_fp16, y = var_619_cast_fp16)[name = string("q_rotated_1_cast_fp16")];
+            tensor<fp16, [16, 64, 1, 1]> var_621_cast_fp16 = mul(x = k_3_cast_fp16, y = cos_1_cast_fp16)[name = string("op_621_cast_fp16")];
+            tensor<int32, [4]> var_626_begin_0 = const()[name = string("op_626_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_626_end_0 = const()[name = string("op_626_end_0"), val = tensor<int32, [4]>([16, 32, 1, 1])];
+            tensor<bool, [4]> var_626_end_mask_0 = const()[name = string("op_626_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [16, 32, 1, 1]> var_626_cast_fp16 = slice_by_index(begin = var_626_begin_0, end = var_626_end_0, end_mask = var_626_end_mask_0, x = k_3_cast_fp16)[name = string("op_626_cast_fp16")];
+            tensor<int32, [4]> var_633_begin_0 = const()[name = string("op_633_begin_0"), val = tensor<int32, [4]>([0, 32, 0, 0])];
+            tensor<int32, [4]> var_633_end_0 = const()[name = string("op_633_end_0"), val = tensor<int32, [4]>([16, 64, 1, 1])];
+            tensor<bool, [4]> var_633_end_mask_0 = const()[name = string("op_633_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [16, 32, 1, 1]> var_633_cast_fp16 = slice_by_index(begin = var_633_begin_0, end = var_633_end_0, end_mask = var_633_end_mask_0, x = k_3_cast_fp16)[name = string("op_633_cast_fp16")];
+            fp16 const_27_promoted_to_fp16 = const()[name = string("const_27_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [16, 32, 1, 1]> var_636_cast_fp16 = mul(x = var_633_cast_fp16, y = const_27_promoted_to_fp16)[name = string("op_636_cast_fp16")];
+            bool var_638_interleave_0 = const()[name = string("op_638_interleave_0"), val = bool(false)];
+            tensor<fp16, [16, 64, 1, 1]> var_638_cast_fp16 = concat(axis = var_32, interleave = var_638_interleave_0, values = (var_636_cast_fp16, var_626_cast_fp16))[name = string("op_638_cast_fp16")];
+            tensor<fp16, [16, 64, 1, 1]> var_639_cast_fp16 = mul(x = var_638_cast_fp16, y = sin_1_cast_fp16)[name = string("op_639_cast_fp16")];
+            tensor<fp16, [16, 64, 1, 1]> k_rotated_1_cast_fp16 = add(x = var_621_cast_fp16, y = var_639_cast_fp16)[name = string("k_rotated_1_cast_fp16")];
+            tensor<int32, [4]> var_643 = const()[name = string("op_643"), val = tensor<int32, [4]>([1, 1024, 1, 1])];
+            tensor<fp16, [1, 1024, 1, 1]> current_key_1_cast_fp16 = reshape(shape = var_643, x = k_rotated_1_cast_fp16)[name = string("current_key_1_cast_fp16")];
+            tensor<int32, [1]> var_645_axes_0 = const()[name = string("op_645_axes_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [1, 1, 256]> var_645_cast_fp16 = expand_dims(axes = var_645_axes_0, x = kv_cache_update_mask)[name = string("op_645_cast_fp16")];
+            tensor<int32, [1]> update_mask_1_axes_0 = const()[name = string("update_mask_1_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 1, 1, 256]> update_mask_1_cast_fp16 = expand_dims(axes = update_mask_1_axes_0, x = var_645_cast_fp16)[name = string("update_mask_1_cast_fp16")];
+            fp16 var_32_promoted_to_fp16 = const()[name = string("op_32_promoted_to_fp16"), val = fp16(0x1p+0)];
+            tensor<fp16, [1, 1, 1, 256]> var_647_cast_fp16 = sub(x = var_32_promoted_to_fp16, y = update_mask_1_cast_fp16)[name = string("op_647_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_648_cast_fp16 = mul(x = var_522_cast_fp16_0, y = var_647_cast_fp16)[name = string("op_648_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_649_cast_fp16 = mul(x = current_key_1_cast_fp16, y = update_mask_1_cast_fp16)[name = string("op_649_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> key_cache_updated_1_cast_fp16 = add(x = var_648_cast_fp16, y = var_649_cast_fp16)[name = string("key_cache_updated_1_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_652_cast_fp16 = mul(x = var_531_cast_fp16_0, y = var_647_cast_fp16)[name = string("op_652_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_653_cast_fp16 = mul(x = v_1_cast_fp16, y = update_mask_1_cast_fp16)[name = string("op_653_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> value_cache_updated_1_cast_fp16 = add(x = var_652_cast_fp16, y = var_653_cast_fp16)[name = string("value_cache_updated_1_cast_fp16")];
+            tensor<int32, [4]> var_655 = const()[name = string("op_655"), val = tensor<int32, [4]>([1, 16, 64, 1])];
+            tensor<fp16, [1, 16, 64, 1]> q_7_cast_fp16 = reshape(shape = var_655, x = q_rotated_1_cast_fp16)[name = string("q_7_cast_fp16")];
+            tensor<int32, [4]> var_658 = const()[name = string("op_658"), val = tensor<int32, [4]>([1, 16, 64, 256])];
+            tensor<fp16, [1, 16, 64, 256]> k_for_attn_1_cast_fp16 = reshape(shape = var_658, x = key_cache_updated_1_cast_fp16)[name = string("k_for_attn_1_cast_fp16")];
+            tensor<int32, [4]> var_660 = const()[name = string("op_660"), val = tensor<int32, [4]>([1, 16, 64, 256])];
+            tensor<fp16, [1, 16, 64, 256]> v_for_attn_1_cast_fp16 = reshape(shape = var_660, x = value_cache_updated_1_cast_fp16)[name = string("v_for_attn_1_cast_fp16")];
+            bool var_664_transpose_x_1 = const()[name = string("op_664_transpose_x_1"), val = bool(true)];
+            bool var_664_transpose_y_1 = const()[name = string("op_664_transpose_y_1"), val = bool(false)];
+            tensor<fp16, [1, 16, 1, 256]> var_664_cast_fp16 = matmul(transpose_x = var_664_transpose_x_1, transpose_y = var_664_transpose_y_1, x = q_7_cast_fp16, y = k_for_attn_1_cast_fp16)[name = string("op_664_cast_fp16")];
+            fp16 var_665_to_fp16 = const()[name = string("op_665_to_fp16"), val = fp16(0x1p-3)];
+            tensor<fp16, [1, 16, 1, 256]> attn_weights_1_cast_fp16 = mul(x = var_664_cast_fp16, y = var_665_to_fp16)[name = string("attn_weights_1_cast_fp16")];
+            tensor<int32, [1]> var_667_axes_0 = const()[name = string("op_667_axes_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [1, 1, 256]> var_667_cast_fp16 = expand_dims(axes = var_667_axes_0, x = key_padding_mask)[name = string("op_667_cast_fp16")];
+            tensor<int32, [1]> attn_mask_1_axes_0 = const()[name = string("attn_mask_1_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 1, 1, 256]> attn_mask_1_cast_fp16 = expand_dims(axes = attn_mask_1_axes_0, x = var_667_cast_fp16)[name = string("attn_mask_1_cast_fp16")];
+            tensor<fp16, [1, 16, 1, 256]> input_41_cast_fp16 = add(x = attn_weights_1_cast_fp16, y = attn_mask_1_cast_fp16)[name = string("input_41_cast_fp16")];
+            tensor<fp16, [1, 16, 1, 256]> attn_weights_3_cast_fp16 = softmax(axis = var_28, x = input_41_cast_fp16)[name = string("attn_weights_3_cast_fp16")];
+            bool attn_output_1_transpose_x_1 = const()[name = string("attn_output_1_transpose_x_1"), val = bool(false)];
+            bool attn_output_1_transpose_y_1 = const()[name = string("attn_output_1_transpose_y_1"), val = bool(true)];
+            tensor<fp16, [1, 16, 1, 64]> attn_output_1_cast_fp16 = matmul(transpose_x = attn_output_1_transpose_x_1, transpose_y = attn_output_1_transpose_y_1, x = attn_weights_3_cast_fp16, y = v_for_attn_1_cast_fp16)[name = string("attn_output_1_cast_fp16")];
+            tensor<int32, [4]> var_674 = const()[name = string("op_674"), val = tensor<int32, [4]>([0, 1, 3, 2])];
+            tensor<int32, [4]> var_676 = const()[name = string("op_676"), val = tensor<int32, [4]>([1, -1, 1, 1])];
+            tensor<fp16, [1, 16, 64, 1]> attn_output_3_cast_fp16 = transpose(perm = var_674, x = attn_output_1_cast_fp16)[name = string("transpose_11")];
+            tensor<fp16, [1, 1024, 1, 1]> input_43_cast_fp16 = reshape(shape = var_676, x = attn_output_3_cast_fp16)[name = string("input_43_cast_fp16")];
+            string x_5_pad_type_0 = const()[name = string("x_5_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> x_5_strides_0 = const()[name = string("x_5_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> x_5_pad_0 = const()[name = string("x_5_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> x_5_dilations_0 = const()[name = string("x_5_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 x_5_groups_0 = const()[name = string("x_5_groups_0"), val = int32(1)];
+            tensor<fp16, [512, 1024, 1, 1]> op_691_weight_0_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [512, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(12374016))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(12898368))))[name = string("op_691_weight_0_to_fp16_palettized")];
+            tensor<fp16, [512]> var_691_bias_0_to_fp16 = const()[name = string("op_691_bias_0_to_fp16"), val = tensor<fp16, [512]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(12898944)))];
+            tensor<fp16, [1, 512, 1, 1]> var_691_cast_fp16 = conv(bias = var_691_bias_0_to_fp16, dilations = x_5_dilations_0, groups = x_5_groups_0, pad = x_5_pad_0, pad_type = x_5_pad_type_0, strides = x_5_strides_0, weight = op_691_weight_0_to_fp16_palettized, x = input_43_cast_fp16)[name = string("op_691_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 1]> inputs_3_cast_fp16 = add(x = inputs_1_cast_fp16, y = var_691_cast_fp16)[name = string("inputs_3_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 1]> inputs_sq_3_cast_fp16 = mul(x = inputs_3_cast_fp16, y = inputs_3_cast_fp16)[name = string("inputs_sq_3_cast_fp16")];
+            tensor<int32, [1]> variance_3_axes_0 = const()[name = string("variance_3_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_3_keep_dims_0 = const()[name = string("variance_3_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 1, 1, 1]> variance_3_cast_fp16 = reduce_mean(axes = variance_3_axes_0, keep_dims = variance_3_keep_dims_0, x = inputs_sq_3_cast_fp16)[name = string("variance_3_cast_fp16")];
+            fp16 var_697_to_fp16 = const()[name = string("op_697_to_fp16"), val = fp16(0x1.5p-17)];
+            tensor<fp16, [1, 1, 1, 1]> var_698_cast_fp16 = add(x = variance_3_cast_fp16, y = var_697_to_fp16)[name = string("op_698_cast_fp16")];
+            fp32 var_699_epsilon_0 = const()[name = string("op_699_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 1, 1, 1]> var_699_cast_fp16 = rsqrt(epsilon = var_699_epsilon_0, x = var_698_cast_fp16)[name = string("op_699_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 1]> hidden_states_3_cast_fp16 = mul(x = inputs_3_cast_fp16, y = var_699_cast_fp16)[name = string("hidden_states_3_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 1]> w_3_to_fp16 = const()[name = string("w_3_to_fp16"), val = tensor<fp16, [1, 512, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(12900032)))];
+            tensor<fp16, [1, 512, 1, 1]> input_45_cast_fp16 = mul(x = w_3_to_fp16, y = hidden_states_3_cast_fp16)[name = string("input_45_cast_fp16")];
+            string input_47_pad_type_0 = const()[name = string("input_47_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> input_47_strides_0 = const()[name = string("input_47_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> input_47_pad_0 = const()[name = string("input_47_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> input_47_dilations_0 = const()[name = string("input_47_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 input_47_groups_0 = const()[name = string("input_47_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 512, 1, 1]> decoder_pre_transformer_layers_0_mlp_gate_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 512, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(12901120))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(13425472))))[name = string("decoder_pre_transformer_layers_0_mlp_gate_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> input_47_cast_fp16 = conv(dilations = input_47_dilations_0, groups = input_47_groups_0, pad = input_47_pad_0, pad_type = input_47_pad_type_0, strides = input_47_strides_0, weight = decoder_pre_transformer_layers_0_mlp_gate_proj_weight_to_fp16_palettized, x = input_45_cast_fp16)[name = string("input_47_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> var_713_cast_fp16 = silu(x = input_47_cast_fp16)[name = string("op_713_cast_fp16")];
+            string var_719_pad_type_0 = const()[name = string("op_719_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_719_strides_0 = const()[name = string("op_719_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_719_pad_0 = const()[name = string("op_719_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_719_dilations_0 = const()[name = string("op_719_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_719_groups_0 = const()[name = string("op_719_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 512, 1, 1]> decoder_pre_transformer_layers_0_mlp_up_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 512, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(13426048))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(13950400))))[name = string("decoder_pre_transformer_layers_0_mlp_up_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> var_719_cast_fp16 = conv(dilations = var_719_dilations_0, groups = var_719_groups_0, pad = var_719_pad_0, pad_type = var_719_pad_type_0, strides = var_719_strides_0, weight = decoder_pre_transformer_layers_0_mlp_up_proj_weight_to_fp16_palettized, x = input_45_cast_fp16)[name = string("op_719_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> input_49_cast_fp16 = mul(x = var_713_cast_fp16, y = var_719_cast_fp16)[name = string("input_49_cast_fp16")];
+            string x_7_pad_type_0 = const()[name = string("x_7_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> x_7_strides_0 = const()[name = string("x_7_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> x_7_pad_0 = const()[name = string("x_7_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> x_7_dilations_0 = const()[name = string("x_7_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 x_7_groups_0 = const()[name = string("x_7_groups_0"), val = int32(1)];
+            tensor<fp16, [512, 1024, 1, 1]> op_730_weight_0_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [512, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(13950976))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(14475328))))[name = string("op_730_weight_0_to_fp16_palettized")];
+            tensor<fp16, [512]> var_730_bias_0_to_fp16 = const()[name = string("op_730_bias_0_to_fp16"), val = tensor<fp16, [512]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(14475904)))];
+            tensor<fp16, [1, 512, 1, 1]> var_730_cast_fp16 = conv(bias = var_730_bias_0_to_fp16, dilations = x_7_dilations_0, groups = x_7_groups_0, pad = x_7_pad_0, pad_type = x_7_pad_type_0, strides = x_7_strides_0, weight = op_730_weight_0_to_fp16_palettized, x = input_49_cast_fp16)[name = string("op_730_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 1]> inputs_5_cast_fp16 = add(x = inputs_3_cast_fp16, y = var_730_cast_fp16)[name = string("inputs_5_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 1]> inputs_sq_5_cast_fp16 = mul(x = inputs_5_cast_fp16, y = inputs_5_cast_fp16)[name = string("inputs_sq_5_cast_fp16")];
+            tensor<int32, [1]> variance_5_axes_0 = const()[name = string("variance_5_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_5_keep_dims_0 = const()[name = string("variance_5_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 1, 1, 1]> variance_5_cast_fp16 = reduce_mean(axes = variance_5_axes_0, keep_dims = variance_5_keep_dims_0, x = inputs_sq_5_cast_fp16)[name = string("variance_5_cast_fp16")];
+            fp16 var_746_to_fp16 = const()[name = string("op_746_to_fp16"), val = fp16(0x1.5p-17)];
+            tensor<fp16, [1, 1, 1, 1]> var_747_cast_fp16 = add(x = variance_5_cast_fp16, y = var_746_to_fp16)[name = string("op_747_cast_fp16")];
+            fp32 var_748_epsilon_0 = const()[name = string("op_748_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 1, 1, 1]> var_748_cast_fp16 = rsqrt(epsilon = var_748_epsilon_0, x = var_747_cast_fp16)[name = string("op_748_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 1]> hidden_states_5_cast_fp16 = mul(x = inputs_5_cast_fp16, y = var_748_cast_fp16)[name = string("hidden_states_5_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 1]> w_5_to_fp16 = const()[name = string("w_5_to_fp16"), val = tensor<fp16, [1, 512, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(14476992)))];
+            tensor<fp16, [1, 512, 1, 1]> x_9_cast_fp16 = mul(x = w_5_to_fp16, y = hidden_states_5_cast_fp16)[name = string("x_9_cast_fp16")];
+            string q_9_pad_type_0 = const()[name = string("q_9_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> q_9_strides_0 = const()[name = string("q_9_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> q_9_pad_0 = const()[name = string("q_9_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> q_9_dilations_0 = const()[name = string("q_9_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 q_9_groups_0 = const()[name = string("q_9_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 512, 1, 1]> decoder_pre_transformer_layers_1_self_attn_q_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 512, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(14478080))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(15002432))))[name = string("decoder_pre_transformer_layers_1_self_attn_q_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> q_9_cast_fp16 = conv(dilations = q_9_dilations_0, groups = q_9_groups_0, pad = q_9_pad_0, pad_type = q_9_pad_type_0, strides = q_9_strides_0, weight = decoder_pre_transformer_layers_1_self_attn_q_proj_weight_to_fp16_palettized, x = x_9_cast_fp16)[name = string("q_9_cast_fp16")];
+            string k_5_pad_type_0 = const()[name = string("k_5_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> k_5_strides_0 = const()[name = string("k_5_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> k_5_pad_0 = const()[name = string("k_5_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> k_5_dilations_0 = const()[name = string("k_5_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 k_5_groups_0 = const()[name = string("k_5_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 512, 1, 1]> decoder_pre_transformer_layers_1_self_attn_k_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 512, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(15003008))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(15527360))))[name = string("decoder_pre_transformer_layers_1_self_attn_k_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> k_5_cast_fp16 = conv(dilations = k_5_dilations_0, groups = k_5_groups_0, pad = k_5_pad_0, pad_type = k_5_pad_type_0, strides = k_5_strides_0, weight = decoder_pre_transformer_layers_1_self_attn_k_proj_weight_to_fp16_palettized, x = x_9_cast_fp16)[name = string("k_5_cast_fp16")];
+            string v_3_pad_type_0 = const()[name = string("v_3_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> v_3_strides_0 = const()[name = string("v_3_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> v_3_pad_0 = const()[name = string("v_3_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> v_3_dilations_0 = const()[name = string("v_3_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 v_3_groups_0 = const()[name = string("v_3_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 512, 1, 1]> decoder_pre_transformer_layers_1_self_attn_v_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 512, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(15527936))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(16052288))))[name = string("decoder_pre_transformer_layers_1_self_attn_v_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> v_3_cast_fp16 = conv(dilations = v_3_dilations_0, groups = v_3_groups_0, pad = v_3_pad_0, pad_type = v_3_pad_type_0, strides = v_3_strides_0, weight = decoder_pre_transformer_layers_1_self_attn_v_proj_weight_to_fp16_palettized, x = x_9_cast_fp16)[name = string("v_3_cast_fp16")];
+            tensor<int32, [4]> var_780 = const()[name = string("op_780"), val = tensor<int32, [4]>([16, 64, 1, 1])];
+            tensor<fp16, [16, 64, 1, 1]> q_11_cast_fp16 = reshape(shape = var_780, x = q_9_cast_fp16)[name = string("q_11_cast_fp16")];
+            tensor<int32, [4]> var_785 = const()[name = string("op_785"), val = tensor<int32, [4]>([16, 64, 1, 1])];
+            tensor<fp16, [16, 64, 1, 1]> k_7_cast_fp16 = reshape(shape = var_785, x = k_5_cast_fp16)[name = string("k_7_cast_fp16")];
+            tensor<fp16, [16, 64, 1, 1]> var_797_cast_fp16 = mul(x = q_11_cast_fp16, y = cos_1_cast_fp16)[name = string("op_797_cast_fp16")];
+            tensor<int32, [4]> var_802_begin_0 = const()[name = string("op_802_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_802_end_0 = const()[name = string("op_802_end_0"), val = tensor<int32, [4]>([16, 32, 1, 1])];
+            tensor<bool, [4]> var_802_end_mask_0 = const()[name = string("op_802_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [16, 32, 1, 1]> var_802_cast_fp16 = slice_by_index(begin = var_802_begin_0, end = var_802_end_0, end_mask = var_802_end_mask_0, x = q_11_cast_fp16)[name = string("op_802_cast_fp16")];
+            tensor<int32, [4]> var_809_begin_0 = const()[name = string("op_809_begin_0"), val = tensor<int32, [4]>([0, 32, 0, 0])];
+            tensor<int32, [4]> var_809_end_0 = const()[name = string("op_809_end_0"), val = tensor<int32, [4]>([16, 64, 1, 1])];
+            tensor<bool, [4]> var_809_end_mask_0 = const()[name = string("op_809_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [16, 32, 1, 1]> var_809_cast_fp16 = slice_by_index(begin = var_809_begin_0, end = var_809_end_0, end_mask = var_809_end_mask_0, x = q_11_cast_fp16)[name = string("op_809_cast_fp16")];
+            fp16 const_32_promoted_to_fp16 = const()[name = string("const_32_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [16, 32, 1, 1]> var_812_cast_fp16 = mul(x = var_809_cast_fp16, y = const_32_promoted_to_fp16)[name = string("op_812_cast_fp16")];
+            bool var_814_interleave_0 = const()[name = string("op_814_interleave_0"), val = bool(false)];
+            tensor<fp16, [16, 64, 1, 1]> var_814_cast_fp16 = concat(axis = var_32, interleave = var_814_interleave_0, values = (var_812_cast_fp16, var_802_cast_fp16))[name = string("op_814_cast_fp16")];
+            tensor<fp16, [16, 64, 1, 1]> var_815_cast_fp16 = mul(x = var_814_cast_fp16, y = sin_1_cast_fp16)[name = string("op_815_cast_fp16")];
+            tensor<fp16, [16, 64, 1, 1]> q_rotated_3_cast_fp16 = add(x = var_797_cast_fp16, y = var_815_cast_fp16)[name = string("q_rotated_3_cast_fp16")];
+            tensor<fp16, [16, 64, 1, 1]> var_817_cast_fp16 = mul(x = k_7_cast_fp16, y = cos_1_cast_fp16)[name = string("op_817_cast_fp16")];
+            tensor<int32, [4]> var_822_begin_0 = const()[name = string("op_822_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_822_end_0 = const()[name = string("op_822_end_0"), val = tensor<int32, [4]>([16, 32, 1, 1])];
+            tensor<bool, [4]> var_822_end_mask_0 = const()[name = string("op_822_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [16, 32, 1, 1]> var_822_cast_fp16 = slice_by_index(begin = var_822_begin_0, end = var_822_end_0, end_mask = var_822_end_mask_0, x = k_7_cast_fp16)[name = string("op_822_cast_fp16")];
+            tensor<int32, [4]> var_829_begin_0 = const()[name = string("op_829_begin_0"), val = tensor<int32, [4]>([0, 32, 0, 0])];
+            tensor<int32, [4]> var_829_end_0 = const()[name = string("op_829_end_0"), val = tensor<int32, [4]>([16, 64, 1, 1])];
+            tensor<bool, [4]> var_829_end_mask_0 = const()[name = string("op_829_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [16, 32, 1, 1]> var_829_cast_fp16 = slice_by_index(begin = var_829_begin_0, end = var_829_end_0, end_mask = var_829_end_mask_0, x = k_7_cast_fp16)[name = string("op_829_cast_fp16")];
+            fp16 const_35_promoted_to_fp16 = const()[name = string("const_35_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [16, 32, 1, 1]> var_832_cast_fp16 = mul(x = var_829_cast_fp16, y = const_35_promoted_to_fp16)[name = string("op_832_cast_fp16")];
+            bool var_834_interleave_0 = const()[name = string("op_834_interleave_0"), val = bool(false)];
+            tensor<fp16, [16, 64, 1, 1]> var_834_cast_fp16 = concat(axis = var_32, interleave = var_834_interleave_0, values = (var_832_cast_fp16, var_822_cast_fp16))[name = string("op_834_cast_fp16")];
+            tensor<fp16, [16, 64, 1, 1]> var_835_cast_fp16 = mul(x = var_834_cast_fp16, y = sin_1_cast_fp16)[name = string("op_835_cast_fp16")];
+            tensor<fp16, [16, 64, 1, 1]> k_rotated_3_cast_fp16 = add(x = var_817_cast_fp16, y = var_835_cast_fp16)[name = string("k_rotated_3_cast_fp16")];
+            tensor<int32, [4]> var_839 = const()[name = string("op_839"), val = tensor<int32, [4]>([1, 1024, 1, 1])];
+            tensor<fp16, [1, 1024, 1, 1]> current_key_3_cast_fp16 = reshape(shape = var_839, x = k_rotated_3_cast_fp16)[name = string("current_key_3_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_844_cast_fp16 = mul(x = var_522_cast_fp16_1, y = var_647_cast_fp16)[name = string("op_844_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_845_cast_fp16 = mul(x = current_key_3_cast_fp16, y = update_mask_1_cast_fp16)[name = string("op_845_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> key_cache_updated_3_cast_fp16 = add(x = var_844_cast_fp16, y = var_845_cast_fp16)[name = string("key_cache_updated_3_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_848_cast_fp16 = mul(x = var_531_cast_fp16_1, y = var_647_cast_fp16)[name = string("op_848_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_849_cast_fp16 = mul(x = v_3_cast_fp16, y = update_mask_1_cast_fp16)[name = string("op_849_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> value_cache_updated_3_cast_fp16 = add(x = var_848_cast_fp16, y = var_849_cast_fp16)[name = string("value_cache_updated_3_cast_fp16")];
+            tensor<int32, [4]> var_851 = const()[name = string("op_851"), val = tensor<int32, [4]>([1, 16, 64, 1])];
+            tensor<fp16, [1, 16, 64, 1]> q_15_cast_fp16 = reshape(shape = var_851, x = q_rotated_3_cast_fp16)[name = string("q_15_cast_fp16")];
+            tensor<int32, [4]> var_854 = const()[name = string("op_854"), val = tensor<int32, [4]>([1, 16, 64, 256])];
+            tensor<fp16, [1, 16, 64, 256]> k_for_attn_3_cast_fp16 = reshape(shape = var_854, x = key_cache_updated_3_cast_fp16)[name = string("k_for_attn_3_cast_fp16")];
+            tensor<int32, [4]> var_856 = const()[name = string("op_856"), val = tensor<int32, [4]>([1, 16, 64, 256])];
+            tensor<fp16, [1, 16, 64, 256]> v_for_attn_3_cast_fp16 = reshape(shape = var_856, x = value_cache_updated_3_cast_fp16)[name = string("v_for_attn_3_cast_fp16")];
+            bool var_860_transpose_x_1 = const()[name = string("op_860_transpose_x_1"), val = bool(true)];
+            bool var_860_transpose_y_1 = const()[name = string("op_860_transpose_y_1"), val = bool(false)];
+            tensor<fp16, [1, 16, 1, 256]> var_860_cast_fp16 = matmul(transpose_x = var_860_transpose_x_1, transpose_y = var_860_transpose_y_1, x = q_15_cast_fp16, y = k_for_attn_3_cast_fp16)[name = string("op_860_cast_fp16")];
+            fp16 var_861_to_fp16 = const()[name = string("op_861_to_fp16"), val = fp16(0x1p-3)];
+            tensor<fp16, [1, 16, 1, 256]> attn_weights_5_cast_fp16 = mul(x = var_860_cast_fp16, y = var_861_to_fp16)[name = string("attn_weights_5_cast_fp16")];
+            tensor<fp16, [1, 16, 1, 256]> input_51_cast_fp16 = add(x = attn_weights_5_cast_fp16, y = attn_mask_1_cast_fp16)[name = string("input_51_cast_fp16")];
+            tensor<fp16, [1, 16, 1, 256]> attn_weights_7_cast_fp16 = softmax(axis = var_28, x = input_51_cast_fp16)[name = string("attn_weights_7_cast_fp16")];
+            bool attn_output_5_transpose_x_1 = const()[name = string("attn_output_5_transpose_x_1"), val = bool(false)];
+            bool attn_output_5_transpose_y_1 = const()[name = string("attn_output_5_transpose_y_1"), val = bool(true)];
+            tensor<fp16, [1, 16, 1, 64]> attn_output_5_cast_fp16 = matmul(transpose_x = attn_output_5_transpose_x_1, transpose_y = attn_output_5_transpose_y_1, x = attn_weights_7_cast_fp16, y = v_for_attn_3_cast_fp16)[name = string("attn_output_5_cast_fp16")];
+            tensor<int32, [4]> var_870 = const()[name = string("op_870"), val = tensor<int32, [4]>([0, 1, 3, 2])];
+            tensor<int32, [4]> var_872 = const()[name = string("op_872"), val = tensor<int32, [4]>([1, -1, 1, 1])];
+            tensor<fp16, [1, 16, 64, 1]> attn_output_7_cast_fp16 = transpose(perm = var_870, x = attn_output_5_cast_fp16)[name = string("transpose_10")];
+            tensor<fp16, [1, 1024, 1, 1]> input_53_cast_fp16 = reshape(shape = var_872, x = attn_output_7_cast_fp16)[name = string("input_53_cast_fp16")];
+            string x_11_pad_type_0 = const()[name = string("x_11_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> x_11_strides_0 = const()[name = string("x_11_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> x_11_pad_0 = const()[name = string("x_11_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> x_11_dilations_0 = const()[name = string("x_11_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 x_11_groups_0 = const()[name = string("x_11_groups_0"), val = int32(1)];
+            tensor<fp16, [512, 1024, 1, 1]> op_887_weight_0_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [512, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(16052864))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(16577216))))[name = string("op_887_weight_0_to_fp16_palettized")];
+            tensor<fp16, [512]> var_887_bias_0_to_fp16 = const()[name = string("op_887_bias_0_to_fp16"), val = tensor<fp16, [512]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(16577792)))];
+            tensor<fp16, [1, 512, 1, 1]> var_887_cast_fp16 = conv(bias = var_887_bias_0_to_fp16, dilations = x_11_dilations_0, groups = x_11_groups_0, pad = x_11_pad_0, pad_type = x_11_pad_type_0, strides = x_11_strides_0, weight = op_887_weight_0_to_fp16_palettized, x = input_53_cast_fp16)[name = string("op_887_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 1]> inputs_7_cast_fp16 = add(x = inputs_5_cast_fp16, y = var_887_cast_fp16)[name = string("inputs_7_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 1]> inputs_sq_7_cast_fp16 = mul(x = inputs_7_cast_fp16, y = inputs_7_cast_fp16)[name = string("inputs_sq_7_cast_fp16")];
+            tensor<int32, [1]> variance_7_axes_0 = const()[name = string("variance_7_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_7_keep_dims_0 = const()[name = string("variance_7_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 1, 1, 1]> variance_7_cast_fp16 = reduce_mean(axes = variance_7_axes_0, keep_dims = variance_7_keep_dims_0, x = inputs_sq_7_cast_fp16)[name = string("variance_7_cast_fp16")];
+            fp16 var_893_to_fp16 = const()[name = string("op_893_to_fp16"), val = fp16(0x1.5p-17)];
+            tensor<fp16, [1, 1, 1, 1]> var_894_cast_fp16 = add(x = variance_7_cast_fp16, y = var_893_to_fp16)[name = string("op_894_cast_fp16")];
+            fp32 var_895_epsilon_0 = const()[name = string("op_895_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 1, 1, 1]> var_895_cast_fp16 = rsqrt(epsilon = var_895_epsilon_0, x = var_894_cast_fp16)[name = string("op_895_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 1]> hidden_states_7_cast_fp16 = mul(x = inputs_7_cast_fp16, y = var_895_cast_fp16)[name = string("hidden_states_7_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 1]> w_7_to_fp16 = const()[name = string("w_7_to_fp16"), val = tensor<fp16, [1, 512, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(16578880)))];
+            tensor<fp16, [1, 512, 1, 1]> input_55_cast_fp16 = mul(x = w_7_to_fp16, y = hidden_states_7_cast_fp16)[name = string("input_55_cast_fp16")];
+            string input_57_pad_type_0 = const()[name = string("input_57_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> input_57_strides_0 = const()[name = string("input_57_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> input_57_pad_0 = const()[name = string("input_57_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> input_57_dilations_0 = const()[name = string("input_57_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 input_57_groups_0 = const()[name = string("input_57_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 512, 1, 1]> decoder_pre_transformer_layers_1_mlp_gate_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 512, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(16579968))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(17104320))))[name = string("decoder_pre_transformer_layers_1_mlp_gate_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> input_57_cast_fp16 = conv(dilations = input_57_dilations_0, groups = input_57_groups_0, pad = input_57_pad_0, pad_type = input_57_pad_type_0, strides = input_57_strides_0, weight = decoder_pre_transformer_layers_1_mlp_gate_proj_weight_to_fp16_palettized, x = input_55_cast_fp16)[name = string("input_57_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> var_909_cast_fp16 = silu(x = input_57_cast_fp16)[name = string("op_909_cast_fp16")];
+            string var_915_pad_type_0 = const()[name = string("op_915_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_915_strides_0 = const()[name = string("op_915_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_915_pad_0 = const()[name = string("op_915_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_915_dilations_0 = const()[name = string("op_915_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_915_groups_0 = const()[name = string("op_915_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 512, 1, 1]> decoder_pre_transformer_layers_1_mlp_up_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 512, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(17104896))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(17629248))))[name = string("decoder_pre_transformer_layers_1_mlp_up_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> var_915_cast_fp16 = conv(dilations = var_915_dilations_0, groups = var_915_groups_0, pad = var_915_pad_0, pad_type = var_915_pad_type_0, strides = var_915_strides_0, weight = decoder_pre_transformer_layers_1_mlp_up_proj_weight_to_fp16_palettized, x = input_55_cast_fp16)[name = string("op_915_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> input_59_cast_fp16 = mul(x = var_909_cast_fp16, y = var_915_cast_fp16)[name = string("input_59_cast_fp16")];
+            string x_13_pad_type_0 = const()[name = string("x_13_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> x_13_strides_0 = const()[name = string("x_13_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> x_13_pad_0 = const()[name = string("x_13_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> x_13_dilations_0 = const()[name = string("x_13_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 x_13_groups_0 = const()[name = string("x_13_groups_0"), val = int32(1)];
+            tensor<fp16, [512, 1024, 1, 1]> op_926_weight_0_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [512, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(17629824))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(18154176))))[name = string("op_926_weight_0_to_fp16_palettized")];
+            tensor<fp16, [512]> var_926_bias_0_to_fp16 = const()[name = string("op_926_bias_0_to_fp16"), val = tensor<fp16, [512]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(18154752)))];
+            tensor<fp16, [1, 512, 1, 1]> var_926_cast_fp16 = conv(bias = var_926_bias_0_to_fp16, dilations = x_13_dilations_0, groups = x_13_groups_0, pad = x_13_pad_0, pad_type = x_13_pad_type_0, strides = x_13_strides_0, weight = op_926_weight_0_to_fp16_palettized, x = input_59_cast_fp16)[name = string("op_926_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 1]> inputs_9_cast_fp16 = add(x = inputs_7_cast_fp16, y = var_926_cast_fp16)[name = string("inputs_9_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 1]> inputs_sq_9_cast_fp16 = mul(x = inputs_9_cast_fp16, y = inputs_9_cast_fp16)[name = string("inputs_sq_9_cast_fp16")];
+            tensor<int32, [1]> variance_9_axes_0 = const()[name = string("variance_9_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_9_keep_dims_0 = const()[name = string("variance_9_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 1, 1, 1]> variance_9_cast_fp16 = reduce_mean(axes = variance_9_axes_0, keep_dims = variance_9_keep_dims_0, x = inputs_sq_9_cast_fp16)[name = string("variance_9_cast_fp16")];
+            fp16 var_942_to_fp16 = const()[name = string("op_942_to_fp16"), val = fp16(0x1.5p-17)];
+            tensor<fp16, [1, 1, 1, 1]> var_943_cast_fp16 = add(x = variance_9_cast_fp16, y = var_942_to_fp16)[name = string("op_943_cast_fp16")];
+            fp32 var_944_epsilon_0 = const()[name = string("op_944_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 1, 1, 1]> var_944_cast_fp16 = rsqrt(epsilon = var_944_epsilon_0, x = var_943_cast_fp16)[name = string("op_944_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 1]> hidden_states_9_cast_fp16 = mul(x = inputs_9_cast_fp16, y = var_944_cast_fp16)[name = string("hidden_states_9_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 1]> w_9_to_fp16 = const()[name = string("w_9_to_fp16"), val = tensor<fp16, [1, 512, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(18155840)))];
+            tensor<fp16, [1, 512, 1, 1]> x_15_cast_fp16 = mul(x = w_9_to_fp16, y = hidden_states_9_cast_fp16)[name = string("x_15_cast_fp16")];
+            string q_17_pad_type_0 = const()[name = string("q_17_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> q_17_strides_0 = const()[name = string("q_17_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> q_17_pad_0 = const()[name = string("q_17_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> q_17_dilations_0 = const()[name = string("q_17_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 q_17_groups_0 = const()[name = string("q_17_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 512, 1, 1]> decoder_pre_transformer_layers_2_self_attn_q_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 512, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(18156928))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(18681280))))[name = string("decoder_pre_transformer_layers_2_self_attn_q_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> q_17_cast_fp16 = conv(dilations = q_17_dilations_0, groups = q_17_groups_0, pad = q_17_pad_0, pad_type = q_17_pad_type_0, strides = q_17_strides_0, weight = decoder_pre_transformer_layers_2_self_attn_q_proj_weight_to_fp16_palettized, x = x_15_cast_fp16)[name = string("q_17_cast_fp16")];
+            string k_9_pad_type_0 = const()[name = string("k_9_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> k_9_strides_0 = const()[name = string("k_9_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> k_9_pad_0 = const()[name = string("k_9_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> k_9_dilations_0 = const()[name = string("k_9_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 k_9_groups_0 = const()[name = string("k_9_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 512, 1, 1]> decoder_pre_transformer_layers_2_self_attn_k_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 512, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(18681856))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(19206208))))[name = string("decoder_pre_transformer_layers_2_self_attn_k_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> k_9_cast_fp16 = conv(dilations = k_9_dilations_0, groups = k_9_groups_0, pad = k_9_pad_0, pad_type = k_9_pad_type_0, strides = k_9_strides_0, weight = decoder_pre_transformer_layers_2_self_attn_k_proj_weight_to_fp16_palettized, x = x_15_cast_fp16)[name = string("k_9_cast_fp16")];
+            string v_5_pad_type_0 = const()[name = string("v_5_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> v_5_strides_0 = const()[name = string("v_5_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> v_5_pad_0 = const()[name = string("v_5_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> v_5_dilations_0 = const()[name = string("v_5_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 v_5_groups_0 = const()[name = string("v_5_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 512, 1, 1]> decoder_pre_transformer_layers_2_self_attn_v_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 512, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(19206784))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(19731136))))[name = string("decoder_pre_transformer_layers_2_self_attn_v_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> v_5_cast_fp16 = conv(dilations = v_5_dilations_0, groups = v_5_groups_0, pad = v_5_pad_0, pad_type = v_5_pad_type_0, strides = v_5_strides_0, weight = decoder_pre_transformer_layers_2_self_attn_v_proj_weight_to_fp16_palettized, x = x_15_cast_fp16)[name = string("v_5_cast_fp16")];
+            tensor<int32, [4]> var_976 = const()[name = string("op_976"), val = tensor<int32, [4]>([16, 64, 1, 1])];
+            tensor<fp16, [16, 64, 1, 1]> q_19_cast_fp16 = reshape(shape = var_976, x = q_17_cast_fp16)[name = string("q_19_cast_fp16")];
+            tensor<int32, [4]> var_981 = const()[name = string("op_981"), val = tensor<int32, [4]>([16, 64, 1, 1])];
+            tensor<fp16, [16, 64, 1, 1]> k_11_cast_fp16 = reshape(shape = var_981, x = k_9_cast_fp16)[name = string("k_11_cast_fp16")];
+            tensor<fp16, [16, 64, 1, 1]> var_993_cast_fp16 = mul(x = q_19_cast_fp16, y = cos_1_cast_fp16)[name = string("op_993_cast_fp16")];
+            tensor<int32, [4]> var_998_begin_0 = const()[name = string("op_998_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_998_end_0 = const()[name = string("op_998_end_0"), val = tensor<int32, [4]>([16, 32, 1, 1])];
+            tensor<bool, [4]> var_998_end_mask_0 = const()[name = string("op_998_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [16, 32, 1, 1]> var_998_cast_fp16 = slice_by_index(begin = var_998_begin_0, end = var_998_end_0, end_mask = var_998_end_mask_0, x = q_19_cast_fp16)[name = string("op_998_cast_fp16")];
+            tensor<int32, [4]> var_1005_begin_0 = const()[name = string("op_1005_begin_0"), val = tensor<int32, [4]>([0, 32, 0, 0])];
+            tensor<int32, [4]> var_1005_end_0 = const()[name = string("op_1005_end_0"), val = tensor<int32, [4]>([16, 64, 1, 1])];
+            tensor<bool, [4]> var_1005_end_mask_0 = const()[name = string("op_1005_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [16, 32, 1, 1]> var_1005_cast_fp16 = slice_by_index(begin = var_1005_begin_0, end = var_1005_end_0, end_mask = var_1005_end_mask_0, x = q_19_cast_fp16)[name = string("op_1005_cast_fp16")];
+            fp16 const_40_promoted_to_fp16 = const()[name = string("const_40_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [16, 32, 1, 1]> var_1008_cast_fp16 = mul(x = var_1005_cast_fp16, y = const_40_promoted_to_fp16)[name = string("op_1008_cast_fp16")];
+            bool var_1010_interleave_0 = const()[name = string("op_1010_interleave_0"), val = bool(false)];
+            tensor<fp16, [16, 64, 1, 1]> var_1010_cast_fp16 = concat(axis = var_32, interleave = var_1010_interleave_0, values = (var_1008_cast_fp16, var_998_cast_fp16))[name = string("op_1010_cast_fp16")];
+            tensor<fp16, [16, 64, 1, 1]> var_1011_cast_fp16 = mul(x = var_1010_cast_fp16, y = sin_1_cast_fp16)[name = string("op_1011_cast_fp16")];
+            tensor<fp16, [16, 64, 1, 1]> q_rotated_5_cast_fp16 = add(x = var_993_cast_fp16, y = var_1011_cast_fp16)[name = string("q_rotated_5_cast_fp16")];
+            tensor<fp16, [16, 64, 1, 1]> var_1013_cast_fp16 = mul(x = k_11_cast_fp16, y = cos_1_cast_fp16)[name = string("op_1013_cast_fp16")];
+            tensor<int32, [4]> var_1018_begin_0 = const()[name = string("op_1018_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_1018_end_0 = const()[name = string("op_1018_end_0"), val = tensor<int32, [4]>([16, 32, 1, 1])];
+            tensor<bool, [4]> var_1018_end_mask_0 = const()[name = string("op_1018_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [16, 32, 1, 1]> var_1018_cast_fp16 = slice_by_index(begin = var_1018_begin_0, end = var_1018_end_0, end_mask = var_1018_end_mask_0, x = k_11_cast_fp16)[name = string("op_1018_cast_fp16")];
+            tensor<int32, [4]> var_1025_begin_0 = const()[name = string("op_1025_begin_0"), val = tensor<int32, [4]>([0, 32, 0, 0])];
+            tensor<int32, [4]> var_1025_end_0 = const()[name = string("op_1025_end_0"), val = tensor<int32, [4]>([16, 64, 1, 1])];
+            tensor<bool, [4]> var_1025_end_mask_0 = const()[name = string("op_1025_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [16, 32, 1, 1]> var_1025_cast_fp16 = slice_by_index(begin = var_1025_begin_0, end = var_1025_end_0, end_mask = var_1025_end_mask_0, x = k_11_cast_fp16)[name = string("op_1025_cast_fp16")];
+            fp16 const_43_promoted_to_fp16 = const()[name = string("const_43_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [16, 32, 1, 1]> var_1028_cast_fp16 = mul(x = var_1025_cast_fp16, y = const_43_promoted_to_fp16)[name = string("op_1028_cast_fp16")];
+            bool var_1030_interleave_0 = const()[name = string("op_1030_interleave_0"), val = bool(false)];
+            tensor<fp16, [16, 64, 1, 1]> var_1030_cast_fp16 = concat(axis = var_32, interleave = var_1030_interleave_0, values = (var_1028_cast_fp16, var_1018_cast_fp16))[name = string("op_1030_cast_fp16")];
+            tensor<fp16, [16, 64, 1, 1]> var_1031_cast_fp16 = mul(x = var_1030_cast_fp16, y = sin_1_cast_fp16)[name = string("op_1031_cast_fp16")];
+            tensor<fp16, [16, 64, 1, 1]> k_rotated_5_cast_fp16 = add(x = var_1013_cast_fp16, y = var_1031_cast_fp16)[name = string("k_rotated_5_cast_fp16")];
+            tensor<int32, [4]> var_1035 = const()[name = string("op_1035"), val = tensor<int32, [4]>([1, 1024, 1, 1])];
+            tensor<fp16, [1, 1024, 1, 1]> current_key_5_cast_fp16 = reshape(shape = var_1035, x = k_rotated_5_cast_fp16)[name = string("current_key_5_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_1040_cast_fp16 = mul(x = var_522_cast_fp16_2, y = var_647_cast_fp16)[name = string("op_1040_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_1041_cast_fp16 = mul(x = current_key_5_cast_fp16, y = update_mask_1_cast_fp16)[name = string("op_1041_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> key_cache_updated_5_cast_fp16 = add(x = var_1040_cast_fp16, y = var_1041_cast_fp16)[name = string("key_cache_updated_5_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_1044_cast_fp16 = mul(x = var_531_cast_fp16_2, y = var_647_cast_fp16)[name = string("op_1044_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_1045_cast_fp16 = mul(x = v_5_cast_fp16, y = update_mask_1_cast_fp16)[name = string("op_1045_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> value_cache_updated_5_cast_fp16 = add(x = var_1044_cast_fp16, y = var_1045_cast_fp16)[name = string("value_cache_updated_5_cast_fp16")];
+            tensor<int32, [4]> var_1047 = const()[name = string("op_1047"), val = tensor<int32, [4]>([1, 16, 64, 1])];
+            tensor<fp16, [1, 16, 64, 1]> q_23_cast_fp16 = reshape(shape = var_1047, x = q_rotated_5_cast_fp16)[name = string("q_23_cast_fp16")];
+            tensor<int32, [4]> var_1050 = const()[name = string("op_1050"), val = tensor<int32, [4]>([1, 16, 64, 256])];
+            tensor<fp16, [1, 16, 64, 256]> k_for_attn_5_cast_fp16 = reshape(shape = var_1050, x = key_cache_updated_5_cast_fp16)[name = string("k_for_attn_5_cast_fp16")];
+            tensor<int32, [4]> var_1052 = const()[name = string("op_1052"), val = tensor<int32, [4]>([1, 16, 64, 256])];
+            tensor<fp16, [1, 16, 64, 256]> v_for_attn_5_cast_fp16 = reshape(shape = var_1052, x = value_cache_updated_5_cast_fp16)[name = string("v_for_attn_5_cast_fp16")];
+            bool var_1056_transpose_x_1 = const()[name = string("op_1056_transpose_x_1"), val = bool(true)];
+            bool var_1056_transpose_y_1 = const()[name = string("op_1056_transpose_y_1"), val = bool(false)];
+            tensor<fp16, [1, 16, 1, 256]> var_1056_cast_fp16 = matmul(transpose_x = var_1056_transpose_x_1, transpose_y = var_1056_transpose_y_1, x = q_23_cast_fp16, y = k_for_attn_5_cast_fp16)[name = string("op_1056_cast_fp16")];
+            fp16 var_1057_to_fp16 = const()[name = string("op_1057_to_fp16"), val = fp16(0x1p-3)];
+            tensor<fp16, [1, 16, 1, 256]> attn_weights_9_cast_fp16 = mul(x = var_1056_cast_fp16, y = var_1057_to_fp16)[name = string("attn_weights_9_cast_fp16")];
+            tensor<fp16, [1, 16, 1, 256]> input_61_cast_fp16 = add(x = attn_weights_9_cast_fp16, y = attn_mask_1_cast_fp16)[name = string("input_61_cast_fp16")];
+            tensor<fp16, [1, 16, 1, 256]> attn_weights_11_cast_fp16 = softmax(axis = var_28, x = input_61_cast_fp16)[name = string("attn_weights_11_cast_fp16")];
+            bool attn_output_9_transpose_x_1 = const()[name = string("attn_output_9_transpose_x_1"), val = bool(false)];
+            bool attn_output_9_transpose_y_1 = const()[name = string("attn_output_9_transpose_y_1"), val = bool(true)];
+            tensor<fp16, [1, 16, 1, 64]> attn_output_9_cast_fp16 = matmul(transpose_x = attn_output_9_transpose_x_1, transpose_y = attn_output_9_transpose_y_1, x = attn_weights_11_cast_fp16, y = v_for_attn_5_cast_fp16)[name = string("attn_output_9_cast_fp16")];
+            tensor<int32, [4]> var_1066 = const()[name = string("op_1066"), val = tensor<int32, [4]>([0, 1, 3, 2])];
+            tensor<int32, [4]> var_1068 = const()[name = string("op_1068"), val = tensor<int32, [4]>([1, -1, 1, 1])];
+            tensor<fp16, [1, 16, 64, 1]> attn_output_11_cast_fp16 = transpose(perm = var_1066, x = attn_output_9_cast_fp16)[name = string("transpose_9")];
+            tensor<fp16, [1, 1024, 1, 1]> input_63_cast_fp16 = reshape(shape = var_1068, x = attn_output_11_cast_fp16)[name = string("input_63_cast_fp16")];
+            string x_17_pad_type_0 = const()[name = string("x_17_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> x_17_strides_0 = const()[name = string("x_17_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> x_17_pad_0 = const()[name = string("x_17_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> x_17_dilations_0 = const()[name = string("x_17_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 x_17_groups_0 = const()[name = string("x_17_groups_0"), val = int32(1)];
+            tensor<fp16, [512, 1024, 1, 1]> op_1083_weight_0_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [512, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(19731712))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(20256064))))[name = string("op_1083_weight_0_to_fp16_palettized")];
+            tensor<fp16, [512]> var_1083_bias_0_to_fp16 = const()[name = string("op_1083_bias_0_to_fp16"), val = tensor<fp16, [512]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(20256640)))];
+            tensor<fp16, [1, 512, 1, 1]> var_1083_cast_fp16 = conv(bias = var_1083_bias_0_to_fp16, dilations = x_17_dilations_0, groups = x_17_groups_0, pad = x_17_pad_0, pad_type = x_17_pad_type_0, strides = x_17_strides_0, weight = op_1083_weight_0_to_fp16_palettized, x = input_63_cast_fp16)[name = string("op_1083_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 1]> inputs_11_cast_fp16 = add(x = inputs_9_cast_fp16, y = var_1083_cast_fp16)[name = string("inputs_11_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 1]> inputs_sq_11_cast_fp16 = mul(x = inputs_11_cast_fp16, y = inputs_11_cast_fp16)[name = string("inputs_sq_11_cast_fp16")];
+            tensor<int32, [1]> variance_11_axes_0 = const()[name = string("variance_11_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_11_keep_dims_0 = const()[name = string("variance_11_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 1, 1, 1]> variance_11_cast_fp16 = reduce_mean(axes = variance_11_axes_0, keep_dims = variance_11_keep_dims_0, x = inputs_sq_11_cast_fp16)[name = string("variance_11_cast_fp16")];
+            fp16 var_1089_to_fp16 = const()[name = string("op_1089_to_fp16"), val = fp16(0x1.5p-17)];
+            tensor<fp16, [1, 1, 1, 1]> var_1090_cast_fp16 = add(x = variance_11_cast_fp16, y = var_1089_to_fp16)[name = string("op_1090_cast_fp16")];
+            fp32 var_1091_epsilon_0 = const()[name = string("op_1091_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 1, 1, 1]> var_1091_cast_fp16 = rsqrt(epsilon = var_1091_epsilon_0, x = var_1090_cast_fp16)[name = string("op_1091_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 1]> hidden_states_11_cast_fp16 = mul(x = inputs_11_cast_fp16, y = var_1091_cast_fp16)[name = string("hidden_states_11_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 1]> w_11_to_fp16 = const()[name = string("w_11_to_fp16"), val = tensor<fp16, [1, 512, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(20257728)))];
+            tensor<fp16, [1, 512, 1, 1]> input_65_cast_fp16 = mul(x = w_11_to_fp16, y = hidden_states_11_cast_fp16)[name = string("input_65_cast_fp16")];
+            string input_67_pad_type_0 = const()[name = string("input_67_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> input_67_strides_0 = const()[name = string("input_67_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> input_67_pad_0 = const()[name = string("input_67_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> input_67_dilations_0 = const()[name = string("input_67_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 input_67_groups_0 = const()[name = string("input_67_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 512, 1, 1]> decoder_pre_transformer_layers_2_mlp_gate_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 512, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(20258816))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(20783168))))[name = string("decoder_pre_transformer_layers_2_mlp_gate_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> input_67_cast_fp16 = conv(dilations = input_67_dilations_0, groups = input_67_groups_0, pad = input_67_pad_0, pad_type = input_67_pad_type_0, strides = input_67_strides_0, weight = decoder_pre_transformer_layers_2_mlp_gate_proj_weight_to_fp16_palettized, x = input_65_cast_fp16)[name = string("input_67_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> var_1105_cast_fp16 = silu(x = input_67_cast_fp16)[name = string("op_1105_cast_fp16")];
+            string var_1111_pad_type_0 = const()[name = string("op_1111_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_1111_strides_0 = const()[name = string("op_1111_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_1111_pad_0 = const()[name = string("op_1111_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_1111_dilations_0 = const()[name = string("op_1111_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_1111_groups_0 = const()[name = string("op_1111_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 512, 1, 1]> decoder_pre_transformer_layers_2_mlp_up_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 512, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(20783744))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(21308096))))[name = string("decoder_pre_transformer_layers_2_mlp_up_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> var_1111_cast_fp16 = conv(dilations = var_1111_dilations_0, groups = var_1111_groups_0, pad = var_1111_pad_0, pad_type = var_1111_pad_type_0, strides = var_1111_strides_0, weight = decoder_pre_transformer_layers_2_mlp_up_proj_weight_to_fp16_palettized, x = input_65_cast_fp16)[name = string("op_1111_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> input_69_cast_fp16 = mul(x = var_1105_cast_fp16, y = var_1111_cast_fp16)[name = string("input_69_cast_fp16")];
+            string x_19_pad_type_0 = const()[name = string("x_19_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> x_19_strides_0 = const()[name = string("x_19_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> x_19_pad_0 = const()[name = string("x_19_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> x_19_dilations_0 = const()[name = string("x_19_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 x_19_groups_0 = const()[name = string("x_19_groups_0"), val = int32(1)];
+            tensor<fp16, [512, 1024, 1, 1]> op_1122_weight_0_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [512, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(21308672))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(21833024))))[name = string("op_1122_weight_0_to_fp16_palettized")];
+            tensor<fp16, [512]> var_1122_bias_0_to_fp16 = const()[name = string("op_1122_bias_0_to_fp16"), val = tensor<fp16, [512]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(21833600)))];
+            tensor<fp16, [1, 512, 1, 1]> var_1122_cast_fp16 = conv(bias = var_1122_bias_0_to_fp16, dilations = x_19_dilations_0, groups = x_19_groups_0, pad = x_19_pad_0, pad_type = x_19_pad_type_0, strides = x_19_strides_0, weight = op_1122_weight_0_to_fp16_palettized, x = input_69_cast_fp16)[name = string("op_1122_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 1]> inputs_13_cast_fp16 = add(x = inputs_11_cast_fp16, y = var_1122_cast_fp16)[name = string("inputs_13_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 1]> inputs_sq_13_cast_fp16 = mul(x = inputs_13_cast_fp16, y = inputs_13_cast_fp16)[name = string("inputs_sq_13_cast_fp16")];
+            tensor<int32, [1]> variance_13_axes_0 = const()[name = string("variance_13_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_13_keep_dims_0 = const()[name = string("variance_13_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 1, 1, 1]> variance_13_cast_fp16 = reduce_mean(axes = variance_13_axes_0, keep_dims = variance_13_keep_dims_0, x = inputs_sq_13_cast_fp16)[name = string("variance_13_cast_fp16")];
+            fp16 var_1138_to_fp16 = const()[name = string("op_1138_to_fp16"), val = fp16(0x1.5p-17)];
+            tensor<fp16, [1, 1, 1, 1]> var_1139_cast_fp16 = add(x = variance_13_cast_fp16, y = var_1138_to_fp16)[name = string("op_1139_cast_fp16")];
+            fp32 var_1140_epsilon_0 = const()[name = string("op_1140_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 1, 1, 1]> var_1140_cast_fp16 = rsqrt(epsilon = var_1140_epsilon_0, x = var_1139_cast_fp16)[name = string("op_1140_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 1]> hidden_states_13_cast_fp16 = mul(x = inputs_13_cast_fp16, y = var_1140_cast_fp16)[name = string("hidden_states_13_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 1]> w_13_to_fp16 = const()[name = string("w_13_to_fp16"), val = tensor<fp16, [1, 512, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(21834688)))];
+            tensor<fp16, [1, 512, 1, 1]> x_21_cast_fp16 = mul(x = w_13_to_fp16, y = hidden_states_13_cast_fp16)[name = string("x_21_cast_fp16")];
+            string q_25_pad_type_0 = const()[name = string("q_25_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> q_25_strides_0 = const()[name = string("q_25_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> q_25_pad_0 = const()[name = string("q_25_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> q_25_dilations_0 = const()[name = string("q_25_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 q_25_groups_0 = const()[name = string("q_25_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 512, 1, 1]> decoder_pre_transformer_layers_3_self_attn_q_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 512, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(21835776))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(22360128))))[name = string("decoder_pre_transformer_layers_3_self_attn_q_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> q_25_cast_fp16 = conv(dilations = q_25_dilations_0, groups = q_25_groups_0, pad = q_25_pad_0, pad_type = q_25_pad_type_0, strides = q_25_strides_0, weight = decoder_pre_transformer_layers_3_self_attn_q_proj_weight_to_fp16_palettized, x = x_21_cast_fp16)[name = string("q_25_cast_fp16")];
+            string k_13_pad_type_0 = const()[name = string("k_13_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> k_13_strides_0 = const()[name = string("k_13_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> k_13_pad_0 = const()[name = string("k_13_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> k_13_dilations_0 = const()[name = string("k_13_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 k_13_groups_0 = const()[name = string("k_13_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 512, 1, 1]> decoder_pre_transformer_layers_3_self_attn_k_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 512, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(22360704))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(22885056))))[name = string("decoder_pre_transformer_layers_3_self_attn_k_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> k_13_cast_fp16 = conv(dilations = k_13_dilations_0, groups = k_13_groups_0, pad = k_13_pad_0, pad_type = k_13_pad_type_0, strides = k_13_strides_0, weight = decoder_pre_transformer_layers_3_self_attn_k_proj_weight_to_fp16_palettized, x = x_21_cast_fp16)[name = string("k_13_cast_fp16")];
+            string v_7_pad_type_0 = const()[name = string("v_7_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> v_7_strides_0 = const()[name = string("v_7_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> v_7_pad_0 = const()[name = string("v_7_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> v_7_dilations_0 = const()[name = string("v_7_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 v_7_groups_0 = const()[name = string("v_7_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 512, 1, 1]> decoder_pre_transformer_layers_3_self_attn_v_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 512, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(22885632))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(23409984))))[name = string("decoder_pre_transformer_layers_3_self_attn_v_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> v_7_cast_fp16 = conv(dilations = v_7_dilations_0, groups = v_7_groups_0, pad = v_7_pad_0, pad_type = v_7_pad_type_0, strides = v_7_strides_0, weight = decoder_pre_transformer_layers_3_self_attn_v_proj_weight_to_fp16_palettized, x = x_21_cast_fp16)[name = string("v_7_cast_fp16")];
+            tensor<int32, [4]> var_1172 = const()[name = string("op_1172"), val = tensor<int32, [4]>([16, 64, 1, 1])];
+            tensor<fp16, [16, 64, 1, 1]> q_27_cast_fp16 = reshape(shape = var_1172, x = q_25_cast_fp16)[name = string("q_27_cast_fp16")];
+            tensor<int32, [4]> var_1177 = const()[name = string("op_1177"), val = tensor<int32, [4]>([16, 64, 1, 1])];
+            tensor<fp16, [16, 64, 1, 1]> k_15_cast_fp16 = reshape(shape = var_1177, x = k_13_cast_fp16)[name = string("k_15_cast_fp16")];
+            tensor<fp16, [16, 64, 1, 1]> var_1189_cast_fp16 = mul(x = q_27_cast_fp16, y = cos_1_cast_fp16)[name = string("op_1189_cast_fp16")];
+            tensor<int32, [4]> var_1194_begin_0 = const()[name = string("op_1194_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_1194_end_0 = const()[name = string("op_1194_end_0"), val = tensor<int32, [4]>([16, 32, 1, 1])];
+            tensor<bool, [4]> var_1194_end_mask_0 = const()[name = string("op_1194_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [16, 32, 1, 1]> var_1194_cast_fp16 = slice_by_index(begin = var_1194_begin_0, end = var_1194_end_0, end_mask = var_1194_end_mask_0, x = q_27_cast_fp16)[name = string("op_1194_cast_fp16")];
+            tensor<int32, [4]> var_1201_begin_0 = const()[name = string("op_1201_begin_0"), val = tensor<int32, [4]>([0, 32, 0, 0])];
+            tensor<int32, [4]> var_1201_end_0 = const()[name = string("op_1201_end_0"), val = tensor<int32, [4]>([16, 64, 1, 1])];
+            tensor<bool, [4]> var_1201_end_mask_0 = const()[name = string("op_1201_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [16, 32, 1, 1]> var_1201_cast_fp16 = slice_by_index(begin = var_1201_begin_0, end = var_1201_end_0, end_mask = var_1201_end_mask_0, x = q_27_cast_fp16)[name = string("op_1201_cast_fp16")];
+            fp16 const_48_promoted_to_fp16 = const()[name = string("const_48_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [16, 32, 1, 1]> var_1204_cast_fp16 = mul(x = var_1201_cast_fp16, y = const_48_promoted_to_fp16)[name = string("op_1204_cast_fp16")];
+            bool var_1206_interleave_0 = const()[name = string("op_1206_interleave_0"), val = bool(false)];
+            tensor<fp16, [16, 64, 1, 1]> var_1206_cast_fp16 = concat(axis = var_32, interleave = var_1206_interleave_0, values = (var_1204_cast_fp16, var_1194_cast_fp16))[name = string("op_1206_cast_fp16")];
+            tensor<fp16, [16, 64, 1, 1]> var_1207_cast_fp16 = mul(x = var_1206_cast_fp16, y = sin_1_cast_fp16)[name = string("op_1207_cast_fp16")];
+            tensor<fp16, [16, 64, 1, 1]> q_rotated_7_cast_fp16 = add(x = var_1189_cast_fp16, y = var_1207_cast_fp16)[name = string("q_rotated_7_cast_fp16")];
+            tensor<fp16, [16, 64, 1, 1]> var_1209_cast_fp16 = mul(x = k_15_cast_fp16, y = cos_1_cast_fp16)[name = string("op_1209_cast_fp16")];
+            tensor<int32, [4]> var_1214_begin_0 = const()[name = string("op_1214_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_1214_end_0 = const()[name = string("op_1214_end_0"), val = tensor<int32, [4]>([16, 32, 1, 1])];
+            tensor<bool, [4]> var_1214_end_mask_0 = const()[name = string("op_1214_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [16, 32, 1, 1]> var_1214_cast_fp16 = slice_by_index(begin = var_1214_begin_0, end = var_1214_end_0, end_mask = var_1214_end_mask_0, x = k_15_cast_fp16)[name = string("op_1214_cast_fp16")];
+            tensor<int32, [4]> var_1221_begin_0 = const()[name = string("op_1221_begin_0"), val = tensor<int32, [4]>([0, 32, 0, 0])];
+            tensor<int32, [4]> var_1221_end_0 = const()[name = string("op_1221_end_0"), val = tensor<int32, [4]>([16, 64, 1, 1])];
+            tensor<bool, [4]> var_1221_end_mask_0 = const()[name = string("op_1221_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [16, 32, 1, 1]> var_1221_cast_fp16 = slice_by_index(begin = var_1221_begin_0, end = var_1221_end_0, end_mask = var_1221_end_mask_0, x = k_15_cast_fp16)[name = string("op_1221_cast_fp16")];
+            fp16 const_51_promoted_to_fp16 = const()[name = string("const_51_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [16, 32, 1, 1]> var_1224_cast_fp16 = mul(x = var_1221_cast_fp16, y = const_51_promoted_to_fp16)[name = string("op_1224_cast_fp16")];
+            bool var_1226_interleave_0 = const()[name = string("op_1226_interleave_0"), val = bool(false)];
+            tensor<fp16, [16, 64, 1, 1]> var_1226_cast_fp16 = concat(axis = var_32, interleave = var_1226_interleave_0, values = (var_1224_cast_fp16, var_1214_cast_fp16))[name = string("op_1226_cast_fp16")];
+            tensor<fp16, [16, 64, 1, 1]> var_1227_cast_fp16 = mul(x = var_1226_cast_fp16, y = sin_1_cast_fp16)[name = string("op_1227_cast_fp16")];
+            tensor<fp16, [16, 64, 1, 1]> k_rotated_7_cast_fp16 = add(x = var_1209_cast_fp16, y = var_1227_cast_fp16)[name = string("k_rotated_7_cast_fp16")];
+            tensor<int32, [4]> var_1231 = const()[name = string("op_1231"), val = tensor<int32, [4]>([1, 1024, 1, 1])];
+            tensor<fp16, [1, 1024, 1, 1]> current_key_7_cast_fp16 = reshape(shape = var_1231, x = k_rotated_7_cast_fp16)[name = string("current_key_7_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_1236_cast_fp16 = mul(x = var_522_cast_fp16_3, y = var_647_cast_fp16)[name = string("op_1236_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_1237_cast_fp16 = mul(x = current_key_7_cast_fp16, y = update_mask_1_cast_fp16)[name = string("op_1237_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> key_cache_updated_7_cast_fp16 = add(x = var_1236_cast_fp16, y = var_1237_cast_fp16)[name = string("key_cache_updated_7_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_1240_cast_fp16 = mul(x = var_531_cast_fp16_3, y = var_647_cast_fp16)[name = string("op_1240_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_1241_cast_fp16 = mul(x = v_7_cast_fp16, y = update_mask_1_cast_fp16)[name = string("op_1241_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> value_cache_updated_7_cast_fp16 = add(x = var_1240_cast_fp16, y = var_1241_cast_fp16)[name = string("value_cache_updated_7_cast_fp16")];
+            tensor<int32, [4]> var_1243 = const()[name = string("op_1243"), val = tensor<int32, [4]>([1, 16, 64, 1])];
+            tensor<fp16, [1, 16, 64, 1]> q_31_cast_fp16 = reshape(shape = var_1243, x = q_rotated_7_cast_fp16)[name = string("q_31_cast_fp16")];
+            tensor<int32, [4]> var_1246 = const()[name = string("op_1246"), val = tensor<int32, [4]>([1, 16, 64, 256])];
+            tensor<fp16, [1, 16, 64, 256]> k_for_attn_7_cast_fp16 = reshape(shape = var_1246, x = key_cache_updated_7_cast_fp16)[name = string("k_for_attn_7_cast_fp16")];
+            tensor<int32, [4]> var_1248 = const()[name = string("op_1248"), val = tensor<int32, [4]>([1, 16, 64, 256])];
+            tensor<fp16, [1, 16, 64, 256]> v_for_attn_7_cast_fp16 = reshape(shape = var_1248, x = value_cache_updated_7_cast_fp16)[name = string("v_for_attn_7_cast_fp16")];
+            bool var_1252_transpose_x_1 = const()[name = string("op_1252_transpose_x_1"), val = bool(true)];
+            bool var_1252_transpose_y_1 = const()[name = string("op_1252_transpose_y_1"), val = bool(false)];
+            tensor<fp16, [1, 16, 1, 256]> var_1252_cast_fp16 = matmul(transpose_x = var_1252_transpose_x_1, transpose_y = var_1252_transpose_y_1, x = q_31_cast_fp16, y = k_for_attn_7_cast_fp16)[name = string("op_1252_cast_fp16")];
+            fp16 var_1253_to_fp16 = const()[name = string("op_1253_to_fp16"), val = fp16(0x1p-3)];
+            tensor<fp16, [1, 16, 1, 256]> attn_weights_13_cast_fp16 = mul(x = var_1252_cast_fp16, y = var_1253_to_fp16)[name = string("attn_weights_13_cast_fp16")];
+            tensor<fp16, [1, 16, 1, 256]> input_71_cast_fp16 = add(x = attn_weights_13_cast_fp16, y = attn_mask_1_cast_fp16)[name = string("input_71_cast_fp16")];
+            tensor<fp16, [1, 16, 1, 256]> attn_weights_15_cast_fp16 = softmax(axis = var_28, x = input_71_cast_fp16)[name = string("attn_weights_15_cast_fp16")];
+            bool attn_output_13_transpose_x_1 = const()[name = string("attn_output_13_transpose_x_1"), val = bool(false)];
+            bool attn_output_13_transpose_y_1 = const()[name = string("attn_output_13_transpose_y_1"), val = bool(true)];
+            tensor<fp16, [1, 16, 1, 64]> attn_output_13_cast_fp16 = matmul(transpose_x = attn_output_13_transpose_x_1, transpose_y = attn_output_13_transpose_y_1, x = attn_weights_15_cast_fp16, y = v_for_attn_7_cast_fp16)[name = string("attn_output_13_cast_fp16")];
+            tensor<int32, [4]> var_1262 = const()[name = string("op_1262"), val = tensor<int32, [4]>([0, 1, 3, 2])];
+            tensor<int32, [4]> var_1264 = const()[name = string("op_1264"), val = tensor<int32, [4]>([1, -1, 1, 1])];
+            tensor<fp16, [1, 16, 64, 1]> attn_output_15_cast_fp16 = transpose(perm = var_1262, x = attn_output_13_cast_fp16)[name = string("transpose_8")];
+            tensor<fp16, [1, 1024, 1, 1]> input_73_cast_fp16 = reshape(shape = var_1264, x = attn_output_15_cast_fp16)[name = string("input_73_cast_fp16")];
+            string x_23_pad_type_0 = const()[name = string("x_23_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> x_23_strides_0 = const()[name = string("x_23_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> x_23_pad_0 = const()[name = string("x_23_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> x_23_dilations_0 = const()[name = string("x_23_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 x_23_groups_0 = const()[name = string("x_23_groups_0"), val = int32(1)];
+            tensor<fp16, [512, 1024, 1, 1]> op_1279_weight_0_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [512, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(23410560))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(23934912))))[name = string("op_1279_weight_0_to_fp16_palettized")];
+            tensor<fp16, [512]> var_1279_bias_0_to_fp16 = const()[name = string("op_1279_bias_0_to_fp16"), val = tensor<fp16, [512]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(23935488)))];
+            tensor<fp16, [1, 512, 1, 1]> var_1279_cast_fp16 = conv(bias = var_1279_bias_0_to_fp16, dilations = x_23_dilations_0, groups = x_23_groups_0, pad = x_23_pad_0, pad_type = x_23_pad_type_0, strides = x_23_strides_0, weight = op_1279_weight_0_to_fp16_palettized, x = input_73_cast_fp16)[name = string("op_1279_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 1]> inputs_15_cast_fp16 = add(x = inputs_13_cast_fp16, y = var_1279_cast_fp16)[name = string("inputs_15_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 1]> inputs_sq_15_cast_fp16 = mul(x = inputs_15_cast_fp16, y = inputs_15_cast_fp16)[name = string("inputs_sq_15_cast_fp16")];
+            tensor<int32, [1]> variance_15_axes_0 = const()[name = string("variance_15_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_15_keep_dims_0 = const()[name = string("variance_15_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 1, 1, 1]> variance_15_cast_fp16 = reduce_mean(axes = variance_15_axes_0, keep_dims = variance_15_keep_dims_0, x = inputs_sq_15_cast_fp16)[name = string("variance_15_cast_fp16")];
+            fp16 var_1285_to_fp16 = const()[name = string("op_1285_to_fp16"), val = fp16(0x1.5p-17)];
+            tensor<fp16, [1, 1, 1, 1]> var_1286_cast_fp16 = add(x = variance_15_cast_fp16, y = var_1285_to_fp16)[name = string("op_1286_cast_fp16")];
+            fp32 var_1287_epsilon_0 = const()[name = string("op_1287_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 1, 1, 1]> var_1287_cast_fp16 = rsqrt(epsilon = var_1287_epsilon_0, x = var_1286_cast_fp16)[name = string("op_1287_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 1]> hidden_states_15_cast_fp16 = mul(x = inputs_15_cast_fp16, y = var_1287_cast_fp16)[name = string("hidden_states_15_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 1]> w_15_to_fp16 = const()[name = string("w_15_to_fp16"), val = tensor<fp16, [1, 512, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(23936576)))];
+            tensor<fp16, [1, 512, 1, 1]> input_75_cast_fp16 = mul(x = w_15_to_fp16, y = hidden_states_15_cast_fp16)[name = string("input_75_cast_fp16")];
+            string input_77_pad_type_0 = const()[name = string("input_77_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> input_77_strides_0 = const()[name = string("input_77_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> input_77_pad_0 = const()[name = string("input_77_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> input_77_dilations_0 = const()[name = string("input_77_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 input_77_groups_0 = const()[name = string("input_77_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 512, 1, 1]> decoder_pre_transformer_layers_3_mlp_gate_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 512, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(23937664))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(24462016))))[name = string("decoder_pre_transformer_layers_3_mlp_gate_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> input_77_cast_fp16 = conv(dilations = input_77_dilations_0, groups = input_77_groups_0, pad = input_77_pad_0, pad_type = input_77_pad_type_0, strides = input_77_strides_0, weight = decoder_pre_transformer_layers_3_mlp_gate_proj_weight_to_fp16_palettized, x = input_75_cast_fp16)[name = string("input_77_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> var_1301_cast_fp16 = silu(x = input_77_cast_fp16)[name = string("op_1301_cast_fp16")];
+            string var_1307_pad_type_0 = const()[name = string("op_1307_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_1307_strides_0 = const()[name = string("op_1307_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_1307_pad_0 = const()[name = string("op_1307_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_1307_dilations_0 = const()[name = string("op_1307_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_1307_groups_0 = const()[name = string("op_1307_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 512, 1, 1]> decoder_pre_transformer_layers_3_mlp_up_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 512, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(24462592))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(24986944))))[name = string("decoder_pre_transformer_layers_3_mlp_up_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> var_1307_cast_fp16 = conv(dilations = var_1307_dilations_0, groups = var_1307_groups_0, pad = var_1307_pad_0, pad_type = var_1307_pad_type_0, strides = var_1307_strides_0, weight = decoder_pre_transformer_layers_3_mlp_up_proj_weight_to_fp16_palettized, x = input_75_cast_fp16)[name = string("op_1307_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> input_79_cast_fp16 = mul(x = var_1301_cast_fp16, y = var_1307_cast_fp16)[name = string("input_79_cast_fp16")];
+            string x_25_pad_type_0 = const()[name = string("x_25_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> x_25_strides_0 = const()[name = string("x_25_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> x_25_pad_0 = const()[name = string("x_25_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> x_25_dilations_0 = const()[name = string("x_25_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 x_25_groups_0 = const()[name = string("x_25_groups_0"), val = int32(1)];
+            tensor<fp16, [512, 1024, 1, 1]> op_1318_weight_0_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [512, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(24987520))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(25511872))))[name = string("op_1318_weight_0_to_fp16_palettized")];
+            tensor<fp16, [512]> var_1318_bias_0_to_fp16 = const()[name = string("op_1318_bias_0_to_fp16"), val = tensor<fp16, [512]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(25512448)))];
+            tensor<fp16, [1, 512, 1, 1]> var_1318_cast_fp16 = conv(bias = var_1318_bias_0_to_fp16, dilations = x_25_dilations_0, groups = x_25_groups_0, pad = x_25_pad_0, pad_type = x_25_pad_type_0, strides = x_25_strides_0, weight = op_1318_weight_0_to_fp16_palettized, x = input_79_cast_fp16)[name = string("op_1318_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 1]> inputs_17_cast_fp16 = add(x = inputs_15_cast_fp16, y = var_1318_cast_fp16)[name = string("inputs_17_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 1]> inputs_sq_17_cast_fp16 = mul(x = inputs_17_cast_fp16, y = inputs_17_cast_fp16)[name = string("inputs_sq_17_cast_fp16")];
+            tensor<int32, [1]> variance_17_axes_0 = const()[name = string("variance_17_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_17_keep_dims_0 = const()[name = string("variance_17_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 1, 1, 1]> variance_17_cast_fp16 = reduce_mean(axes = variance_17_axes_0, keep_dims = variance_17_keep_dims_0, x = inputs_sq_17_cast_fp16)[name = string("variance_17_cast_fp16")];
+            fp16 var_1334_to_fp16 = const()[name = string("op_1334_to_fp16"), val = fp16(0x1.5p-17)];
+            tensor<fp16, [1, 1, 1, 1]> var_1335_cast_fp16 = add(x = variance_17_cast_fp16, y = var_1334_to_fp16)[name = string("op_1335_cast_fp16")];
+            fp32 var_1336_epsilon_0 = const()[name = string("op_1336_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 1, 1, 1]> var_1336_cast_fp16 = rsqrt(epsilon = var_1336_epsilon_0, x = var_1335_cast_fp16)[name = string("op_1336_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 1]> hidden_states_17_cast_fp16 = mul(x = inputs_17_cast_fp16, y = var_1336_cast_fp16)[name = string("hidden_states_17_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 1]> w_17_to_fp16 = const()[name = string("w_17_to_fp16"), val = tensor<fp16, [1, 512, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(25513536)))];
+            tensor<fp16, [1, 512, 1, 1]> x_27_cast_fp16 = mul(x = w_17_to_fp16, y = hidden_states_17_cast_fp16)[name = string("x_27_cast_fp16")];
+            string q_33_pad_type_0 = const()[name = string("q_33_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> q_33_strides_0 = const()[name = string("q_33_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> q_33_pad_0 = const()[name = string("q_33_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> q_33_dilations_0 = const()[name = string("q_33_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 q_33_groups_0 = const()[name = string("q_33_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 512, 1, 1]> decoder_pre_transformer_layers_4_self_attn_q_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 512, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(25514624))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(26038976))))[name = string("decoder_pre_transformer_layers_4_self_attn_q_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> q_33_cast_fp16 = conv(dilations = q_33_dilations_0, groups = q_33_groups_0, pad = q_33_pad_0, pad_type = q_33_pad_type_0, strides = q_33_strides_0, weight = decoder_pre_transformer_layers_4_self_attn_q_proj_weight_to_fp16_palettized, x = x_27_cast_fp16)[name = string("q_33_cast_fp16")];
+            string k_17_pad_type_0 = const()[name = string("k_17_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> k_17_strides_0 = const()[name = string("k_17_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> k_17_pad_0 = const()[name = string("k_17_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> k_17_dilations_0 = const()[name = string("k_17_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 k_17_groups_0 = const()[name = string("k_17_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 512, 1, 1]> decoder_pre_transformer_layers_4_self_attn_k_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 512, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(26039552))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(26563904))))[name = string("decoder_pre_transformer_layers_4_self_attn_k_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> k_17_cast_fp16 = conv(dilations = k_17_dilations_0, groups = k_17_groups_0, pad = k_17_pad_0, pad_type = k_17_pad_type_0, strides = k_17_strides_0, weight = decoder_pre_transformer_layers_4_self_attn_k_proj_weight_to_fp16_palettized, x = x_27_cast_fp16)[name = string("k_17_cast_fp16")];
+            string v_9_pad_type_0 = const()[name = string("v_9_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> v_9_strides_0 = const()[name = string("v_9_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> v_9_pad_0 = const()[name = string("v_9_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> v_9_dilations_0 = const()[name = string("v_9_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 v_9_groups_0 = const()[name = string("v_9_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 512, 1, 1]> decoder_pre_transformer_layers_4_self_attn_v_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 512, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(26564480))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(27088832))))[name = string("decoder_pre_transformer_layers_4_self_attn_v_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> v_9_cast_fp16 = conv(dilations = v_9_dilations_0, groups = v_9_groups_0, pad = v_9_pad_0, pad_type = v_9_pad_type_0, strides = v_9_strides_0, weight = decoder_pre_transformer_layers_4_self_attn_v_proj_weight_to_fp16_palettized, x = x_27_cast_fp16)[name = string("v_9_cast_fp16")];
+            tensor<int32, [4]> var_1368 = const()[name = string("op_1368"), val = tensor<int32, [4]>([16, 64, 1, 1])];
+            tensor<fp16, [16, 64, 1, 1]> q_35_cast_fp16 = reshape(shape = var_1368, x = q_33_cast_fp16)[name = string("q_35_cast_fp16")];
+            tensor<int32, [4]> var_1373 = const()[name = string("op_1373"), val = tensor<int32, [4]>([16, 64, 1, 1])];
+            tensor<fp16, [16, 64, 1, 1]> k_19_cast_fp16 = reshape(shape = var_1373, x = k_17_cast_fp16)[name = string("k_19_cast_fp16")];
+            tensor<fp16, [16, 64, 1, 1]> var_1385_cast_fp16 = mul(x = q_35_cast_fp16, y = cos_1_cast_fp16)[name = string("op_1385_cast_fp16")];
+            tensor<int32, [4]> var_1390_begin_0 = const()[name = string("op_1390_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_1390_end_0 = const()[name = string("op_1390_end_0"), val = tensor<int32, [4]>([16, 32, 1, 1])];
+            tensor<bool, [4]> var_1390_end_mask_0 = const()[name = string("op_1390_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [16, 32, 1, 1]> var_1390_cast_fp16 = slice_by_index(begin = var_1390_begin_0, end = var_1390_end_0, end_mask = var_1390_end_mask_0, x = q_35_cast_fp16)[name = string("op_1390_cast_fp16")];
+            tensor<int32, [4]> var_1397_begin_0 = const()[name = string("op_1397_begin_0"), val = tensor<int32, [4]>([0, 32, 0, 0])];
+            tensor<int32, [4]> var_1397_end_0 = const()[name = string("op_1397_end_0"), val = tensor<int32, [4]>([16, 64, 1, 1])];
+            tensor<bool, [4]> var_1397_end_mask_0 = const()[name = string("op_1397_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [16, 32, 1, 1]> var_1397_cast_fp16 = slice_by_index(begin = var_1397_begin_0, end = var_1397_end_0, end_mask = var_1397_end_mask_0, x = q_35_cast_fp16)[name = string("op_1397_cast_fp16")];
+            fp16 const_56_promoted_to_fp16 = const()[name = string("const_56_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [16, 32, 1, 1]> var_1400_cast_fp16 = mul(x = var_1397_cast_fp16, y = const_56_promoted_to_fp16)[name = string("op_1400_cast_fp16")];
+            bool var_1402_interleave_0 = const()[name = string("op_1402_interleave_0"), val = bool(false)];
+            tensor<fp16, [16, 64, 1, 1]> var_1402_cast_fp16 = concat(axis = var_32, interleave = var_1402_interleave_0, values = (var_1400_cast_fp16, var_1390_cast_fp16))[name = string("op_1402_cast_fp16")];
+            tensor<fp16, [16, 64, 1, 1]> var_1403_cast_fp16 = mul(x = var_1402_cast_fp16, y = sin_1_cast_fp16)[name = string("op_1403_cast_fp16")];
+            tensor<fp16, [16, 64, 1, 1]> q_rotated_9_cast_fp16 = add(x = var_1385_cast_fp16, y = var_1403_cast_fp16)[name = string("q_rotated_9_cast_fp16")];
+            tensor<fp16, [16, 64, 1, 1]> var_1405_cast_fp16 = mul(x = k_19_cast_fp16, y = cos_1_cast_fp16)[name = string("op_1405_cast_fp16")];
+            tensor<int32, [4]> var_1410_begin_0 = const()[name = string("op_1410_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_1410_end_0 = const()[name = string("op_1410_end_0"), val = tensor<int32, [4]>([16, 32, 1, 1])];
+            tensor<bool, [4]> var_1410_end_mask_0 = const()[name = string("op_1410_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [16, 32, 1, 1]> var_1410_cast_fp16 = slice_by_index(begin = var_1410_begin_0, end = var_1410_end_0, end_mask = var_1410_end_mask_0, x = k_19_cast_fp16)[name = string("op_1410_cast_fp16")];
+            tensor<int32, [4]> var_1417_begin_0 = const()[name = string("op_1417_begin_0"), val = tensor<int32, [4]>([0, 32, 0, 0])];
+            tensor<int32, [4]> var_1417_end_0 = const()[name = string("op_1417_end_0"), val = tensor<int32, [4]>([16, 64, 1, 1])];
+            tensor<bool, [4]> var_1417_end_mask_0 = const()[name = string("op_1417_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [16, 32, 1, 1]> var_1417_cast_fp16 = slice_by_index(begin = var_1417_begin_0, end = var_1417_end_0, end_mask = var_1417_end_mask_0, x = k_19_cast_fp16)[name = string("op_1417_cast_fp16")];
+            fp16 const_59_promoted_to_fp16 = const()[name = string("const_59_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [16, 32, 1, 1]> var_1420_cast_fp16 = mul(x = var_1417_cast_fp16, y = const_59_promoted_to_fp16)[name = string("op_1420_cast_fp16")];
+            bool var_1422_interleave_0 = const()[name = string("op_1422_interleave_0"), val = bool(false)];
+            tensor<fp16, [16, 64, 1, 1]> var_1422_cast_fp16 = concat(axis = var_32, interleave = var_1422_interleave_0, values = (var_1420_cast_fp16, var_1410_cast_fp16))[name = string("op_1422_cast_fp16")];
+            tensor<fp16, [16, 64, 1, 1]> var_1423_cast_fp16 = mul(x = var_1422_cast_fp16, y = sin_1_cast_fp16)[name = string("op_1423_cast_fp16")];
+            tensor<fp16, [16, 64, 1, 1]> k_rotated_9_cast_fp16 = add(x = var_1405_cast_fp16, y = var_1423_cast_fp16)[name = string("k_rotated_9_cast_fp16")];
+            tensor<int32, [4]> var_1427 = const()[name = string("op_1427"), val = tensor<int32, [4]>([1, 1024, 1, 1])];
+            tensor<fp16, [1, 1024, 1, 1]> current_key_9_cast_fp16 = reshape(shape = var_1427, x = k_rotated_9_cast_fp16)[name = string("current_key_9_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_1432_cast_fp16 = mul(x = var_522_cast_fp16_4, y = var_647_cast_fp16)[name = string("op_1432_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_1433_cast_fp16 = mul(x = current_key_9_cast_fp16, y = update_mask_1_cast_fp16)[name = string("op_1433_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> key_cache_updated_9_cast_fp16 = add(x = var_1432_cast_fp16, y = var_1433_cast_fp16)[name = string("key_cache_updated_9_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_1436_cast_fp16 = mul(x = var_531_cast_fp16_4, y = var_647_cast_fp16)[name = string("op_1436_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_1437_cast_fp16 = mul(x = v_9_cast_fp16, y = update_mask_1_cast_fp16)[name = string("op_1437_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> value_cache_updated_9_cast_fp16 = add(x = var_1436_cast_fp16, y = var_1437_cast_fp16)[name = string("value_cache_updated_9_cast_fp16")];
+            tensor<int32, [4]> var_1439 = const()[name = string("op_1439"), val = tensor<int32, [4]>([1, 16, 64, 1])];
+            tensor<fp16, [1, 16, 64, 1]> q_39_cast_fp16 = reshape(shape = var_1439, x = q_rotated_9_cast_fp16)[name = string("q_39_cast_fp16")];
+            tensor<int32, [4]> var_1442 = const()[name = string("op_1442"), val = tensor<int32, [4]>([1, 16, 64, 256])];
+            tensor<fp16, [1, 16, 64, 256]> k_for_attn_9_cast_fp16 = reshape(shape = var_1442, x = key_cache_updated_9_cast_fp16)[name = string("k_for_attn_9_cast_fp16")];
+            tensor<int32, [4]> var_1444 = const()[name = string("op_1444"), val = tensor<int32, [4]>([1, 16, 64, 256])];
+            tensor<fp16, [1, 16, 64, 256]> v_for_attn_9_cast_fp16 = reshape(shape = var_1444, x = value_cache_updated_9_cast_fp16)[name = string("v_for_attn_9_cast_fp16")];
+            bool var_1448_transpose_x_1 = const()[name = string("op_1448_transpose_x_1"), val = bool(true)];
+            bool var_1448_transpose_y_1 = const()[name = string("op_1448_transpose_y_1"), val = bool(false)];
+            tensor<fp16, [1, 16, 1, 256]> var_1448_cast_fp16 = matmul(transpose_x = var_1448_transpose_x_1, transpose_y = var_1448_transpose_y_1, x = q_39_cast_fp16, y = k_for_attn_9_cast_fp16)[name = string("op_1448_cast_fp16")];
+            fp16 var_1449_to_fp16 = const()[name = string("op_1449_to_fp16"), val = fp16(0x1p-3)];
+            tensor<fp16, [1, 16, 1, 256]> attn_weights_17_cast_fp16 = mul(x = var_1448_cast_fp16, y = var_1449_to_fp16)[name = string("attn_weights_17_cast_fp16")];
+            tensor<fp16, [1, 16, 1, 256]> input_81_cast_fp16 = add(x = attn_weights_17_cast_fp16, y = attn_mask_1_cast_fp16)[name = string("input_81_cast_fp16")];
+            tensor<fp16, [1, 16, 1, 256]> attn_weights_19_cast_fp16 = softmax(axis = var_28, x = input_81_cast_fp16)[name = string("attn_weights_19_cast_fp16")];
+            bool attn_output_17_transpose_x_1 = const()[name = string("attn_output_17_transpose_x_1"), val = bool(false)];
+            bool attn_output_17_transpose_y_1 = const()[name = string("attn_output_17_transpose_y_1"), val = bool(true)];
+            tensor<fp16, [1, 16, 1, 64]> attn_output_17_cast_fp16 = matmul(transpose_x = attn_output_17_transpose_x_1, transpose_y = attn_output_17_transpose_y_1, x = attn_weights_19_cast_fp16, y = v_for_attn_9_cast_fp16)[name = string("attn_output_17_cast_fp16")];
+            tensor<int32, [4]> var_1458 = const()[name = string("op_1458"), val = tensor<int32, [4]>([0, 1, 3, 2])];
+            tensor<int32, [4]> var_1460 = const()[name = string("op_1460"), val = tensor<int32, [4]>([1, -1, 1, 1])];
+            tensor<fp16, [1, 16, 64, 1]> attn_output_19_cast_fp16 = transpose(perm = var_1458, x = attn_output_17_cast_fp16)[name = string("transpose_7")];
+            tensor<fp16, [1, 1024, 1, 1]> input_83_cast_fp16 = reshape(shape = var_1460, x = attn_output_19_cast_fp16)[name = string("input_83_cast_fp16")];
+            string x_29_pad_type_0 = const()[name = string("x_29_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> x_29_strides_0 = const()[name = string("x_29_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> x_29_pad_0 = const()[name = string("x_29_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> x_29_dilations_0 = const()[name = string("x_29_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 x_29_groups_0 = const()[name = string("x_29_groups_0"), val = int32(1)];
+            tensor<fp16, [512, 1024, 1, 1]> op_1475_weight_0_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [512, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(27089408))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(27613760))))[name = string("op_1475_weight_0_to_fp16_palettized")];
+            tensor<fp16, [512]> var_1475_bias_0_to_fp16 = const()[name = string("op_1475_bias_0_to_fp16"), val = tensor<fp16, [512]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(27614336)))];
+            tensor<fp16, [1, 512, 1, 1]> var_1475_cast_fp16 = conv(bias = var_1475_bias_0_to_fp16, dilations = x_29_dilations_0, groups = x_29_groups_0, pad = x_29_pad_0, pad_type = x_29_pad_type_0, strides = x_29_strides_0, weight = op_1475_weight_0_to_fp16_palettized, x = input_83_cast_fp16)[name = string("op_1475_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 1]> inputs_19_cast_fp16 = add(x = inputs_17_cast_fp16, y = var_1475_cast_fp16)[name = string("inputs_19_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 1]> inputs_sq_19_cast_fp16 = mul(x = inputs_19_cast_fp16, y = inputs_19_cast_fp16)[name = string("inputs_sq_19_cast_fp16")];
+            tensor<int32, [1]> variance_19_axes_0 = const()[name = string("variance_19_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_19_keep_dims_0 = const()[name = string("variance_19_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 1, 1, 1]> variance_19_cast_fp16 = reduce_mean(axes = variance_19_axes_0, keep_dims = variance_19_keep_dims_0, x = inputs_sq_19_cast_fp16)[name = string("variance_19_cast_fp16")];
+            fp16 var_1481_to_fp16 = const()[name = string("op_1481_to_fp16"), val = fp16(0x1.5p-17)];
+            tensor<fp16, [1, 1, 1, 1]> var_1482_cast_fp16 = add(x = variance_19_cast_fp16, y = var_1481_to_fp16)[name = string("op_1482_cast_fp16")];
+            fp32 var_1483_epsilon_0 = const()[name = string("op_1483_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 1, 1, 1]> var_1483_cast_fp16 = rsqrt(epsilon = var_1483_epsilon_0, x = var_1482_cast_fp16)[name = string("op_1483_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 1]> hidden_states_19_cast_fp16 = mul(x = inputs_19_cast_fp16, y = var_1483_cast_fp16)[name = string("hidden_states_19_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 1]> w_19_to_fp16 = const()[name = string("w_19_to_fp16"), val = tensor<fp16, [1, 512, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(27615424)))];
+            tensor<fp16, [1, 512, 1, 1]> input_85_cast_fp16 = mul(x = w_19_to_fp16, y = hidden_states_19_cast_fp16)[name = string("input_85_cast_fp16")];
+            string input_87_pad_type_0 = const()[name = string("input_87_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> input_87_strides_0 = const()[name = string("input_87_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> input_87_pad_0 = const()[name = string("input_87_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> input_87_dilations_0 = const()[name = string("input_87_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 input_87_groups_0 = const()[name = string("input_87_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 512, 1, 1]> decoder_pre_transformer_layers_4_mlp_gate_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 512, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(27616512))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(28140864))))[name = string("decoder_pre_transformer_layers_4_mlp_gate_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> input_87_cast_fp16 = conv(dilations = input_87_dilations_0, groups = input_87_groups_0, pad = input_87_pad_0, pad_type = input_87_pad_type_0, strides = input_87_strides_0, weight = decoder_pre_transformer_layers_4_mlp_gate_proj_weight_to_fp16_palettized, x = input_85_cast_fp16)[name = string("input_87_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> var_1497_cast_fp16 = silu(x = input_87_cast_fp16)[name = string("op_1497_cast_fp16")];
+            string var_1503_pad_type_0 = const()[name = string("op_1503_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_1503_strides_0 = const()[name = string("op_1503_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_1503_pad_0 = const()[name = string("op_1503_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_1503_dilations_0 = const()[name = string("op_1503_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_1503_groups_0 = const()[name = string("op_1503_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 512, 1, 1]> decoder_pre_transformer_layers_4_mlp_up_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 512, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(28141440))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(28665792))))[name = string("decoder_pre_transformer_layers_4_mlp_up_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> var_1503_cast_fp16 = conv(dilations = var_1503_dilations_0, groups = var_1503_groups_0, pad = var_1503_pad_0, pad_type = var_1503_pad_type_0, strides = var_1503_strides_0, weight = decoder_pre_transformer_layers_4_mlp_up_proj_weight_to_fp16_palettized, x = input_85_cast_fp16)[name = string("op_1503_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> input_89_cast_fp16 = mul(x = var_1497_cast_fp16, y = var_1503_cast_fp16)[name = string("input_89_cast_fp16")];
+            string x_31_pad_type_0 = const()[name = string("x_31_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> x_31_strides_0 = const()[name = string("x_31_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> x_31_pad_0 = const()[name = string("x_31_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> x_31_dilations_0 = const()[name = string("x_31_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 x_31_groups_0 = const()[name = string("x_31_groups_0"), val = int32(1)];
+            tensor<fp16, [512, 1024, 1, 1]> op_1514_weight_0_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [512, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(28666368))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(29190720))))[name = string("op_1514_weight_0_to_fp16_palettized")];
+            tensor<fp16, [512]> var_1514_bias_0_to_fp16 = const()[name = string("op_1514_bias_0_to_fp16"), val = tensor<fp16, [512]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(29191296)))];
+            tensor<fp16, [1, 512, 1, 1]> var_1514_cast_fp16 = conv(bias = var_1514_bias_0_to_fp16, dilations = x_31_dilations_0, groups = x_31_groups_0, pad = x_31_pad_0, pad_type = x_31_pad_type_0, strides = x_31_strides_0, weight = op_1514_weight_0_to_fp16_palettized, x = input_89_cast_fp16)[name = string("op_1514_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 1]> inputs_21_cast_fp16 = add(x = inputs_19_cast_fp16, y = var_1514_cast_fp16)[name = string("inputs_21_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 1]> inputs_sq_21_cast_fp16 = mul(x = inputs_21_cast_fp16, y = inputs_21_cast_fp16)[name = string("inputs_sq_21_cast_fp16")];
+            tensor<int32, [1]> variance_21_axes_0 = const()[name = string("variance_21_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_21_keep_dims_0 = const()[name = string("variance_21_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 1, 1, 1]> variance_21_cast_fp16 = reduce_mean(axes = variance_21_axes_0, keep_dims = variance_21_keep_dims_0, x = inputs_sq_21_cast_fp16)[name = string("variance_21_cast_fp16")];
+            fp16 var_1530_to_fp16 = const()[name = string("op_1530_to_fp16"), val = fp16(0x1.5p-17)];
+            tensor<fp16, [1, 1, 1, 1]> var_1531_cast_fp16 = add(x = variance_21_cast_fp16, y = var_1530_to_fp16)[name = string("op_1531_cast_fp16")];
+            fp32 var_1532_epsilon_0 = const()[name = string("op_1532_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 1, 1, 1]> var_1532_cast_fp16 = rsqrt(epsilon = var_1532_epsilon_0, x = var_1531_cast_fp16)[name = string("op_1532_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 1]> hidden_states_21_cast_fp16 = mul(x = inputs_21_cast_fp16, y = var_1532_cast_fp16)[name = string("hidden_states_21_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 1]> w_21_to_fp16 = const()[name = string("w_21_to_fp16"), val = tensor<fp16, [1, 512, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(29192384)))];
+            tensor<fp16, [1, 512, 1, 1]> x_33_cast_fp16 = mul(x = w_21_to_fp16, y = hidden_states_21_cast_fp16)[name = string("x_33_cast_fp16")];
+            string q_41_pad_type_0 = const()[name = string("q_41_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> q_41_strides_0 = const()[name = string("q_41_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> q_41_pad_0 = const()[name = string("q_41_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> q_41_dilations_0 = const()[name = string("q_41_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 q_41_groups_0 = const()[name = string("q_41_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 512, 1, 1]> decoder_pre_transformer_layers_5_self_attn_q_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 512, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(29193472))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(29717824))))[name = string("decoder_pre_transformer_layers_5_self_attn_q_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> q_41_cast_fp16 = conv(dilations = q_41_dilations_0, groups = q_41_groups_0, pad = q_41_pad_0, pad_type = q_41_pad_type_0, strides = q_41_strides_0, weight = decoder_pre_transformer_layers_5_self_attn_q_proj_weight_to_fp16_palettized, x = x_33_cast_fp16)[name = string("q_41_cast_fp16")];
+            string k_21_pad_type_0 = const()[name = string("k_21_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> k_21_strides_0 = const()[name = string("k_21_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> k_21_pad_0 = const()[name = string("k_21_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> k_21_dilations_0 = const()[name = string("k_21_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 k_21_groups_0 = const()[name = string("k_21_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 512, 1, 1]> decoder_pre_transformer_layers_5_self_attn_k_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 512, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(29718400))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(30242752))))[name = string("decoder_pre_transformer_layers_5_self_attn_k_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> k_21_cast_fp16 = conv(dilations = k_21_dilations_0, groups = k_21_groups_0, pad = k_21_pad_0, pad_type = k_21_pad_type_0, strides = k_21_strides_0, weight = decoder_pre_transformer_layers_5_self_attn_k_proj_weight_to_fp16_palettized, x = x_33_cast_fp16)[name = string("k_21_cast_fp16")];
+            string v_11_pad_type_0 = const()[name = string("v_11_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> v_11_strides_0 = const()[name = string("v_11_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> v_11_pad_0 = const()[name = string("v_11_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> v_11_dilations_0 = const()[name = string("v_11_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 v_11_groups_0 = const()[name = string("v_11_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 512, 1, 1]> decoder_pre_transformer_layers_5_self_attn_v_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 512, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(30243328))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(30767680))))[name = string("decoder_pre_transformer_layers_5_self_attn_v_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> v_11_cast_fp16 = conv(dilations = v_11_dilations_0, groups = v_11_groups_0, pad = v_11_pad_0, pad_type = v_11_pad_type_0, strides = v_11_strides_0, weight = decoder_pre_transformer_layers_5_self_attn_v_proj_weight_to_fp16_palettized, x = x_33_cast_fp16)[name = string("v_11_cast_fp16")];
+            tensor<int32, [4]> var_1564 = const()[name = string("op_1564"), val = tensor<int32, [4]>([16, 64, 1, 1])];
+            tensor<fp16, [16, 64, 1, 1]> q_43_cast_fp16 = reshape(shape = var_1564, x = q_41_cast_fp16)[name = string("q_43_cast_fp16")];
+            tensor<int32, [4]> var_1569 = const()[name = string("op_1569"), val = tensor<int32, [4]>([16, 64, 1, 1])];
+            tensor<fp16, [16, 64, 1, 1]> k_23_cast_fp16 = reshape(shape = var_1569, x = k_21_cast_fp16)[name = string("k_23_cast_fp16")];
+            tensor<fp16, [16, 64, 1, 1]> var_1581_cast_fp16 = mul(x = q_43_cast_fp16, y = cos_1_cast_fp16)[name = string("op_1581_cast_fp16")];
+            tensor<int32, [4]> var_1586_begin_0 = const()[name = string("op_1586_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_1586_end_0 = const()[name = string("op_1586_end_0"), val = tensor<int32, [4]>([16, 32, 1, 1])];
+            tensor<bool, [4]> var_1586_end_mask_0 = const()[name = string("op_1586_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [16, 32, 1, 1]> var_1586_cast_fp16 = slice_by_index(begin = var_1586_begin_0, end = var_1586_end_0, end_mask = var_1586_end_mask_0, x = q_43_cast_fp16)[name = string("op_1586_cast_fp16")];
+            tensor<int32, [4]> var_1593_begin_0 = const()[name = string("op_1593_begin_0"), val = tensor<int32, [4]>([0, 32, 0, 0])];
+            tensor<int32, [4]> var_1593_end_0 = const()[name = string("op_1593_end_0"), val = tensor<int32, [4]>([16, 64, 1, 1])];
+            tensor<bool, [4]> var_1593_end_mask_0 = const()[name = string("op_1593_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [16, 32, 1, 1]> var_1593_cast_fp16 = slice_by_index(begin = var_1593_begin_0, end = var_1593_end_0, end_mask = var_1593_end_mask_0, x = q_43_cast_fp16)[name = string("op_1593_cast_fp16")];
+            fp16 const_64_promoted_to_fp16 = const()[name = string("const_64_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [16, 32, 1, 1]> var_1596_cast_fp16 = mul(x = var_1593_cast_fp16, y = const_64_promoted_to_fp16)[name = string("op_1596_cast_fp16")];
+            bool var_1598_interleave_0 = const()[name = string("op_1598_interleave_0"), val = bool(false)];
+            tensor<fp16, [16, 64, 1, 1]> var_1598_cast_fp16 = concat(axis = var_32, interleave = var_1598_interleave_0, values = (var_1596_cast_fp16, var_1586_cast_fp16))[name = string("op_1598_cast_fp16")];
+            tensor<fp16, [16, 64, 1, 1]> var_1599_cast_fp16 = mul(x = var_1598_cast_fp16, y = sin_1_cast_fp16)[name = string("op_1599_cast_fp16")];
+            tensor<fp16, [16, 64, 1, 1]> q_rotated_11_cast_fp16 = add(x = var_1581_cast_fp16, y = var_1599_cast_fp16)[name = string("q_rotated_11_cast_fp16")];
+            tensor<fp16, [16, 64, 1, 1]> var_1601_cast_fp16 = mul(x = k_23_cast_fp16, y = cos_1_cast_fp16)[name = string("op_1601_cast_fp16")];
+            tensor<int32, [4]> var_1606_begin_0 = const()[name = string("op_1606_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_1606_end_0 = const()[name = string("op_1606_end_0"), val = tensor<int32, [4]>([16, 32, 1, 1])];
+            tensor<bool, [4]> var_1606_end_mask_0 = const()[name = string("op_1606_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [16, 32, 1, 1]> var_1606_cast_fp16 = slice_by_index(begin = var_1606_begin_0, end = var_1606_end_0, end_mask = var_1606_end_mask_0, x = k_23_cast_fp16)[name = string("op_1606_cast_fp16")];
+            tensor<int32, [4]> var_1613_begin_0 = const()[name = string("op_1613_begin_0"), val = tensor<int32, [4]>([0, 32, 0, 0])];
+            tensor<int32, [4]> var_1613_end_0 = const()[name = string("op_1613_end_0"), val = tensor<int32, [4]>([16, 64, 1, 1])];
+            tensor<bool, [4]> var_1613_end_mask_0 = const()[name = string("op_1613_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [16, 32, 1, 1]> var_1613_cast_fp16 = slice_by_index(begin = var_1613_begin_0, end = var_1613_end_0, end_mask = var_1613_end_mask_0, x = k_23_cast_fp16)[name = string("op_1613_cast_fp16")];
+            fp16 const_67_promoted_to_fp16 = const()[name = string("const_67_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [16, 32, 1, 1]> var_1616_cast_fp16 = mul(x = var_1613_cast_fp16, y = const_67_promoted_to_fp16)[name = string("op_1616_cast_fp16")];
+            bool var_1618_interleave_0 = const()[name = string("op_1618_interleave_0"), val = bool(false)];
+            tensor<fp16, [16, 64, 1, 1]> var_1618_cast_fp16 = concat(axis = var_32, interleave = var_1618_interleave_0, values = (var_1616_cast_fp16, var_1606_cast_fp16))[name = string("op_1618_cast_fp16")];
+            tensor<fp16, [16, 64, 1, 1]> var_1619_cast_fp16 = mul(x = var_1618_cast_fp16, y = sin_1_cast_fp16)[name = string("op_1619_cast_fp16")];
+            tensor<fp16, [16, 64, 1, 1]> k_rotated_11_cast_fp16 = add(x = var_1601_cast_fp16, y = var_1619_cast_fp16)[name = string("k_rotated_11_cast_fp16")];
+            tensor<int32, [4]> var_1623 = const()[name = string("op_1623"), val = tensor<int32, [4]>([1, 1024, 1, 1])];
+            tensor<fp16, [1, 1024, 1, 1]> current_key_11_cast_fp16 = reshape(shape = var_1623, x = k_rotated_11_cast_fp16)[name = string("current_key_11_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_1628_cast_fp16 = mul(x = var_522_cast_fp16_5, y = var_647_cast_fp16)[name = string("op_1628_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_1629_cast_fp16 = mul(x = current_key_11_cast_fp16, y = update_mask_1_cast_fp16)[name = string("op_1629_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> key_cache_updated_11_cast_fp16 = add(x = var_1628_cast_fp16, y = var_1629_cast_fp16)[name = string("key_cache_updated_11_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_1632_cast_fp16 = mul(x = var_531_cast_fp16_5, y = var_647_cast_fp16)[name = string("op_1632_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_1633_cast_fp16 = mul(x = v_11_cast_fp16, y = update_mask_1_cast_fp16)[name = string("op_1633_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> value_cache_updated_11_cast_fp16 = add(x = var_1632_cast_fp16, y = var_1633_cast_fp16)[name = string("value_cache_updated_11_cast_fp16")];
+            tensor<int32, [4]> var_1635 = const()[name = string("op_1635"), val = tensor<int32, [4]>([1, 16, 64, 1])];
+            tensor<fp16, [1, 16, 64, 1]> q_47_cast_fp16 = reshape(shape = var_1635, x = q_rotated_11_cast_fp16)[name = string("q_47_cast_fp16")];
+            tensor<int32, [4]> var_1638 = const()[name = string("op_1638"), val = tensor<int32, [4]>([1, 16, 64, 256])];
+            tensor<fp16, [1, 16, 64, 256]> k_for_attn_11_cast_fp16 = reshape(shape = var_1638, x = key_cache_updated_11_cast_fp16)[name = string("k_for_attn_11_cast_fp16")];
+            tensor<int32, [4]> var_1640 = const()[name = string("op_1640"), val = tensor<int32, [4]>([1, 16, 64, 256])];
+            tensor<fp16, [1, 16, 64, 256]> v_for_attn_11_cast_fp16 = reshape(shape = var_1640, x = value_cache_updated_11_cast_fp16)[name = string("v_for_attn_11_cast_fp16")];
+            bool var_1644_transpose_x_1 = const()[name = string("op_1644_transpose_x_1"), val = bool(true)];
+            bool var_1644_transpose_y_1 = const()[name = string("op_1644_transpose_y_1"), val = bool(false)];
+            tensor<fp16, [1, 16, 1, 256]> var_1644_cast_fp16 = matmul(transpose_x = var_1644_transpose_x_1, transpose_y = var_1644_transpose_y_1, x = q_47_cast_fp16, y = k_for_attn_11_cast_fp16)[name = string("op_1644_cast_fp16")];
+            fp16 var_1645_to_fp16 = const()[name = string("op_1645_to_fp16"), val = fp16(0x1p-3)];
+            tensor<fp16, [1, 16, 1, 256]> attn_weights_21_cast_fp16 = mul(x = var_1644_cast_fp16, y = var_1645_to_fp16)[name = string("attn_weights_21_cast_fp16")];
+            tensor<fp16, [1, 16, 1, 256]> input_91_cast_fp16 = add(x = attn_weights_21_cast_fp16, y = attn_mask_1_cast_fp16)[name = string("input_91_cast_fp16")];
+            tensor<fp16, [1, 16, 1, 256]> attn_weights_23_cast_fp16 = softmax(axis = var_28, x = input_91_cast_fp16)[name = string("attn_weights_23_cast_fp16")];
+            bool attn_output_21_transpose_x_1 = const()[name = string("attn_output_21_transpose_x_1"), val = bool(false)];
+            bool attn_output_21_transpose_y_1 = const()[name = string("attn_output_21_transpose_y_1"), val = bool(true)];
+            tensor<fp16, [1, 16, 1, 64]> attn_output_21_cast_fp16 = matmul(transpose_x = attn_output_21_transpose_x_1, transpose_y = attn_output_21_transpose_y_1, x = attn_weights_23_cast_fp16, y = v_for_attn_11_cast_fp16)[name = string("attn_output_21_cast_fp16")];
+            tensor<int32, [4]> var_1654 = const()[name = string("op_1654"), val = tensor<int32, [4]>([0, 1, 3, 2])];
+            tensor<int32, [4]> var_1656 = const()[name = string("op_1656"), val = tensor<int32, [4]>([1, -1, 1, 1])];
+            tensor<fp16, [1, 16, 64, 1]> attn_output_23_cast_fp16 = transpose(perm = var_1654, x = attn_output_21_cast_fp16)[name = string("transpose_6")];
+            tensor<fp16, [1, 1024, 1, 1]> input_93_cast_fp16 = reshape(shape = var_1656, x = attn_output_23_cast_fp16)[name = string("input_93_cast_fp16")];
+            string x_35_pad_type_0 = const()[name = string("x_35_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> x_35_strides_0 = const()[name = string("x_35_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> x_35_pad_0 = const()[name = string("x_35_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> x_35_dilations_0 = const()[name = string("x_35_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 x_35_groups_0 = const()[name = string("x_35_groups_0"), val = int32(1)];
+            tensor<fp16, [512, 1024, 1, 1]> op_1671_weight_0_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [512, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(30768256))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(31292608))))[name = string("op_1671_weight_0_to_fp16_palettized")];
+            tensor<fp16, [512]> var_1671_bias_0_to_fp16 = const()[name = string("op_1671_bias_0_to_fp16"), val = tensor<fp16, [512]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(31293184)))];
+            tensor<fp16, [1, 512, 1, 1]> var_1671_cast_fp16 = conv(bias = var_1671_bias_0_to_fp16, dilations = x_35_dilations_0, groups = x_35_groups_0, pad = x_35_pad_0, pad_type = x_35_pad_type_0, strides = x_35_strides_0, weight = op_1671_weight_0_to_fp16_palettized, x = input_93_cast_fp16)[name = string("op_1671_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 1]> inputs_23_cast_fp16 = add(x = inputs_21_cast_fp16, y = var_1671_cast_fp16)[name = string("inputs_23_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 1]> inputs_sq_23_cast_fp16 = mul(x = inputs_23_cast_fp16, y = inputs_23_cast_fp16)[name = string("inputs_sq_23_cast_fp16")];
+            tensor<int32, [1]> variance_23_axes_0 = const()[name = string("variance_23_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_23_keep_dims_0 = const()[name = string("variance_23_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 1, 1, 1]> variance_23_cast_fp16 = reduce_mean(axes = variance_23_axes_0, keep_dims = variance_23_keep_dims_0, x = inputs_sq_23_cast_fp16)[name = string("variance_23_cast_fp16")];
+            fp16 var_1677_to_fp16 = const()[name = string("op_1677_to_fp16"), val = fp16(0x1.5p-17)];
+            tensor<fp16, [1, 1, 1, 1]> var_1678_cast_fp16 = add(x = variance_23_cast_fp16, y = var_1677_to_fp16)[name = string("op_1678_cast_fp16")];
+            fp32 var_1679_epsilon_0 = const()[name = string("op_1679_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 1, 1, 1]> var_1679_cast_fp16 = rsqrt(epsilon = var_1679_epsilon_0, x = var_1678_cast_fp16)[name = string("op_1679_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 1]> hidden_states_23_cast_fp16 = mul(x = inputs_23_cast_fp16, y = var_1679_cast_fp16)[name = string("hidden_states_23_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 1]> w_23_to_fp16 = const()[name = string("w_23_to_fp16"), val = tensor<fp16, [1, 512, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(31294272)))];
+            tensor<fp16, [1, 512, 1, 1]> input_95_cast_fp16 = mul(x = w_23_to_fp16, y = hidden_states_23_cast_fp16)[name = string("input_95_cast_fp16")];
+            string input_97_pad_type_0 = const()[name = string("input_97_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> input_97_strides_0 = const()[name = string("input_97_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> input_97_pad_0 = const()[name = string("input_97_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> input_97_dilations_0 = const()[name = string("input_97_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 input_97_groups_0 = const()[name = string("input_97_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 512, 1, 1]> decoder_pre_transformer_layers_5_mlp_gate_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 512, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(31295360))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(31819712))))[name = string("decoder_pre_transformer_layers_5_mlp_gate_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> input_97_cast_fp16 = conv(dilations = input_97_dilations_0, groups = input_97_groups_0, pad = input_97_pad_0, pad_type = input_97_pad_type_0, strides = input_97_strides_0, weight = decoder_pre_transformer_layers_5_mlp_gate_proj_weight_to_fp16_palettized, x = input_95_cast_fp16)[name = string("input_97_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> var_1693_cast_fp16 = silu(x = input_97_cast_fp16)[name = string("op_1693_cast_fp16")];
+            string var_1699_pad_type_0 = const()[name = string("op_1699_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_1699_strides_0 = const()[name = string("op_1699_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_1699_pad_0 = const()[name = string("op_1699_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_1699_dilations_0 = const()[name = string("op_1699_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_1699_groups_0 = const()[name = string("op_1699_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 512, 1, 1]> decoder_pre_transformer_layers_5_mlp_up_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 512, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(31820288))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(32344640))))[name = string("decoder_pre_transformer_layers_5_mlp_up_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> var_1699_cast_fp16 = conv(dilations = var_1699_dilations_0, groups = var_1699_groups_0, pad = var_1699_pad_0, pad_type = var_1699_pad_type_0, strides = var_1699_strides_0, weight = decoder_pre_transformer_layers_5_mlp_up_proj_weight_to_fp16_palettized, x = input_95_cast_fp16)[name = string("op_1699_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> input_99_cast_fp16 = mul(x = var_1693_cast_fp16, y = var_1699_cast_fp16)[name = string("input_99_cast_fp16")];
+            string x_37_pad_type_0 = const()[name = string("x_37_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> x_37_strides_0 = const()[name = string("x_37_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> x_37_pad_0 = const()[name = string("x_37_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> x_37_dilations_0 = const()[name = string("x_37_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 x_37_groups_0 = const()[name = string("x_37_groups_0"), val = int32(1)];
+            tensor<fp16, [512, 1024, 1, 1]> op_1710_weight_0_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [512, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(32345216))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(32869568))))[name = string("op_1710_weight_0_to_fp16_palettized")];
+            tensor<fp16, [512]> var_1710_bias_0_to_fp16 = const()[name = string("op_1710_bias_0_to_fp16"), val = tensor<fp16, [512]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(32870144)))];
+            tensor<fp16, [1, 512, 1, 1]> var_1710_cast_fp16 = conv(bias = var_1710_bias_0_to_fp16, dilations = x_37_dilations_0, groups = x_37_groups_0, pad = x_37_pad_0, pad_type = x_37_pad_type_0, strides = x_37_strides_0, weight = op_1710_weight_0_to_fp16_palettized, x = input_99_cast_fp16)[name = string("op_1710_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 1]> inputs_25_cast_fp16 = add(x = inputs_23_cast_fp16, y = var_1710_cast_fp16)[name = string("inputs_25_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 1]> inputs_sq_25_cast_fp16 = mul(x = inputs_25_cast_fp16, y = inputs_25_cast_fp16)[name = string("inputs_sq_25_cast_fp16")];
+            tensor<int32, [1]> variance_25_axes_0 = const()[name = string("variance_25_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_25_keep_dims_0 = const()[name = string("variance_25_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 1, 1, 1]> variance_25_cast_fp16 = reduce_mean(axes = variance_25_axes_0, keep_dims = variance_25_keep_dims_0, x = inputs_sq_25_cast_fp16)[name = string("variance_25_cast_fp16")];
+            fp16 var_1726_to_fp16 = const()[name = string("op_1726_to_fp16"), val = fp16(0x1.5p-17)];
+            tensor<fp16, [1, 1, 1, 1]> var_1727_cast_fp16 = add(x = variance_25_cast_fp16, y = var_1726_to_fp16)[name = string("op_1727_cast_fp16")];
+            fp32 var_1728_epsilon_0 = const()[name = string("op_1728_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 1, 1, 1]> var_1728_cast_fp16 = rsqrt(epsilon = var_1728_epsilon_0, x = var_1727_cast_fp16)[name = string("op_1728_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 1]> hidden_states_25_cast_fp16 = mul(x = inputs_25_cast_fp16, y = var_1728_cast_fp16)[name = string("hidden_states_25_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 1]> w_25_to_fp16 = const()[name = string("w_25_to_fp16"), val = tensor<fp16, [1, 512, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(32871232)))];
+            tensor<fp16, [1, 512, 1, 1]> x_39_cast_fp16 = mul(x = w_25_to_fp16, y = hidden_states_25_cast_fp16)[name = string("x_39_cast_fp16")];
+            string q_49_pad_type_0 = const()[name = string("q_49_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> q_49_strides_0 = const()[name = string("q_49_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> q_49_pad_0 = const()[name = string("q_49_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> q_49_dilations_0 = const()[name = string("q_49_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 q_49_groups_0 = const()[name = string("q_49_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 512, 1, 1]> decoder_pre_transformer_layers_6_self_attn_q_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 512, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(32872320))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(33396672))))[name = string("decoder_pre_transformer_layers_6_self_attn_q_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> q_49_cast_fp16 = conv(dilations = q_49_dilations_0, groups = q_49_groups_0, pad = q_49_pad_0, pad_type = q_49_pad_type_0, strides = q_49_strides_0, weight = decoder_pre_transformer_layers_6_self_attn_q_proj_weight_to_fp16_palettized, x = x_39_cast_fp16)[name = string("q_49_cast_fp16")];
+            string k_25_pad_type_0 = const()[name = string("k_25_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> k_25_strides_0 = const()[name = string("k_25_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> k_25_pad_0 = const()[name = string("k_25_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> k_25_dilations_0 = const()[name = string("k_25_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 k_25_groups_0 = const()[name = string("k_25_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 512, 1, 1]> decoder_pre_transformer_layers_6_self_attn_k_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 512, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(33397248))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(33921600))))[name = string("decoder_pre_transformer_layers_6_self_attn_k_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> k_25_cast_fp16 = conv(dilations = k_25_dilations_0, groups = k_25_groups_0, pad = k_25_pad_0, pad_type = k_25_pad_type_0, strides = k_25_strides_0, weight = decoder_pre_transformer_layers_6_self_attn_k_proj_weight_to_fp16_palettized, x = x_39_cast_fp16)[name = string("k_25_cast_fp16")];
+            string v_13_pad_type_0 = const()[name = string("v_13_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> v_13_strides_0 = const()[name = string("v_13_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> v_13_pad_0 = const()[name = string("v_13_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> v_13_dilations_0 = const()[name = string("v_13_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 v_13_groups_0 = const()[name = string("v_13_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 512, 1, 1]> decoder_pre_transformer_layers_6_self_attn_v_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 512, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(33922176))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(34446528))))[name = string("decoder_pre_transformer_layers_6_self_attn_v_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> v_13_cast_fp16 = conv(dilations = v_13_dilations_0, groups = v_13_groups_0, pad = v_13_pad_0, pad_type = v_13_pad_type_0, strides = v_13_strides_0, weight = decoder_pre_transformer_layers_6_self_attn_v_proj_weight_to_fp16_palettized, x = x_39_cast_fp16)[name = string("v_13_cast_fp16")];
+            tensor<int32, [4]> var_1760 = const()[name = string("op_1760"), val = tensor<int32, [4]>([16, 64, 1, 1])];
+            tensor<fp16, [16, 64, 1, 1]> q_51_cast_fp16 = reshape(shape = var_1760, x = q_49_cast_fp16)[name = string("q_51_cast_fp16")];
+            tensor<int32, [4]> var_1765 = const()[name = string("op_1765"), val = tensor<int32, [4]>([16, 64, 1, 1])];
+            tensor<fp16, [16, 64, 1, 1]> k_27_cast_fp16 = reshape(shape = var_1765, x = k_25_cast_fp16)[name = string("k_27_cast_fp16")];
+            tensor<fp16, [16, 64, 1, 1]> var_1777_cast_fp16 = mul(x = q_51_cast_fp16, y = cos_1_cast_fp16)[name = string("op_1777_cast_fp16")];
+            tensor<int32, [4]> var_1782_begin_0 = const()[name = string("op_1782_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_1782_end_0 = const()[name = string("op_1782_end_0"), val = tensor<int32, [4]>([16, 32, 1, 1])];
+            tensor<bool, [4]> var_1782_end_mask_0 = const()[name = string("op_1782_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [16, 32, 1, 1]> var_1782_cast_fp16 = slice_by_index(begin = var_1782_begin_0, end = var_1782_end_0, end_mask = var_1782_end_mask_0, x = q_51_cast_fp16)[name = string("op_1782_cast_fp16")];
+            tensor<int32, [4]> var_1789_begin_0 = const()[name = string("op_1789_begin_0"), val = tensor<int32, [4]>([0, 32, 0, 0])];
+            tensor<int32, [4]> var_1789_end_0 = const()[name = string("op_1789_end_0"), val = tensor<int32, [4]>([16, 64, 1, 1])];
+            tensor<bool, [4]> var_1789_end_mask_0 = const()[name = string("op_1789_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [16, 32, 1, 1]> var_1789_cast_fp16 = slice_by_index(begin = var_1789_begin_0, end = var_1789_end_0, end_mask = var_1789_end_mask_0, x = q_51_cast_fp16)[name = string("op_1789_cast_fp16")];
+            fp16 const_72_promoted_to_fp16 = const()[name = string("const_72_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [16, 32, 1, 1]> var_1792_cast_fp16 = mul(x = var_1789_cast_fp16, y = const_72_promoted_to_fp16)[name = string("op_1792_cast_fp16")];
+            bool var_1794_interleave_0 = const()[name = string("op_1794_interleave_0"), val = bool(false)];
+            tensor<fp16, [16, 64, 1, 1]> var_1794_cast_fp16 = concat(axis = var_32, interleave = var_1794_interleave_0, values = (var_1792_cast_fp16, var_1782_cast_fp16))[name = string("op_1794_cast_fp16")];
+            tensor<fp16, [16, 64, 1, 1]> var_1795_cast_fp16 = mul(x = var_1794_cast_fp16, y = sin_1_cast_fp16)[name = string("op_1795_cast_fp16")];
+            tensor<fp16, [16, 64, 1, 1]> q_rotated_13_cast_fp16 = add(x = var_1777_cast_fp16, y = var_1795_cast_fp16)[name = string("q_rotated_13_cast_fp16")];
+            tensor<fp16, [16, 64, 1, 1]> var_1797_cast_fp16 = mul(x = k_27_cast_fp16, y = cos_1_cast_fp16)[name = string("op_1797_cast_fp16")];
+            tensor<int32, [4]> var_1802_begin_0 = const()[name = string("op_1802_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_1802_end_0 = const()[name = string("op_1802_end_0"), val = tensor<int32, [4]>([16, 32, 1, 1])];
+            tensor<bool, [4]> var_1802_end_mask_0 = const()[name = string("op_1802_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [16, 32, 1, 1]> var_1802_cast_fp16 = slice_by_index(begin = var_1802_begin_0, end = var_1802_end_0, end_mask = var_1802_end_mask_0, x = k_27_cast_fp16)[name = string("op_1802_cast_fp16")];
+            tensor<int32, [4]> var_1809_begin_0 = const()[name = string("op_1809_begin_0"), val = tensor<int32, [4]>([0, 32, 0, 0])];
+            tensor<int32, [4]> var_1809_end_0 = const()[name = string("op_1809_end_0"), val = tensor<int32, [4]>([16, 64, 1, 1])];
+            tensor<bool, [4]> var_1809_end_mask_0 = const()[name = string("op_1809_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [16, 32, 1, 1]> var_1809_cast_fp16 = slice_by_index(begin = var_1809_begin_0, end = var_1809_end_0, end_mask = var_1809_end_mask_0, x = k_27_cast_fp16)[name = string("op_1809_cast_fp16")];
+            fp16 const_75_promoted_to_fp16 = const()[name = string("const_75_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [16, 32, 1, 1]> var_1812_cast_fp16 = mul(x = var_1809_cast_fp16, y = const_75_promoted_to_fp16)[name = string("op_1812_cast_fp16")];
+            bool var_1814_interleave_0 = const()[name = string("op_1814_interleave_0"), val = bool(false)];
+            tensor<fp16, [16, 64, 1, 1]> var_1814_cast_fp16 = concat(axis = var_32, interleave = var_1814_interleave_0, values = (var_1812_cast_fp16, var_1802_cast_fp16))[name = string("op_1814_cast_fp16")];
+            tensor<fp16, [16, 64, 1, 1]> var_1815_cast_fp16 = mul(x = var_1814_cast_fp16, y = sin_1_cast_fp16)[name = string("op_1815_cast_fp16")];
+            tensor<fp16, [16, 64, 1, 1]> k_rotated_13_cast_fp16 = add(x = var_1797_cast_fp16, y = var_1815_cast_fp16)[name = string("k_rotated_13_cast_fp16")];
+            tensor<int32, [4]> var_1819 = const()[name = string("op_1819"), val = tensor<int32, [4]>([1, 1024, 1, 1])];
+            tensor<fp16, [1, 1024, 1, 1]> current_key_13_cast_fp16 = reshape(shape = var_1819, x = k_rotated_13_cast_fp16)[name = string("current_key_13_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_1824_cast_fp16 = mul(x = var_522_cast_fp16_6, y = var_647_cast_fp16)[name = string("op_1824_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_1825_cast_fp16 = mul(x = current_key_13_cast_fp16, y = update_mask_1_cast_fp16)[name = string("op_1825_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> key_cache_updated_13_cast_fp16 = add(x = var_1824_cast_fp16, y = var_1825_cast_fp16)[name = string("key_cache_updated_13_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_1828_cast_fp16 = mul(x = var_531_cast_fp16_6, y = var_647_cast_fp16)[name = string("op_1828_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_1829_cast_fp16 = mul(x = v_13_cast_fp16, y = update_mask_1_cast_fp16)[name = string("op_1829_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> value_cache_updated_13_cast_fp16 = add(x = var_1828_cast_fp16, y = var_1829_cast_fp16)[name = string("value_cache_updated_13_cast_fp16")];
+            tensor<int32, [4]> var_1831 = const()[name = string("op_1831"), val = tensor<int32, [4]>([1, 16, 64, 1])];
+            tensor<fp16, [1, 16, 64, 1]> q_55_cast_fp16 = reshape(shape = var_1831, x = q_rotated_13_cast_fp16)[name = string("q_55_cast_fp16")];
+            tensor<int32, [4]> var_1834 = const()[name = string("op_1834"), val = tensor<int32, [4]>([1, 16, 64, 256])];
+            tensor<fp16, [1, 16, 64, 256]> k_for_attn_13_cast_fp16 = reshape(shape = var_1834, x = key_cache_updated_13_cast_fp16)[name = string("k_for_attn_13_cast_fp16")];
+            tensor<int32, [4]> var_1836 = const()[name = string("op_1836"), val = tensor<int32, [4]>([1, 16, 64, 256])];
+            tensor<fp16, [1, 16, 64, 256]> v_for_attn_13_cast_fp16 = reshape(shape = var_1836, x = value_cache_updated_13_cast_fp16)[name = string("v_for_attn_13_cast_fp16")];
+            bool var_1840_transpose_x_1 = const()[name = string("op_1840_transpose_x_1"), val = bool(true)];
+            bool var_1840_transpose_y_1 = const()[name = string("op_1840_transpose_y_1"), val = bool(false)];
+            tensor<fp16, [1, 16, 1, 256]> var_1840_cast_fp16 = matmul(transpose_x = var_1840_transpose_x_1, transpose_y = var_1840_transpose_y_1, x = q_55_cast_fp16, y = k_for_attn_13_cast_fp16)[name = string("op_1840_cast_fp16")];
+            fp16 var_1841_to_fp16 = const()[name = string("op_1841_to_fp16"), val = fp16(0x1p-3)];
+            tensor<fp16, [1, 16, 1, 256]> attn_weights_25_cast_fp16 = mul(x = var_1840_cast_fp16, y = var_1841_to_fp16)[name = string("attn_weights_25_cast_fp16")];
+            tensor<fp16, [1, 16, 1, 256]> input_101_cast_fp16 = add(x = attn_weights_25_cast_fp16, y = attn_mask_1_cast_fp16)[name = string("input_101_cast_fp16")];
+            tensor<fp16, [1, 16, 1, 256]> attn_weights_27_cast_fp16 = softmax(axis = var_28, x = input_101_cast_fp16)[name = string("attn_weights_27_cast_fp16")];
+            bool attn_output_25_transpose_x_1 = const()[name = string("attn_output_25_transpose_x_1"), val = bool(false)];
+            bool attn_output_25_transpose_y_1 = const()[name = string("attn_output_25_transpose_y_1"), val = bool(true)];
+            tensor<fp16, [1, 16, 1, 64]> attn_output_25_cast_fp16 = matmul(transpose_x = attn_output_25_transpose_x_1, transpose_y = attn_output_25_transpose_y_1, x = attn_weights_27_cast_fp16, y = v_for_attn_13_cast_fp16)[name = string("attn_output_25_cast_fp16")];
+            tensor<int32, [4]> var_1850 = const()[name = string("op_1850"), val = tensor<int32, [4]>([0, 1, 3, 2])];
+            tensor<int32, [4]> var_1852 = const()[name = string("op_1852"), val = tensor<int32, [4]>([1, -1, 1, 1])];
+            tensor<fp16, [1, 16, 64, 1]> attn_output_27_cast_fp16 = transpose(perm = var_1850, x = attn_output_25_cast_fp16)[name = string("transpose_5")];
+            tensor<fp16, [1, 1024, 1, 1]> input_103_cast_fp16 = reshape(shape = var_1852, x = attn_output_27_cast_fp16)[name = string("input_103_cast_fp16")];
+            string x_41_pad_type_0 = const()[name = string("x_41_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> x_41_strides_0 = const()[name = string("x_41_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> x_41_pad_0 = const()[name = string("x_41_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> x_41_dilations_0 = const()[name = string("x_41_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 x_41_groups_0 = const()[name = string("x_41_groups_0"), val = int32(1)];
+            tensor<fp16, [512, 1024, 1, 1]> op_1867_weight_0_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [512, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(34447104))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(34971456))))[name = string("op_1867_weight_0_to_fp16_palettized")];
+            tensor<fp16, [512]> var_1867_bias_0_to_fp16 = const()[name = string("op_1867_bias_0_to_fp16"), val = tensor<fp16, [512]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(34972032)))];
+            tensor<fp16, [1, 512, 1, 1]> var_1867_cast_fp16 = conv(bias = var_1867_bias_0_to_fp16, dilations = x_41_dilations_0, groups = x_41_groups_0, pad = x_41_pad_0, pad_type = x_41_pad_type_0, strides = x_41_strides_0, weight = op_1867_weight_0_to_fp16_palettized, x = input_103_cast_fp16)[name = string("op_1867_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 1]> inputs_27_cast_fp16 = add(x = inputs_25_cast_fp16, y = var_1867_cast_fp16)[name = string("inputs_27_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 1]> inputs_sq_27_cast_fp16 = mul(x = inputs_27_cast_fp16, y = inputs_27_cast_fp16)[name = string("inputs_sq_27_cast_fp16")];
+            tensor<int32, [1]> variance_27_axes_0 = const()[name = string("variance_27_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_27_keep_dims_0 = const()[name = string("variance_27_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 1, 1, 1]> variance_27_cast_fp16 = reduce_mean(axes = variance_27_axes_0, keep_dims = variance_27_keep_dims_0, x = inputs_sq_27_cast_fp16)[name = string("variance_27_cast_fp16")];
+            fp16 var_1873_to_fp16 = const()[name = string("op_1873_to_fp16"), val = fp16(0x1.5p-17)];
+            tensor<fp16, [1, 1, 1, 1]> var_1874_cast_fp16 = add(x = variance_27_cast_fp16, y = var_1873_to_fp16)[name = string("op_1874_cast_fp16")];
+            fp32 var_1875_epsilon_0 = const()[name = string("op_1875_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 1, 1, 1]> var_1875_cast_fp16 = rsqrt(epsilon = var_1875_epsilon_0, x = var_1874_cast_fp16)[name = string("op_1875_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 1]> hidden_states_27_cast_fp16 = mul(x = inputs_27_cast_fp16, y = var_1875_cast_fp16)[name = string("hidden_states_27_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 1]> w_27_to_fp16 = const()[name = string("w_27_to_fp16"), val = tensor<fp16, [1, 512, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(34973120)))];
+            tensor<fp16, [1, 512, 1, 1]> input_105_cast_fp16 = mul(x = w_27_to_fp16, y = hidden_states_27_cast_fp16)[name = string("input_105_cast_fp16")];
+            string input_107_pad_type_0 = const()[name = string("input_107_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> input_107_strides_0 = const()[name = string("input_107_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> input_107_pad_0 = const()[name = string("input_107_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> input_107_dilations_0 = const()[name = string("input_107_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 input_107_groups_0 = const()[name = string("input_107_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 512, 1, 1]> decoder_pre_transformer_layers_6_mlp_gate_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 512, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(34974208))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(35498560))))[name = string("decoder_pre_transformer_layers_6_mlp_gate_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> input_107_cast_fp16 = conv(dilations = input_107_dilations_0, groups = input_107_groups_0, pad = input_107_pad_0, pad_type = input_107_pad_type_0, strides = input_107_strides_0, weight = decoder_pre_transformer_layers_6_mlp_gate_proj_weight_to_fp16_palettized, x = input_105_cast_fp16)[name = string("input_107_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> var_1889_cast_fp16 = silu(x = input_107_cast_fp16)[name = string("op_1889_cast_fp16")];
+            string var_1895_pad_type_0 = const()[name = string("op_1895_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_1895_strides_0 = const()[name = string("op_1895_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_1895_pad_0 = const()[name = string("op_1895_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_1895_dilations_0 = const()[name = string("op_1895_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_1895_groups_0 = const()[name = string("op_1895_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 512, 1, 1]> decoder_pre_transformer_layers_6_mlp_up_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 512, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(35499136))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(36023488))))[name = string("decoder_pre_transformer_layers_6_mlp_up_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> var_1895_cast_fp16 = conv(dilations = var_1895_dilations_0, groups = var_1895_groups_0, pad = var_1895_pad_0, pad_type = var_1895_pad_type_0, strides = var_1895_strides_0, weight = decoder_pre_transformer_layers_6_mlp_up_proj_weight_to_fp16_palettized, x = input_105_cast_fp16)[name = string("op_1895_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> input_109_cast_fp16 = mul(x = var_1889_cast_fp16, y = var_1895_cast_fp16)[name = string("input_109_cast_fp16")];
+            string x_43_pad_type_0 = const()[name = string("x_43_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> x_43_strides_0 = const()[name = string("x_43_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> x_43_pad_0 = const()[name = string("x_43_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> x_43_dilations_0 = const()[name = string("x_43_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 x_43_groups_0 = const()[name = string("x_43_groups_0"), val = int32(1)];
+            tensor<fp16, [512, 1024, 1, 1]> op_1906_weight_0_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [512, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(36024064))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(36548416))))[name = string("op_1906_weight_0_to_fp16_palettized")];
+            tensor<fp16, [512]> var_1906_bias_0_to_fp16 = const()[name = string("op_1906_bias_0_to_fp16"), val = tensor<fp16, [512]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(36548992)))];
+            tensor<fp16, [1, 512, 1, 1]> var_1906_cast_fp16 = conv(bias = var_1906_bias_0_to_fp16, dilations = x_43_dilations_0, groups = x_43_groups_0, pad = x_43_pad_0, pad_type = x_43_pad_type_0, strides = x_43_strides_0, weight = op_1906_weight_0_to_fp16_palettized, x = input_109_cast_fp16)[name = string("op_1906_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 1]> inputs_29_cast_fp16 = add(x = inputs_27_cast_fp16, y = var_1906_cast_fp16)[name = string("inputs_29_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 1]> inputs_sq_29_cast_fp16 = mul(x = inputs_29_cast_fp16, y = inputs_29_cast_fp16)[name = string("inputs_sq_29_cast_fp16")];
+            tensor<int32, [1]> variance_29_axes_0 = const()[name = string("variance_29_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_29_keep_dims_0 = const()[name = string("variance_29_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 1, 1, 1]> variance_29_cast_fp16 = reduce_mean(axes = variance_29_axes_0, keep_dims = variance_29_keep_dims_0, x = inputs_sq_29_cast_fp16)[name = string("variance_29_cast_fp16")];
+            fp16 var_1922_to_fp16 = const()[name = string("op_1922_to_fp16"), val = fp16(0x1.5p-17)];
+            tensor<fp16, [1, 1, 1, 1]> var_1923_cast_fp16 = add(x = variance_29_cast_fp16, y = var_1922_to_fp16)[name = string("op_1923_cast_fp16")];
+            fp32 var_1924_epsilon_0 = const()[name = string("op_1924_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 1, 1, 1]> var_1924_cast_fp16 = rsqrt(epsilon = var_1924_epsilon_0, x = var_1923_cast_fp16)[name = string("op_1924_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 1]> hidden_states_29_cast_fp16 = mul(x = inputs_29_cast_fp16, y = var_1924_cast_fp16)[name = string("hidden_states_29_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 1]> w_29_to_fp16 = const()[name = string("w_29_to_fp16"), val = tensor<fp16, [1, 512, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(36550080)))];
+            tensor<fp16, [1, 512, 1, 1]> x_45_cast_fp16 = mul(x = w_29_to_fp16, y = hidden_states_29_cast_fp16)[name = string("x_45_cast_fp16")];
+            string q_57_pad_type_0 = const()[name = string("q_57_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> q_57_strides_0 = const()[name = string("q_57_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> q_57_pad_0 = const()[name = string("q_57_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> q_57_dilations_0 = const()[name = string("q_57_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 q_57_groups_0 = const()[name = string("q_57_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 512, 1, 1]> decoder_pre_transformer_layers_7_self_attn_q_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 512, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(36551168))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(37075520))))[name = string("decoder_pre_transformer_layers_7_self_attn_q_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> q_57_cast_fp16 = conv(dilations = q_57_dilations_0, groups = q_57_groups_0, pad = q_57_pad_0, pad_type = q_57_pad_type_0, strides = q_57_strides_0, weight = decoder_pre_transformer_layers_7_self_attn_q_proj_weight_to_fp16_palettized, x = x_45_cast_fp16)[name = string("q_57_cast_fp16")];
+            string k_29_pad_type_0 = const()[name = string("k_29_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> k_29_strides_0 = const()[name = string("k_29_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> k_29_pad_0 = const()[name = string("k_29_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> k_29_dilations_0 = const()[name = string("k_29_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 k_29_groups_0 = const()[name = string("k_29_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 512, 1, 1]> decoder_pre_transformer_layers_7_self_attn_k_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 512, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(37076096))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(37600448))))[name = string("decoder_pre_transformer_layers_7_self_attn_k_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> k_29_cast_fp16 = conv(dilations = k_29_dilations_0, groups = k_29_groups_0, pad = k_29_pad_0, pad_type = k_29_pad_type_0, strides = k_29_strides_0, weight = decoder_pre_transformer_layers_7_self_attn_k_proj_weight_to_fp16_palettized, x = x_45_cast_fp16)[name = string("k_29_cast_fp16")];
+            string v_pad_type_0 = const()[name = string("v_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> v_strides_0 = const()[name = string("v_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> v_pad_0 = const()[name = string("v_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> v_dilations_0 = const()[name = string("v_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 v_groups_0 = const()[name = string("v_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 512, 1, 1]> decoder_pre_transformer_layers_7_self_attn_v_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 512, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(37601024))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(38125376))))[name = string("decoder_pre_transformer_layers_7_self_attn_v_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> v_cast_fp16 = conv(dilations = v_dilations_0, groups = v_groups_0, pad = v_pad_0, pad_type = v_pad_type_0, strides = v_strides_0, weight = decoder_pre_transformer_layers_7_self_attn_v_proj_weight_to_fp16_palettized, x = x_45_cast_fp16)[name = string("v_cast_fp16")];
+            tensor<int32, [4]> var_1956 = const()[name = string("op_1956"), val = tensor<int32, [4]>([16, 64, 1, 1])];
+            tensor<fp16, [16, 64, 1, 1]> q_59_cast_fp16 = reshape(shape = var_1956, x = q_57_cast_fp16)[name = string("q_59_cast_fp16")];
+            tensor<int32, [4]> var_1961 = const()[name = string("op_1961"), val = tensor<int32, [4]>([16, 64, 1, 1])];
+            tensor<fp16, [16, 64, 1, 1]> k_cast_fp16 = reshape(shape = var_1961, x = k_29_cast_fp16)[name = string("k_cast_fp16")];
+            tensor<fp16, [16, 64, 1, 1]> var_1973_cast_fp16 = mul(x = q_59_cast_fp16, y = cos_1_cast_fp16)[name = string("op_1973_cast_fp16")];
+            tensor<int32, [4]> var_1978_begin_0 = const()[name = string("op_1978_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_1978_end_0 = const()[name = string("op_1978_end_0"), val = tensor<int32, [4]>([16, 32, 1, 1])];
+            tensor<bool, [4]> var_1978_end_mask_0 = const()[name = string("op_1978_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [16, 32, 1, 1]> var_1978_cast_fp16 = slice_by_index(begin = var_1978_begin_0, end = var_1978_end_0, end_mask = var_1978_end_mask_0, x = q_59_cast_fp16)[name = string("op_1978_cast_fp16")];
+            tensor<int32, [4]> var_1985_begin_0 = const()[name = string("op_1985_begin_0"), val = tensor<int32, [4]>([0, 32, 0, 0])];
+            tensor<int32, [4]> var_1985_end_0 = const()[name = string("op_1985_end_0"), val = tensor<int32, [4]>([16, 64, 1, 1])];
+            tensor<bool, [4]> var_1985_end_mask_0 = const()[name = string("op_1985_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [16, 32, 1, 1]> var_1985_cast_fp16 = slice_by_index(begin = var_1985_begin_0, end = var_1985_end_0, end_mask = var_1985_end_mask_0, x = q_59_cast_fp16)[name = string("op_1985_cast_fp16")];
+            fp16 const_80_promoted_to_fp16 = const()[name = string("const_80_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [16, 32, 1, 1]> var_1988_cast_fp16 = mul(x = var_1985_cast_fp16, y = const_80_promoted_to_fp16)[name = string("op_1988_cast_fp16")];
+            bool var_1990_interleave_0 = const()[name = string("op_1990_interleave_0"), val = bool(false)];
+            tensor<fp16, [16, 64, 1, 1]> var_1990_cast_fp16 = concat(axis = var_32, interleave = var_1990_interleave_0, values = (var_1988_cast_fp16, var_1978_cast_fp16))[name = string("op_1990_cast_fp16")];
+            tensor<fp16, [16, 64, 1, 1]> var_1991_cast_fp16 = mul(x = var_1990_cast_fp16, y = sin_1_cast_fp16)[name = string("op_1991_cast_fp16")];
+            tensor<fp16, [16, 64, 1, 1]> q_rotated_cast_fp16 = add(x = var_1973_cast_fp16, y = var_1991_cast_fp16)[name = string("q_rotated_cast_fp16")];
+            tensor<fp16, [16, 64, 1, 1]> var_1993_cast_fp16 = mul(x = k_cast_fp16, y = cos_1_cast_fp16)[name = string("op_1993_cast_fp16")];
+            tensor<int32, [4]> var_1998_begin_0 = const()[name = string("op_1998_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_1998_end_0 = const()[name = string("op_1998_end_0"), val = tensor<int32, [4]>([16, 32, 1, 1])];
+            tensor<bool, [4]> var_1998_end_mask_0 = const()[name = string("op_1998_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [16, 32, 1, 1]> var_1998_cast_fp16 = slice_by_index(begin = var_1998_begin_0, end = var_1998_end_0, end_mask = var_1998_end_mask_0, x = k_cast_fp16)[name = string("op_1998_cast_fp16")];
+            tensor<int32, [4]> var_2005_begin_0 = const()[name = string("op_2005_begin_0"), val = tensor<int32, [4]>([0, 32, 0, 0])];
+            tensor<int32, [4]> var_2005_end_0 = const()[name = string("op_2005_end_0"), val = tensor<int32, [4]>([16, 64, 1, 1])];
+            tensor<bool, [4]> var_2005_end_mask_0 = const()[name = string("op_2005_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [16, 32, 1, 1]> var_2005_cast_fp16 = slice_by_index(begin = var_2005_begin_0, end = var_2005_end_0, end_mask = var_2005_end_mask_0, x = k_cast_fp16)[name = string("op_2005_cast_fp16")];
+            fp16 const_83_promoted_to_fp16 = const()[name = string("const_83_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [16, 32, 1, 1]> var_2008_cast_fp16 = mul(x = var_2005_cast_fp16, y = const_83_promoted_to_fp16)[name = string("op_2008_cast_fp16")];
+            bool var_2010_interleave_0 = const()[name = string("op_2010_interleave_0"), val = bool(false)];
+            tensor<fp16, [16, 64, 1, 1]> var_2010_cast_fp16 = concat(axis = var_32, interleave = var_2010_interleave_0, values = (var_2008_cast_fp16, var_1998_cast_fp16))[name = string("op_2010_cast_fp16")];
+            tensor<fp16, [16, 64, 1, 1]> var_2011_cast_fp16 = mul(x = var_2010_cast_fp16, y = sin_1_cast_fp16)[name = string("op_2011_cast_fp16")];
+            tensor<fp16, [16, 64, 1, 1]> k_rotated_cast_fp16 = add(x = var_1993_cast_fp16, y = var_2011_cast_fp16)[name = string("k_rotated_cast_fp16")];
+            tensor<int32, [4]> var_2015 = const()[name = string("op_2015"), val = tensor<int32, [4]>([1, 1024, 1, 1])];
+            tensor<fp16, [1, 1024, 1, 1]> current_key_cast_fp16 = reshape(shape = var_2015, x = k_rotated_cast_fp16)[name = string("current_key_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_2020_cast_fp16 = mul(x = var_522_cast_fp16_7, y = var_647_cast_fp16)[name = string("op_2020_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_2021_cast_fp16 = mul(x = current_key_cast_fp16, y = update_mask_1_cast_fp16)[name = string("op_2021_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> key_cache_updated_cast_fp16 = add(x = var_2020_cast_fp16, y = var_2021_cast_fp16)[name = string("key_cache_updated_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_2024_cast_fp16 = mul(x = var_531_cast_fp16_7, y = var_647_cast_fp16)[name = string("op_2024_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_2025_cast_fp16 = mul(x = v_cast_fp16, y = update_mask_1_cast_fp16)[name = string("op_2025_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> value_cache_updated_cast_fp16 = add(x = var_2024_cast_fp16, y = var_2025_cast_fp16)[name = string("value_cache_updated_cast_fp16")];
+            tensor<int32, [4]> var_2027 = const()[name = string("op_2027"), val = tensor<int32, [4]>([1, 16, 64, 1])];
+            tensor<fp16, [1, 16, 64, 1]> q_cast_fp16 = reshape(shape = var_2027, x = q_rotated_cast_fp16)[name = string("q_cast_fp16")];
+            tensor<int32, [4]> var_2030 = const()[name = string("op_2030"), val = tensor<int32, [4]>([1, 16, 64, 256])];
+            tensor<fp16, [1, 16, 64, 256]> k_for_attn_cast_fp16 = reshape(shape = var_2030, x = key_cache_updated_cast_fp16)[name = string("k_for_attn_cast_fp16")];
+            tensor<int32, [4]> var_2032 = const()[name = string("op_2032"), val = tensor<int32, [4]>([1, 16, 64, 256])];
+            tensor<fp16, [1, 16, 64, 256]> v_for_attn_cast_fp16 = reshape(shape = var_2032, x = value_cache_updated_cast_fp16)[name = string("v_for_attn_cast_fp16")];
+            bool var_2036_transpose_x_1 = const()[name = string("op_2036_transpose_x_1"), val = bool(true)];
+            bool var_2036_transpose_y_1 = const()[name = string("op_2036_transpose_y_1"), val = bool(false)];
+            tensor<fp16, [1, 16, 1, 256]> var_2036_cast_fp16 = matmul(transpose_x = var_2036_transpose_x_1, transpose_y = var_2036_transpose_y_1, x = q_cast_fp16, y = k_for_attn_cast_fp16)[name = string("op_2036_cast_fp16")];
+            fp16 var_2037_to_fp16 = const()[name = string("op_2037_to_fp16"), val = fp16(0x1p-3)];
+            tensor<fp16, [1, 16, 1, 256]> attn_weights_29_cast_fp16 = mul(x = var_2036_cast_fp16, y = var_2037_to_fp16)[name = string("attn_weights_29_cast_fp16")];
+            tensor<fp16, [1, 16, 1, 256]> input_111_cast_fp16 = add(x = attn_weights_29_cast_fp16, y = attn_mask_1_cast_fp16)[name = string("input_111_cast_fp16")];
+            tensor<fp16, [1, 16, 1, 256]> attn_weights_cast_fp16 = softmax(axis = var_28, x = input_111_cast_fp16)[name = string("attn_weights_cast_fp16")];
+            bool attn_output_29_transpose_x_1 = const()[name = string("attn_output_29_transpose_x_1"), val = bool(false)];
+            bool attn_output_29_transpose_y_1 = const()[name = string("attn_output_29_transpose_y_1"), val = bool(true)];
+            tensor<fp16, [1, 16, 1, 64]> attn_output_29_cast_fp16 = matmul(transpose_x = attn_output_29_transpose_x_1, transpose_y = attn_output_29_transpose_y_1, x = attn_weights_cast_fp16, y = v_for_attn_cast_fp16)[name = string("attn_output_29_cast_fp16")];
+            tensor<int32, [4]> var_2046 = const()[name = string("op_2046"), val = tensor<int32, [4]>([0, 1, 3, 2])];
+            tensor<int32, [4]> var_2048 = const()[name = string("op_2048"), val = tensor<int32, [4]>([1, -1, 1, 1])];
+            tensor<fp16, [1, 16, 64, 1]> attn_output_cast_fp16 = transpose(perm = var_2046, x = attn_output_29_cast_fp16)[name = string("transpose_4")];
+            tensor<fp16, [1, 1024, 1, 1]> input_113_cast_fp16 = reshape(shape = var_2048, x = attn_output_cast_fp16)[name = string("input_113_cast_fp16")];
+            string x_47_pad_type_0 = const()[name = string("x_47_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> x_47_strides_0 = const()[name = string("x_47_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> x_47_pad_0 = const()[name = string("x_47_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> x_47_dilations_0 = const()[name = string("x_47_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 x_47_groups_0 = const()[name = string("x_47_groups_0"), val = int32(1)];
+            tensor<fp16, [512, 1024, 1, 1]> op_2063_weight_0_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [512, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(38125952))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(38650304))))[name = string("op_2063_weight_0_to_fp16_palettized")];
+            tensor<fp16, [512]> var_2063_bias_0_to_fp16 = const()[name = string("op_2063_bias_0_to_fp16"), val = tensor<fp16, [512]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(38650880)))];
+            tensor<fp16, [1, 512, 1, 1]> var_2063_cast_fp16 = conv(bias = var_2063_bias_0_to_fp16, dilations = x_47_dilations_0, groups = x_47_groups_0, pad = x_47_pad_0, pad_type = x_47_pad_type_0, strides = x_47_strides_0, weight = op_2063_weight_0_to_fp16_palettized, x = input_113_cast_fp16)[name = string("op_2063_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 1]> inputs_31_cast_fp16 = add(x = inputs_29_cast_fp16, y = var_2063_cast_fp16)[name = string("inputs_31_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 1]> inputs_sq_31_cast_fp16 = mul(x = inputs_31_cast_fp16, y = inputs_31_cast_fp16)[name = string("inputs_sq_31_cast_fp16")];
+            tensor<int32, [1]> variance_31_axes_0 = const()[name = string("variance_31_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_31_keep_dims_0 = const()[name = string("variance_31_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 1, 1, 1]> variance_31_cast_fp16 = reduce_mean(axes = variance_31_axes_0, keep_dims = variance_31_keep_dims_0, x = inputs_sq_31_cast_fp16)[name = string("variance_31_cast_fp16")];
+            fp16 var_2069_to_fp16 = const()[name = string("op_2069_to_fp16"), val = fp16(0x1.5p-17)];
+            tensor<fp16, [1, 1, 1, 1]> var_2070_cast_fp16 = add(x = variance_31_cast_fp16, y = var_2069_to_fp16)[name = string("op_2070_cast_fp16")];
+            fp32 var_2071_epsilon_0 = const()[name = string("op_2071_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 1, 1, 1]> var_2071_cast_fp16 = rsqrt(epsilon = var_2071_epsilon_0, x = var_2070_cast_fp16)[name = string("op_2071_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 1]> hidden_states_31_cast_fp16 = mul(x = inputs_31_cast_fp16, y = var_2071_cast_fp16)[name = string("hidden_states_31_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 1]> w_31_to_fp16 = const()[name = string("w_31_to_fp16"), val = tensor<fp16, [1, 512, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(38651968)))];
+            tensor<fp16, [1, 512, 1, 1]> input_115_cast_fp16 = mul(x = w_31_to_fp16, y = hidden_states_31_cast_fp16)[name = string("input_115_cast_fp16")];
+            string input_117_pad_type_0 = const()[name = string("input_117_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> input_117_strides_0 = const()[name = string("input_117_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> input_117_pad_0 = const()[name = string("input_117_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> input_117_dilations_0 = const()[name = string("input_117_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 input_117_groups_0 = const()[name = string("input_117_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 512, 1, 1]> decoder_pre_transformer_layers_7_mlp_gate_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 512, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(38653056))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(39177408))))[name = string("decoder_pre_transformer_layers_7_mlp_gate_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> input_117_cast_fp16 = conv(dilations = input_117_dilations_0, groups = input_117_groups_0, pad = input_117_pad_0, pad_type = input_117_pad_type_0, strides = input_117_strides_0, weight = decoder_pre_transformer_layers_7_mlp_gate_proj_weight_to_fp16_palettized, x = input_115_cast_fp16)[name = string("input_117_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> var_2085_cast_fp16 = silu(x = input_117_cast_fp16)[name = string("op_2085_cast_fp16")];
+            string var_2091_pad_type_0 = const()[name = string("op_2091_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_2091_strides_0 = const()[name = string("op_2091_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_2091_pad_0 = const()[name = string("op_2091_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_2091_dilations_0 = const()[name = string("op_2091_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_2091_groups_0 = const()[name = string("op_2091_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 512, 1, 1]> decoder_pre_transformer_layers_7_mlp_up_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 512, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(39177984))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(39702336))))[name = string("decoder_pre_transformer_layers_7_mlp_up_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 1, 1]> var_2091_cast_fp16 = conv(dilations = var_2091_dilations_0, groups = var_2091_groups_0, pad = var_2091_pad_0, pad_type = var_2091_pad_type_0, strides = var_2091_strides_0, weight = decoder_pre_transformer_layers_7_mlp_up_proj_weight_to_fp16_palettized, x = input_115_cast_fp16)[name = string("op_2091_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> input_119_cast_fp16 = mul(x = var_2085_cast_fp16, y = var_2091_cast_fp16)[name = string("input_119_cast_fp16")];
+            string x_49_pad_type_0 = const()[name = string("x_49_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> x_49_strides_0 = const()[name = string("x_49_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> x_49_pad_0 = const()[name = string("x_49_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> x_49_dilations_0 = const()[name = string("x_49_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 x_49_groups_0 = const()[name = string("x_49_groups_0"), val = int32(1)];
+            tensor<fp16, [512, 1024, 1, 1]> op_2102_weight_0_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [512, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(39702912))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(40227264))))[name = string("op_2102_weight_0_to_fp16_palettized")];
+            tensor<fp16, [512]> var_2102_bias_0_to_fp16 = const()[name = string("op_2102_bias_0_to_fp16"), val = tensor<fp16, [512]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(40227840)))];
+            tensor<fp16, [1, 512, 1, 1]> var_2102_cast_fp16 = conv(bias = var_2102_bias_0_to_fp16, dilations = x_49_dilations_0, groups = x_49_groups_0, pad = x_49_pad_0, pad_type = x_49_pad_type_0, strides = x_49_strides_0, weight = op_2102_weight_0_to_fp16_palettized, x = input_119_cast_fp16)[name = string("op_2102_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 1]> inputs_cast_fp16 = add(x = inputs_31_cast_fp16, y = var_2102_cast_fp16)[name = string("inputs_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 1]> inputs_sq_cast_fp16 = mul(x = inputs_cast_fp16, y = inputs_cast_fp16)[name = string("inputs_sq_cast_fp16")];
+            tensor<int32, [1]> variance_axes_0 = const()[name = string("variance_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_keep_dims_0 = const()[name = string("variance_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 1, 1, 1]> variance_cast_fp16 = reduce_mean(axes = variance_axes_0, keep_dims = variance_keep_dims_0, x = inputs_sq_cast_fp16)[name = string("variance_cast_fp16")];
+            fp16 var_2112_to_fp16 = const()[name = string("op_2112_to_fp16"), val = fp16(0x1.5p-17)];
+            tensor<fp16, [1, 1, 1, 1]> var_2113_cast_fp16 = add(x = variance_cast_fp16, y = var_2112_to_fp16)[name = string("op_2113_cast_fp16")];
+            fp32 var_2114_epsilon_0 = const()[name = string("op_2114_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 1, 1, 1]> var_2114_cast_fp16 = rsqrt(epsilon = var_2114_epsilon_0, x = var_2113_cast_fp16)[name = string("op_2114_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 1]> hidden_states_cast_fp16 = mul(x = inputs_cast_fp16, y = var_2114_cast_fp16)[name = string("hidden_states_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 1]> w_to_fp16 = const()[name = string("w_to_fp16"), val = tensor<fp16, [1, 512, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(40228928)))];
+            tensor<fp16, [1, 512, 1, 1]> input_121_cast_fp16 = mul(x = w_to_fp16, y = hidden_states_cast_fp16)[name = string("input_121_cast_fp16")];
+            string new_hidden_pad_type_0 = const()[name = string("new_hidden_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> new_hidden_strides_0 = const()[name = string("new_hidden_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> new_hidden_pad_0 = const()[name = string("new_hidden_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> new_hidden_dilations_0 = const()[name = string("new_hidden_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 new_hidden_groups_0 = const()[name = string("new_hidden_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 512, 1, 1]> decoder_pre_transformer_output_proj_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 512, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(40230016))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(40754368))))[name = string("decoder_pre_transformer_output_proj_weight_to_fp16_palettized")];
+            tensor<fp16, [1024]> decoder_pre_transformer_output_proj_bias_to_fp16 = const()[name = string("decoder_pre_transformer_output_proj_bias_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(40754944)))];
+            tensor<fp16, [1, 1024, 1, 1]> hidden_context_update = conv(bias = decoder_pre_transformer_output_proj_bias_to_fp16, dilations = new_hidden_dilations_0, groups = new_hidden_groups_0, pad = new_hidden_pad_0, pad_type = new_hidden_pad_type_0, strides = new_hidden_strides_0, weight = decoder_pre_transformer_output_proj_weight_to_fp16_palettized, x = input_121_cast_fp16)[name = string("new_hidden_cast_fp16")];
+            bool var_2127_interleave_0 = const()[name = string("op_2127_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8192, 1, 1]> key_cache_updates = concat(axis = var_32, interleave = var_2127_interleave_0, values = (current_key_1_cast_fp16, current_key_3_cast_fp16, current_key_5_cast_fp16, current_key_7_cast_fp16, current_key_9_cast_fp16, current_key_11_cast_fp16, current_key_13_cast_fp16, current_key_cast_fp16))[name = string("op_2127_cast_fp16")];
+            bool var_2129_interleave_0 = const()[name = string("op_2129_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8192, 1, 1]> value_cache_updates = concat(axis = var_32, interleave = var_2129_interleave_0, values = (v_1_cast_fp16, v_3_cast_fp16, v_5_cast_fp16, v_7_cast_fp16, v_9_cast_fp16, v_11_cast_fp16, v_13_cast_fp16, v_cast_fp16))[name = string("op_2129_cast_fp16")];
+            bool x_51_interleave_0 = const()[name = string("x_51_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1024, 1, 5]> x_51_cast_fp16 = concat(axis = var_28, interleave = x_51_interleave_0, values = (hidden_context, hidden_context_update))[name = string("x_51_cast_fp16")];
+            string x_53_pad_type_0 = const()[name = string("x_53_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> x_53_strides_0 = const()[name = string("x_53_strides_0"), val = tensor<int32, [2]>([1, 2])];
+            tensor<int32, [4]> x_53_pad_0 = const()[name = string("x_53_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> x_53_dilations_0 = const()[name = string("x_53_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 x_53_groups_0 = const()[name = string("x_53_groups_0"), val = int32(1)];
+            tensor<int32, [4]> x_53_has_output_shape_output_shape_0 = const()[name = string("x_53_has_output_shape_output_shape_0"), val = tensor<int32, [4]>([1, 1024, 1, 10])];
+            tensor<fp16, [1024, 1024, 1, 2]> decoder_upsample_0_0_conv_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 1024, 1, 2]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(40757056))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(42854272))))[name = string("decoder_upsample_0_0_conv_weight_to_fp16_palettized")];
+            tensor<fp16, [1024]> decoder_upsample_0_0_conv_bias_to_fp16 = const()[name = string("decoder_upsample_0_0_conv_bias_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(42854848)))];
+            tensor<fp16, [1, 1024, 1, 10]> x_53_has_output_shape_cast_fp16 = conv_transpose(bias = decoder_upsample_0_0_conv_bias_to_fp16, dilations = x_53_dilations_0, groups = x_53_groups_0, output_shape = x_53_has_output_shape_output_shape_0, pad = x_53_pad_0, pad_type = x_53_pad_type_0, strides = x_53_strides_0, weight = decoder_upsample_0_0_conv_weight_to_fp16_palettized, x = x_51_cast_fp16)[name = string("x_53_has_output_shape_cast_fp16")];
+            tensor<int32, [8]> input_123_pad_0 = const()[name = string("input_123_pad_0"), val = tensor<int32, [8]>([0, 0, 0, 0, 0, 0, 6, 0])];
+            string input_123_mode_0 = const()[name = string("input_123_mode_0"), val = string("constant")];
+            fp16 const_86_to_fp16 = const()[name = string("const_86_to_fp16"), val = fp16(0x0p+0)];
+            tensor<fp16, [1, 1024, 1, 16]> input_123_cast_fp16 = pad(constant_val = const_86_to_fp16, mode = input_123_mode_0, pad = input_123_pad_0, x = x_53_has_output_shape_cast_fp16)[name = string("input_123_cast_fp16")];
+            string x_57_pad_type_0 = const()[name = string("x_57_pad_type_0"), val = string("valid")];
+            int32 x_57_groups_0 = const()[name = string("x_57_groups_0"), val = int32(1024)];
+            tensor<int32, [2]> x_57_strides_0 = const()[name = string("x_57_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> x_57_pad_0 = const()[name = string("x_57_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> x_57_dilations_0 = const()[name = string("x_57_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<fp16, [1024, 1, 1, 7]> decoder_upsample_0_1_dwconv_conv_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 1, 1, 7]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(42856960))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(42864192))))[name = string("decoder_upsample_0_1_dwconv_conv_weight_to_fp16_palettized")];
+            tensor<fp16, [1024]> decoder_upsample_0_1_dwconv_conv_bias_to_fp16 = const()[name = string("decoder_upsample_0_1_dwconv_conv_bias_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(42864768)))];
+            tensor<fp16, [1, 1024, 1, 10]> x_57_cast_fp16 = conv(bias = decoder_upsample_0_1_dwconv_conv_bias_to_fp16, dilations = x_57_dilations_0, groups = x_57_groups_0, pad = x_57_pad_0, pad_type = x_57_pad_type_0, strides = x_57_strides_0, weight = decoder_upsample_0_1_dwconv_conv_weight_to_fp16_palettized, x = input_123_cast_fp16)[name = string("x_57_cast_fp16")];
+            tensor<int32, [1]> var_2169_axes_0 = const()[name = string("op_2169_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 1024, 10]> var_2169_cast_fp16 = squeeze(axes = var_2169_axes_0, x = x_57_cast_fp16)[name = string("op_2169_cast_fp16")];
+            tensor<int32, [3]> var_2170 = const()[name = string("op_2170"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> x_59_axes_0 = const()[name = string("x_59_axes_0"), val = tensor<int32, [1]>([-1])];
+            tensor<fp16, [1024]> decoder_upsample_0_1_norm_weight_to_fp16 = const()[name = string("decoder_upsample_0_1_norm_weight_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(42866880)))];
+            tensor<fp16, [1024]> decoder_upsample_0_1_norm_bias_to_fp16 = const()[name = string("decoder_upsample_0_1_norm_bias_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(42868992)))];
+            fp16 var_17_to_fp16 = const()[name = string("op_17_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 10, 1024]> input_125_cast_fp16 = transpose(perm = var_2170, x = var_2169_cast_fp16)[name = string("transpose_3")];
+            tensor<fp16, [1, 10, 1024]> x_59_cast_fp16 = layer_norm(axes = x_59_axes_0, beta = decoder_upsample_0_1_norm_bias_to_fp16, epsilon = var_17_to_fp16, gamma = decoder_upsample_0_1_norm_weight_to_fp16, x = input_125_cast_fp16)[name = string("x_59_cast_fp16")];
+            tensor<int32, [3]> var_2176 = const()[name = string("op_2176"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_127_axes_0 = const()[name = string("input_127_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 1024, 10]> var_2177_cast_fp16 = transpose(perm = var_2176, x = x_59_cast_fp16)[name = string("transpose_2")];
+            tensor<fp16, [1, 1024, 1, 10]> input_127_cast_fp16 = expand_dims(axes = input_127_axes_0, x = var_2177_cast_fp16)[name = string("input_127_cast_fp16")];
+            string input_129_pad_type_0 = const()[name = string("input_129_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> input_129_strides_0 = const()[name = string("input_129_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> input_129_pad_0 = const()[name = string("input_129_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> input_129_dilations_0 = const()[name = string("input_129_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 input_129_groups_0 = const()[name = string("input_129_groups_0"), val = int32(1)];
+            tensor<fp16, [4096, 1024, 1, 1]> decoder_upsample_0_1_pwconv1_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [4096, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(42871104))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(47065472))))[name = string("decoder_upsample_0_1_pwconv1_weight_to_fp16_palettized")];
+            tensor<fp16, [4096]> decoder_upsample_0_1_pwconv1_bias_to_fp16 = const()[name = string("decoder_upsample_0_1_pwconv1_bias_to_fp16"), val = tensor<fp16, [4096]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(47066048)))];
+            tensor<fp16, [1, 4096, 1, 10]> input_129_cast_fp16 = conv(bias = decoder_upsample_0_1_pwconv1_bias_to_fp16, dilations = input_129_dilations_0, groups = input_129_groups_0, pad = input_129_pad_0, pad_type = input_129_pad_type_0, strides = input_129_strides_0, weight = decoder_upsample_0_1_pwconv1_weight_to_fp16_palettized, x = input_127_cast_fp16)[name = string("input_129_cast_fp16")];
+            string input_131_mode_0 = const()[name = string("input_131_mode_0"), val = string("EXACT")];
+            tensor<fp16, [1, 4096, 1, 10]> input_131_cast_fp16 = gelu(mode = input_131_mode_0, x = input_129_cast_fp16)[name = string("input_131_cast_fp16")];
+            string x_61_pad_type_0 = const()[name = string("x_61_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> x_61_strides_0 = const()[name = string("x_61_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> x_61_pad_0 = const()[name = string("x_61_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> x_61_dilations_0 = const()[name = string("x_61_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 x_61_groups_0 = const()[name = string("x_61_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 4096, 1, 1]> x_63_weight_0_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(47074304))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(51268672))))[name = string("x_63_weight_0_to_fp16_palettized")];
+            tensor<fp16, [1024]> x_63_bias_0_to_fp16 = const()[name = string("x_63_bias_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(51269248)))];
+            tensor<fp16, [1, 1024, 1, 10]> x_63_cast_fp16 = conv(bias = x_63_bias_0_to_fp16, dilations = x_61_dilations_0, groups = x_61_groups_0, pad = x_61_pad_0, pad_type = x_61_pad_type_0, strides = x_61_strides_0, weight = x_63_weight_0_to_fp16_palettized, x = input_131_cast_fp16)[name = string("x_63_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 10]> x_65_cast_fp16 = add(x = x_53_has_output_shape_cast_fp16, y = x_63_cast_fp16)[name = string("x_65_cast_fp16")];
+            string x_67_pad_type_0 = const()[name = string("x_67_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> x_67_strides_0 = const()[name = string("x_67_strides_0"), val = tensor<int32, [2]>([1, 2])];
+            tensor<int32, [4]> x_67_pad_0 = const()[name = string("x_67_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> x_67_dilations_0 = const()[name = string("x_67_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 x_67_groups_0 = const()[name = string("x_67_groups_0"), val = int32(1)];
+            tensor<int32, [4]> x_67_has_output_shape_output_shape_0 = const()[name = string("x_67_has_output_shape_output_shape_0"), val = tensor<int32, [4]>([1, 1024, 1, 20])];
+            tensor<fp16, [1024, 1024, 1, 2]> decoder_upsample_1_0_conv_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 1024, 1, 2]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(51271360))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(53368576))))[name = string("decoder_upsample_1_0_conv_weight_to_fp16_palettized")];
+            tensor<fp16, [1024]> decoder_upsample_1_0_conv_bias_to_fp16 = const()[name = string("decoder_upsample_1_0_conv_bias_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(53369152)))];
+            tensor<fp16, [1, 1024, 1, 20]> x_67_has_output_shape_cast_fp16 = conv_transpose(bias = decoder_upsample_1_0_conv_bias_to_fp16, dilations = x_67_dilations_0, groups = x_67_groups_0, output_shape = x_67_has_output_shape_output_shape_0, pad = x_67_pad_0, pad_type = x_67_pad_type_0, strides = x_67_strides_0, weight = decoder_upsample_1_0_conv_weight_to_fp16_palettized, x = x_65_cast_fp16)[name = string("x_67_has_output_shape_cast_fp16")];
+            tensor<int32, [8]> input_133_pad_0 = const()[name = string("input_133_pad_0"), val = tensor<int32, [8]>([0, 0, 0, 0, 0, 0, 6, 0])];
+            string input_133_mode_0 = const()[name = string("input_133_mode_0"), val = string("constant")];
+            fp16 const_88_to_fp16 = const()[name = string("const_88_to_fp16"), val = fp16(0x0p+0)];
+            tensor<fp16, [1, 1024, 1, 26]> input_133_cast_fp16 = pad(constant_val = const_88_to_fp16, mode = input_133_mode_0, pad = input_133_pad_0, x = x_67_has_output_shape_cast_fp16)[name = string("input_133_cast_fp16")];
+            string x_71_pad_type_0 = const()[name = string("x_71_pad_type_0"), val = string("valid")];
+            int32 x_71_groups_0 = const()[name = string("x_71_groups_0"), val = int32(1024)];
+            tensor<int32, [2]> x_71_strides_0 = const()[name = string("x_71_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> x_71_pad_0 = const()[name = string("x_71_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> x_71_dilations_0 = const()[name = string("x_71_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<fp16, [1024, 1, 1, 7]> decoder_upsample_1_1_dwconv_conv_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 1, 1, 7]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(53371264))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(53378496))))[name = string("decoder_upsample_1_1_dwconv_conv_weight_to_fp16_palettized")];
+            tensor<fp16, [1024]> decoder_upsample_1_1_dwconv_conv_bias_to_fp16 = const()[name = string("decoder_upsample_1_1_dwconv_conv_bias_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(53379072)))];
+            tensor<fp16, [1, 1024, 1, 20]> x_71_cast_fp16 = conv(bias = decoder_upsample_1_1_dwconv_conv_bias_to_fp16, dilations = x_71_dilations_0, groups = x_71_groups_0, pad = x_71_pad_0, pad_type = x_71_pad_type_0, strides = x_71_strides_0, weight = decoder_upsample_1_1_dwconv_conv_weight_to_fp16_palettized, x = input_133_cast_fp16)[name = string("x_71_cast_fp16")];
+            tensor<int32, [1]> var_2231_axes_0 = const()[name = string("op_2231_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 1024, 20]> var_2231_cast_fp16 = squeeze(axes = var_2231_axes_0, x = x_71_cast_fp16)[name = string("op_2231_cast_fp16")];
+            tensor<int32, [3]> var_2232 = const()[name = string("op_2232"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> x_73_axes_0 = const()[name = string("x_73_axes_0"), val = tensor<int32, [1]>([-1])];
+            tensor<fp16, [1024]> decoder_upsample_1_1_norm_weight_to_fp16 = const()[name = string("decoder_upsample_1_1_norm_weight_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(53381184)))];
+            tensor<fp16, [1024]> decoder_upsample_1_1_norm_bias_to_fp16 = const()[name = string("decoder_upsample_1_1_norm_bias_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(53383296)))];
+            tensor<fp16, [1, 20, 1024]> input_135_cast_fp16 = transpose(perm = var_2232, x = var_2231_cast_fp16)[name = string("transpose_1")];
+            tensor<fp16, [1, 20, 1024]> x_73_cast_fp16 = layer_norm(axes = x_73_axes_0, beta = decoder_upsample_1_1_norm_bias_to_fp16, epsilon = var_17_to_fp16, gamma = decoder_upsample_1_1_norm_weight_to_fp16, x = input_135_cast_fp16)[name = string("x_73_cast_fp16")];
+            tensor<int32, [3]> var_2238 = const()[name = string("op_2238"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_137_axes_0 = const()[name = string("input_137_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 1024, 20]> var_2239_cast_fp16 = transpose(perm = var_2238, x = x_73_cast_fp16)[name = string("transpose_0")];
+            tensor<fp16, [1, 1024, 1, 20]> input_137_cast_fp16 = expand_dims(axes = input_137_axes_0, x = var_2239_cast_fp16)[name = string("input_137_cast_fp16")];
+            string input_139_pad_type_0 = const()[name = string("input_139_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> input_139_strides_0 = const()[name = string("input_139_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> input_139_pad_0 = const()[name = string("input_139_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> input_139_dilations_0 = const()[name = string("input_139_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 input_139_groups_0 = const()[name = string("input_139_groups_0"), val = int32(1)];
+            tensor<fp16, [4096, 1024, 1, 1]> decoder_upsample_1_1_pwconv1_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [4096, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(53385408))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(57579776))))[name = string("decoder_upsample_1_1_pwconv1_weight_to_fp16_palettized")];
+            tensor<fp16, [4096]> decoder_upsample_1_1_pwconv1_bias_to_fp16 = const()[name = string("decoder_upsample_1_1_pwconv1_bias_to_fp16"), val = tensor<fp16, [4096]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(57580352)))];
+            tensor<fp16, [1, 4096, 1, 20]> input_139_cast_fp16 = conv(bias = decoder_upsample_1_1_pwconv1_bias_to_fp16, dilations = input_139_dilations_0, groups = input_139_groups_0, pad = input_139_pad_0, pad_type = input_139_pad_type_0, strides = input_139_strides_0, weight = decoder_upsample_1_1_pwconv1_weight_to_fp16_palettized, x = input_137_cast_fp16)[name = string("input_139_cast_fp16")];
+            string input_141_mode_0 = const()[name = string("input_141_mode_0"), val = string("EXACT")];
+            tensor<fp16, [1, 4096, 1, 20]> input_141_cast_fp16 = gelu(mode = input_141_mode_0, x = input_139_cast_fp16)[name = string("input_141_cast_fp16")];
+            string x_75_pad_type_0 = const()[name = string("x_75_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> x_75_strides_0 = const()[name = string("x_75_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> x_75_pad_0 = const()[name = string("x_75_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> x_75_dilations_0 = const()[name = string("x_75_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 x_75_groups_0 = const()[name = string("x_75_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 4096, 1, 1]> x_77_weight_0_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(57588608))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(61782976))))[name = string("x_77_weight_0_to_fp16_palettized")];
+            tensor<fp16, [1024]> x_77_bias_0_to_fp16 = const()[name = string("x_77_bias_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(61783552)))];
+            tensor<fp16, [1, 1024, 1, 20]> x_77_cast_fp16 = conv(bias = x_77_bias_0_to_fp16, dilations = x_75_dilations_0, groups = x_75_groups_0, pad = x_75_pad_0, pad_type = x_75_pad_type_0, strides = x_75_strides_0, weight = x_77_weight_0_to_fp16_palettized, x = input_141_cast_fp16)[name = string("x_77_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 20]> x_79_cast_fp16 = add(x = x_67_has_output_shape_cast_fp16, y = x_77_cast_fp16)[name = string("x_79_cast_fp16")];
+            tensor<int32, [8]> input_143_pad_0 = const()[name = string("input_143_pad_0"), val = tensor<int32, [8]>([0, 0, 0, 0, 0, 0, 6, 0])];
+            string input_143_mode_0 = const()[name = string("input_143_mode_0"), val = string("constant")];
+            fp16 const_89_to_fp16 = const()[name = string("const_89_to_fp16"), val = fp16(0x0p+0)];
+            tensor<fp16, [1, 1024, 1, 26]> input_143_cast_fp16 = pad(constant_val = const_89_to_fp16, mode = input_143_mode_0, pad = input_143_pad_0, x = x_79_cast_fp16)[name = string("input_143_cast_fp16")];
+            string x_81_pad_type_0 = const()[name = string("x_81_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> x_81_strides_0 = const()[name = string("x_81_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> x_81_pad_0 = const()[name = string("x_81_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> x_81_dilations_0 = const()[name = string("x_81_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 x_81_groups_0 = const()[name = string("x_81_groups_0"), val = int32(1)];
+            tensor<fp16, [1536, 1024, 1, 7]> decoder_decoder_0_conv_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1536, 1024, 1, 7]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(61785664))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(72795776))))[name = string("decoder_decoder_0_conv_weight_to_fp16_palettized")];
+            tensor<fp16, [1536]> decoder_decoder_0_conv_bias_to_fp16 = const()[name = string("decoder_decoder_0_conv_bias_to_fp16"), val = tensor<fp16, [1536]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(72796352)))];
+            tensor<fp16, [1, 1536, 1, 20]> x_81_cast_fp16 = conv(bias = decoder_decoder_0_conv_bias_to_fp16, dilations = x_81_dilations_0, groups = x_81_groups_0, pad = x_81_pad_0, pad_type = x_81_pad_type_0, strides = x_81_strides_0, weight = decoder_decoder_0_conv_weight_to_fp16_palettized, x = input_143_cast_fp16)[name = string("x_81_cast_fp16")];
+            tensor<fp16, [1, 1536, 1, 1]> alpha_1_to_fp16 = const()[name = string("alpha_1_to_fp16"), val = tensor<fp16, [1, 1536, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(72799488)))];
+            tensor<fp16, [1, 1536, 1, 20]> var_2288_cast_fp16 = mul(x = x_81_cast_fp16, y = alpha_1_to_fp16)[name = string("op_2288_cast_fp16")];
+            tensor<fp16, [1, 1536, 1, 20]> sin_val_1_cast_fp16 = sin(x = var_2288_cast_fp16)[name = string("sin_val_1_cast_fp16")];
+            tensor<fp16, [1, 1536, 1, 20]> var_2295_cast_fp16 = mul(x = sin_val_1_cast_fp16, y = sin_val_1_cast_fp16)[name = string("op_2295_cast_fp16")];
+            tensor<fp16, [1, 1536, 1, 1]> var_2292_to_fp16 = const()[name = string("op_2292_to_fp16"), val = tensor<fp16, [1, 1536, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(72802624)))];
+            tensor<fp16, [1, 1536, 1, 20]> var_2296_cast_fp16 = mul(x = var_2292_to_fp16, y = var_2295_cast_fp16)[name = string("op_2296_cast_fp16")];
+            tensor<fp16, [1, 1536, 1, 20]> x_83_cast_fp16 = add(x = x_81_cast_fp16, y = var_2296_cast_fp16)[name = string("x_83_cast_fp16")];
+            string x_85_pad_type_0 = const()[name = string("x_85_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> x_85_strides_0 = const()[name = string("x_85_strides_0"), val = tensor<int32, [2]>([1, 8])];
+            tensor<int32, [4]> x_85_pad_0 = const()[name = string("x_85_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> x_85_dilations_0 = const()[name = string("x_85_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 x_85_groups_0 = const()[name = string("x_85_groups_0"), val = int32(1)];
+            tensor<int32, [4]> x_85_has_output_shape_output_shape_0 = const()[name = string("x_85_has_output_shape_output_shape_0"), val = tensor<int32, [4]>([1, 768, 1, 168])];
+            tensor<fp16, [1536, 768, 1, 16]> decoder_decoder_1_block_1_conv_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1536, 768, 1, 16]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(72805760))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(91680192))))[name = string("decoder_decoder_1_block_1_conv_weight_to_fp16_palettized")];
+            tensor<fp16, [768]> decoder_decoder_1_block_1_conv_bias_to_fp16 = const()[name = string("decoder_decoder_1_block_1_conv_bias_to_fp16"), val = tensor<fp16, [768]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(91680768)))];
+            tensor<fp16, [1, 768, 1, 168]> x_85_has_output_shape_cast_fp16 = conv_transpose(bias = decoder_decoder_1_block_1_conv_bias_to_fp16, dilations = x_85_dilations_0, groups = x_85_groups_0, output_shape = x_85_has_output_shape_output_shape_0, pad = x_85_pad_0, pad_type = x_85_pad_type_0, strides = x_85_strides_0, weight = decoder_decoder_1_block_1_conv_weight_to_fp16_palettized, x = x_83_cast_fp16)[name = string("x_85_has_output_shape_cast_fp16")];
+            tensor<int32, [4]> x_87_begin_0 = const()[name = string("x_87_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 8])];
+            tensor<int32, [4]> x_87_end_0 = const()[name = string("x_87_end_0"), val = tensor<int32, [4]>([1, 768, 1, 160])];
+            tensor<bool, [4]> x_87_end_mask_0 = const()[name = string("x_87_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 768, 1, 152]> x_87_cast_fp16 = slice_by_index(begin = x_87_begin_0, end = x_87_end_0, end_mask = x_87_end_mask_0, x = x_85_has_output_shape_cast_fp16)[name = string("x_87_cast_fp16")];
+            tensor<fp16, [1, 768, 1, 1]> alpha_3_to_fp16 = const()[name = string("alpha_3_to_fp16"), val = tensor<fp16, [1, 768, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(91682368)))];
+            tensor<fp16, [1, 768, 1, 152]> var_2328_cast_fp16 = mul(x = x_87_cast_fp16, y = alpha_3_to_fp16)[name = string("op_2328_cast_fp16")];
+            tensor<fp16, [1, 768, 1, 152]> sin_val_3_cast_fp16 = sin(x = var_2328_cast_fp16)[name = string("sin_val_3_cast_fp16")];
+            tensor<fp16, [1, 768, 1, 152]> var_2335_cast_fp16 = mul(x = sin_val_3_cast_fp16, y = sin_val_3_cast_fp16)[name = string("op_2335_cast_fp16")];
+            tensor<fp16, [1, 768, 1, 1]> var_2332_to_fp16 = const()[name = string("op_2332_to_fp16"), val = tensor<fp16, [1, 768, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(91683968)))];
+            tensor<fp16, [1, 768, 1, 152]> var_2336_cast_fp16 = mul(x = var_2332_to_fp16, y = var_2335_cast_fp16)[name = string("op_2336_cast_fp16")];
+            tensor<fp16, [1, 768, 1, 152]> x_89_cast_fp16 = add(x = x_87_cast_fp16, y = var_2336_cast_fp16)[name = string("x_89_cast_fp16")];
+            tensor<int32, [8]> input_145_pad_0 = const()[name = string("input_145_pad_0"), val = tensor<int32, [8]>([0, 0, 0, 0, 0, 0, 6, 0])];
+            string input_145_mode_0 = const()[name = string("input_145_mode_0"), val = string("constant")];
+            fp16 const_91_to_fp16 = const()[name = string("const_91_to_fp16"), val = fp16(0x0p+0)];
+            tensor<fp16, [1, 768, 1, 158]> input_145_cast_fp16 = pad(constant_val = const_91_to_fp16, mode = input_145_mode_0, pad = input_145_pad_0, x = x_89_cast_fp16)[name = string("input_145_cast_fp16")];
+            string x_91_pad_type_0 = const()[name = string("x_91_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> x_91_strides_0 = const()[name = string("x_91_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> x_91_pad_0 = const()[name = string("x_91_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> x_91_dilations_0 = const()[name = string("x_91_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 x_91_groups_0 = const()[name = string("x_91_groups_0"), val = int32(1)];
+            tensor<fp16, [768, 768, 1, 7]> decoder_decoder_1_block_2_conv1_conv_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [768, 768, 1, 7]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(91685568))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(95814400))))[name = string("decoder_decoder_1_block_2_conv1_conv_weight_to_fp16_palettized")];
+            tensor<fp16, [768]> decoder_decoder_1_block_2_conv1_conv_bias_to_fp16 = const()[name = string("decoder_decoder_1_block_2_conv1_conv_bias_to_fp16"), val = tensor<fp16, [768]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(95814976)))];
+            tensor<fp16, [1, 768, 1, 152]> x_91_cast_fp16 = conv(bias = decoder_decoder_1_block_2_conv1_conv_bias_to_fp16, dilations = x_91_dilations_0, groups = x_91_groups_0, pad = x_91_pad_0, pad_type = x_91_pad_type_0, strides = x_91_strides_0, weight = decoder_decoder_1_block_2_conv1_conv_weight_to_fp16_palettized, x = input_145_cast_fp16)[name = string("x_91_cast_fp16")];
+            tensor<fp16, [1, 768, 1, 1]> alpha_5_to_fp16 = const()[name = string("alpha_5_to_fp16"), val = tensor<fp16, [1, 768, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(95816576)))];
+            tensor<fp16, [1, 768, 1, 152]> var_2356_cast_fp16 = mul(x = x_91_cast_fp16, y = alpha_5_to_fp16)[name = string("op_2356_cast_fp16")];
+            tensor<fp16, [1, 768, 1, 152]> sin_val_5_cast_fp16 = sin(x = var_2356_cast_fp16)[name = string("sin_val_5_cast_fp16")];
+            tensor<fp16, [1, 768, 1, 152]> var_2363_cast_fp16 = mul(x = sin_val_5_cast_fp16, y = sin_val_5_cast_fp16)[name = string("op_2363_cast_fp16")];
+            tensor<fp16, [1, 768, 1, 1]> var_2360_to_fp16 = const()[name = string("op_2360_to_fp16"), val = tensor<fp16, [1, 768, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(95818176)))];
+            tensor<fp16, [1, 768, 1, 152]> var_2364_cast_fp16 = mul(x = var_2360_to_fp16, y = var_2363_cast_fp16)[name = string("op_2364_cast_fp16")];
+            tensor<fp16, [1, 768, 1, 152]> x_93_cast_fp16 = add(x = x_91_cast_fp16, y = var_2364_cast_fp16)[name = string("x_93_cast_fp16")];
+            string x_95_pad_type_0 = const()[name = string("x_95_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> x_95_strides_0 = const()[name = string("x_95_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> x_95_pad_0 = const()[name = string("x_95_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> x_95_dilations_0 = const()[name = string("x_95_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 x_95_groups_0 = const()[name = string("x_95_groups_0"), val = int32(1)];
+            tensor<fp16, [768, 768, 1, 1]> decoder_decoder_1_block_2_conv2_conv_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [768, 768, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(95819776))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(96409664))))[name = string("decoder_decoder_1_block_2_conv2_conv_weight_to_fp16_palettized")];
+            tensor<fp16, [768]> decoder_decoder_1_block_2_conv2_conv_bias_to_fp16 = const()[name = string("decoder_decoder_1_block_2_conv2_conv_bias_to_fp16"), val = tensor<fp16, [768]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(96410240)))];
+            tensor<fp16, [1, 768, 1, 152]> x_95_cast_fp16 = conv(bias = decoder_decoder_1_block_2_conv2_conv_bias_to_fp16, dilations = x_95_dilations_0, groups = x_95_groups_0, pad = x_95_pad_0, pad_type = x_95_pad_type_0, strides = x_95_strides_0, weight = decoder_decoder_1_block_2_conv2_conv_weight_to_fp16_palettized, x = x_93_cast_fp16)[name = string("x_95_cast_fp16")];
+            tensor<fp16, [1, 768, 1, 152]> x_97_cast_fp16 = add(x = x_95_cast_fp16, y = x_87_cast_fp16)[name = string("x_97_cast_fp16")];
+            tensor<fp16, [1, 768, 1, 1]> alpha_7_to_fp16 = const()[name = string("alpha_7_to_fp16"), val = tensor<fp16, [1, 768, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(96411840)))];
+            tensor<fp16, [1, 768, 1, 152]> var_2389_cast_fp16 = mul(x = x_97_cast_fp16, y = alpha_7_to_fp16)[name = string("op_2389_cast_fp16")];
+            tensor<fp16, [1, 768, 1, 152]> sin_val_7_cast_fp16 = sin(x = var_2389_cast_fp16)[name = string("sin_val_7_cast_fp16")];
+            tensor<fp16, [1, 768, 1, 152]> var_2396_cast_fp16 = mul(x = sin_val_7_cast_fp16, y = sin_val_7_cast_fp16)[name = string("op_2396_cast_fp16")];
+            tensor<fp16, [1, 768, 1, 1]> var_2393_to_fp16 = const()[name = string("op_2393_to_fp16"), val = tensor<fp16, [1, 768, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(96413440)))];
+            tensor<fp16, [1, 768, 1, 152]> var_2397_cast_fp16 = mul(x = var_2393_to_fp16, y = var_2396_cast_fp16)[name = string("op_2397_cast_fp16")];
+            tensor<fp16, [1, 768, 1, 152]> x_99_cast_fp16 = add(x = x_97_cast_fp16, y = var_2397_cast_fp16)[name = string("x_99_cast_fp16")];
+            tensor<int32, [8]> input_149_pad_0 = const()[name = string("input_149_pad_0"), val = tensor<int32, [8]>([0, 0, 0, 0, 0, 0, 18, 0])];
+            string input_149_mode_0 = const()[name = string("input_149_mode_0"), val = string("constant")];
+            fp16 const_93_to_fp16 = const()[name = string("const_93_to_fp16"), val = fp16(0x0p+0)];
+            tensor<fp16, [1, 768, 1, 170]> input_149_cast_fp16 = pad(constant_val = const_93_to_fp16, mode = input_149_mode_0, pad = input_149_pad_0, x = x_99_cast_fp16)[name = string("input_149_cast_fp16")];
+            string x_101_pad_type_0 = const()[name = string("x_101_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> x_101_dilations_0 = const()[name = string("x_101_dilations_0"), val = tensor<int32, [2]>([1, 3])];
+            tensor<int32, [2]> x_101_strides_0 = const()[name = string("x_101_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> x_101_pad_0 = const()[name = string("x_101_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            int32 x_101_groups_0 = const()[name = string("x_101_groups_0"), val = int32(1)];
+            tensor<fp16, [768, 768, 1, 7]> decoder_decoder_1_block_3_conv1_conv_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [768, 768, 1, 7]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(96415040))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(100543872))))[name = string("decoder_decoder_1_block_3_conv1_conv_weight_to_fp16_palettized")];
+            tensor<fp16, [768]> decoder_decoder_1_block_3_conv1_conv_bias_to_fp16 = const()[name = string("decoder_decoder_1_block_3_conv1_conv_bias_to_fp16"), val = tensor<fp16, [768]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(100544448)))];
+            tensor<fp16, [1, 768, 1, 152]> x_101_cast_fp16 = conv(bias = decoder_decoder_1_block_3_conv1_conv_bias_to_fp16, dilations = x_101_dilations_0, groups = x_101_groups_0, pad = x_101_pad_0, pad_type = x_101_pad_type_0, strides = x_101_strides_0, weight = decoder_decoder_1_block_3_conv1_conv_weight_to_fp16_palettized, x = input_149_cast_fp16)[name = string("x_101_cast_fp16")];
+            tensor<fp16, [1, 768, 1, 1]> alpha_9_to_fp16 = const()[name = string("alpha_9_to_fp16"), val = tensor<fp16, [1, 768, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(100546048)))];
+            tensor<fp16, [1, 768, 1, 152]> var_2417_cast_fp16 = mul(x = x_101_cast_fp16, y = alpha_9_to_fp16)[name = string("op_2417_cast_fp16")];
+            tensor<fp16, [1, 768, 1, 152]> sin_val_9_cast_fp16 = sin(x = var_2417_cast_fp16)[name = string("sin_val_9_cast_fp16")];
+            tensor<fp16, [1, 768, 1, 152]> var_2424_cast_fp16 = mul(x = sin_val_9_cast_fp16, y = sin_val_9_cast_fp16)[name = string("op_2424_cast_fp16")];
+            tensor<fp16, [1, 768, 1, 1]> var_2421_to_fp16 = const()[name = string("op_2421_to_fp16"), val = tensor<fp16, [1, 768, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(100547648)))];
+            tensor<fp16, [1, 768, 1, 152]> var_2425_cast_fp16 = mul(x = var_2421_to_fp16, y = var_2424_cast_fp16)[name = string("op_2425_cast_fp16")];
+            tensor<fp16, [1, 768, 1, 152]> x_103_cast_fp16 = add(x = x_101_cast_fp16, y = var_2425_cast_fp16)[name = string("x_103_cast_fp16")];
+            string x_105_pad_type_0 = const()[name = string("x_105_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> x_105_strides_0 = const()[name = string("x_105_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> x_105_pad_0 = const()[name = string("x_105_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> x_105_dilations_0 = const()[name = string("x_105_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 x_105_groups_0 = const()[name = string("x_105_groups_0"), val = int32(1)];
+            tensor<fp16, [768, 768, 1, 1]> decoder_decoder_1_block_3_conv2_conv_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [768, 768, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(100549248))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(101139136))))[name = string("decoder_decoder_1_block_3_conv2_conv_weight_to_fp16_palettized")];
+            tensor<fp16, [768]> decoder_decoder_1_block_3_conv2_conv_bias_to_fp16 = const()[name = string("decoder_decoder_1_block_3_conv2_conv_bias_to_fp16"), val = tensor<fp16, [768]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(101139712)))];
+            tensor<fp16, [1, 768, 1, 152]> x_105_cast_fp16 = conv(bias = decoder_decoder_1_block_3_conv2_conv_bias_to_fp16, dilations = x_105_dilations_0, groups = x_105_groups_0, pad = x_105_pad_0, pad_type = x_105_pad_type_0, strides = x_105_strides_0, weight = decoder_decoder_1_block_3_conv2_conv_weight_to_fp16_palettized, x = x_103_cast_fp16)[name = string("x_105_cast_fp16")];
+            tensor<fp16, [1, 768, 1, 152]> x_107_cast_fp16 = add(x = x_105_cast_fp16, y = x_97_cast_fp16)[name = string("x_107_cast_fp16")];
+            tensor<fp16, [1, 768, 1, 1]> alpha_11_to_fp16 = const()[name = string("alpha_11_to_fp16"), val = tensor<fp16, [1, 768, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(101141312)))];
+            tensor<fp16, [1, 768, 1, 152]> var_2450_cast_fp16 = mul(x = x_107_cast_fp16, y = alpha_11_to_fp16)[name = string("op_2450_cast_fp16")];
+            tensor<fp16, [1, 768, 1, 152]> sin_val_11_cast_fp16 = sin(x = var_2450_cast_fp16)[name = string("sin_val_11_cast_fp16")];
+            tensor<fp16, [1, 768, 1, 152]> var_2457_cast_fp16 = mul(x = sin_val_11_cast_fp16, y = sin_val_11_cast_fp16)[name = string("op_2457_cast_fp16")];
+            tensor<fp16, [1, 768, 1, 1]> var_2454_to_fp16 = const()[name = string("op_2454_to_fp16"), val = tensor<fp16, [1, 768, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(101142912)))];
+            tensor<fp16, [1, 768, 1, 152]> var_2458_cast_fp16 = mul(x = var_2454_to_fp16, y = var_2457_cast_fp16)[name = string("op_2458_cast_fp16")];
+            tensor<fp16, [1, 768, 1, 152]> x_109_cast_fp16 = add(x = x_107_cast_fp16, y = var_2458_cast_fp16)[name = string("x_109_cast_fp16")];
+            tensor<int32, [8]> input_153_pad_0 = const()[name = string("input_153_pad_0"), val = tensor<int32, [8]>([0, 0, 0, 0, 0, 0, 54, 0])];
+            string input_153_mode_0 = const()[name = string("input_153_mode_0"), val = string("constant")];
+            fp16 const_95_to_fp16 = const()[name = string("const_95_to_fp16"), val = fp16(0x0p+0)];
+            tensor<fp16, [1, 768, 1, 206]> input_153_cast_fp16 = pad(constant_val = const_95_to_fp16, mode = input_153_mode_0, pad = input_153_pad_0, x = x_109_cast_fp16)[name = string("input_153_cast_fp16")];
+            string x_111_pad_type_0 = const()[name = string("x_111_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> x_111_dilations_0 = const()[name = string("x_111_dilations_0"), val = tensor<int32, [2]>([1, 9])];
+            tensor<int32, [2]> x_111_strides_0 = const()[name = string("x_111_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> x_111_pad_0 = const()[name = string("x_111_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            int32 x_111_groups_0 = const()[name = string("x_111_groups_0"), val = int32(1)];
+            tensor<fp16, [768, 768, 1, 7]> decoder_decoder_1_block_4_conv1_conv_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [768, 768, 1, 7]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(101144512))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(105273344))))[name = string("decoder_decoder_1_block_4_conv1_conv_weight_to_fp16_palettized")];
+            tensor<fp16, [768]> decoder_decoder_1_block_4_conv1_conv_bias_to_fp16 = const()[name = string("decoder_decoder_1_block_4_conv1_conv_bias_to_fp16"), val = tensor<fp16, [768]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(105273920)))];
+            tensor<fp16, [1, 768, 1, 152]> x_111_cast_fp16 = conv(bias = decoder_decoder_1_block_4_conv1_conv_bias_to_fp16, dilations = x_111_dilations_0, groups = x_111_groups_0, pad = x_111_pad_0, pad_type = x_111_pad_type_0, strides = x_111_strides_0, weight = decoder_decoder_1_block_4_conv1_conv_weight_to_fp16_palettized, x = input_153_cast_fp16)[name = string("x_111_cast_fp16")];
+            tensor<fp16, [1, 768, 1, 1]> alpha_13_to_fp16 = const()[name = string("alpha_13_to_fp16"), val = tensor<fp16, [1, 768, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(105275520)))];
+            tensor<fp16, [1, 768, 1, 152]> var_2478_cast_fp16 = mul(x = x_111_cast_fp16, y = alpha_13_to_fp16)[name = string("op_2478_cast_fp16")];
+            tensor<fp16, [1, 768, 1, 152]> sin_val_13_cast_fp16 = sin(x = var_2478_cast_fp16)[name = string("sin_val_13_cast_fp16")];
+            tensor<fp16, [1, 768, 1, 152]> var_2485_cast_fp16 = mul(x = sin_val_13_cast_fp16, y = sin_val_13_cast_fp16)[name = string("op_2485_cast_fp16")];
+            tensor<fp16, [1, 768, 1, 1]> var_2482_to_fp16 = const()[name = string("op_2482_to_fp16"), val = tensor<fp16, [1, 768, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(105277120)))];
+            tensor<fp16, [1, 768, 1, 152]> var_2486_cast_fp16 = mul(x = var_2482_to_fp16, y = var_2485_cast_fp16)[name = string("op_2486_cast_fp16")];
+            tensor<fp16, [1, 768, 1, 152]> x_113_cast_fp16 = add(x = x_111_cast_fp16, y = var_2486_cast_fp16)[name = string("x_113_cast_fp16")];
+            string x_115_pad_type_0 = const()[name = string("x_115_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> x_115_strides_0 = const()[name = string("x_115_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> x_115_pad_0 = const()[name = string("x_115_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> x_115_dilations_0 = const()[name = string("x_115_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 x_115_groups_0 = const()[name = string("x_115_groups_0"), val = int32(1)];
+            tensor<fp16, [768, 768, 1, 1]> decoder_decoder_1_block_4_conv2_conv_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [768, 768, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(105278720))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(105868608))))[name = string("decoder_decoder_1_block_4_conv2_conv_weight_to_fp16_palettized")];
+            tensor<fp16, [768]> decoder_decoder_1_block_4_conv2_conv_bias_to_fp16 = const()[name = string("decoder_decoder_1_block_4_conv2_conv_bias_to_fp16"), val = tensor<fp16, [768]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(105869184)))];
+            tensor<fp16, [1, 768, 1, 152]> x_115_cast_fp16 = conv(bias = decoder_decoder_1_block_4_conv2_conv_bias_to_fp16, dilations = x_115_dilations_0, groups = x_115_groups_0, pad = x_115_pad_0, pad_type = x_115_pad_type_0, strides = x_115_strides_0, weight = decoder_decoder_1_block_4_conv2_conv_weight_to_fp16_palettized, x = x_113_cast_fp16)[name = string("x_115_cast_fp16")];
+            tensor<fp16, [1, 768, 1, 152]> x_117_cast_fp16 = add(x = x_115_cast_fp16, y = x_107_cast_fp16)[name = string("x_117_cast_fp16")];
+            tensor<fp16, [1, 768, 1, 1]> alpha_15_to_fp16 = const()[name = string("alpha_15_to_fp16"), val = tensor<fp16, [1, 768, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(105870784)))];
+            tensor<fp16, [1, 768, 1, 152]> var_2517_cast_fp16 = mul(x = x_117_cast_fp16, y = alpha_15_to_fp16)[name = string("op_2517_cast_fp16")];
+            tensor<fp16, [1, 768, 1, 152]> sin_val_15_cast_fp16 = sin(x = var_2517_cast_fp16)[name = string("sin_val_15_cast_fp16")];
+            tensor<fp16, [1, 768, 1, 152]> var_2524_cast_fp16 = mul(x = sin_val_15_cast_fp16, y = sin_val_15_cast_fp16)[name = string("op_2524_cast_fp16")];
+            tensor<fp16, [1, 768, 1, 1]> var_2521_to_fp16 = const()[name = string("op_2521_to_fp16"), val = tensor<fp16, [1, 768, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(105872384)))];
+            tensor<fp16, [1, 768, 1, 152]> var_2525_cast_fp16 = mul(x = var_2521_to_fp16, y = var_2524_cast_fp16)[name = string("op_2525_cast_fp16")];
+            tensor<fp16, [1, 768, 1, 152]> x_119_cast_fp16 = add(x = x_117_cast_fp16, y = var_2525_cast_fp16)[name = string("x_119_cast_fp16")];
+            string x_121_pad_type_0 = const()[name = string("x_121_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> x_121_strides_0 = const()[name = string("x_121_strides_0"), val = tensor<int32, [2]>([1, 5])];
+            tensor<int32, [4]> x_121_pad_0 = const()[name = string("x_121_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> x_121_dilations_0 = const()[name = string("x_121_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 x_121_groups_0 = const()[name = string("x_121_groups_0"), val = int32(1)];
+            tensor<int32, [4]> x_121_has_output_shape_output_shape_0 = const()[name = string("x_121_has_output_shape_output_shape_0"), val = tensor<int32, [4]>([1, 384, 1, 765])];
+            tensor<fp16, [768, 384, 1, 10]> decoder_decoder_2_block_1_conv_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [768, 384, 1, 10]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(105873984))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(108823168))))[name = string("decoder_decoder_2_block_1_conv_weight_to_fp16_palettized")];
+            tensor<fp16, [384]> decoder_decoder_2_block_1_conv_bias_to_fp16 = const()[name = string("decoder_decoder_2_block_1_conv_bias_to_fp16"), val = tensor<fp16, [384]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(108823744)))];
+            tensor<fp16, [1, 384, 1, 765]> x_121_has_output_shape_cast_fp16 = conv_transpose(bias = decoder_decoder_2_block_1_conv_bias_to_fp16, dilations = x_121_dilations_0, groups = x_121_groups_0, output_shape = x_121_has_output_shape_output_shape_0, pad = x_121_pad_0, pad_type = x_121_pad_type_0, strides = x_121_strides_0, weight = decoder_decoder_2_block_1_conv_weight_to_fp16_palettized, x = x_119_cast_fp16)[name = string("x_121_has_output_shape_cast_fp16")];
+            tensor<int32, [4]> x_123_begin_0 = const()[name = string("x_123_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 5])];
+            tensor<int32, [4]> x_123_end_0 = const()[name = string("x_123_end_0"), val = tensor<int32, [4]>([1, 384, 1, 760])];
+            tensor<bool, [4]> x_123_end_mask_0 = const()[name = string("x_123_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 384, 1, 755]> x_123_cast_fp16 = slice_by_index(begin = x_123_begin_0, end = x_123_end_0, end_mask = x_123_end_mask_0, x = x_121_has_output_shape_cast_fp16)[name = string("x_123_cast_fp16")];
+            tensor<fp16, [1, 384, 1, 1]> alpha_17_to_fp16 = const()[name = string("alpha_17_to_fp16"), val = tensor<fp16, [1, 384, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(108824576)))];
+            tensor<fp16, [1, 384, 1, 755]> var_2557_cast_fp16 = mul(x = x_123_cast_fp16, y = alpha_17_to_fp16)[name = string("op_2557_cast_fp16")];
+            tensor<fp16, [1, 384, 1, 755]> sin_val_17_cast_fp16 = sin(x = var_2557_cast_fp16)[name = string("sin_val_17_cast_fp16")];
+            tensor<fp16, [1, 384, 1, 755]> var_2564_cast_fp16 = mul(x = sin_val_17_cast_fp16, y = sin_val_17_cast_fp16)[name = string("op_2564_cast_fp16")];
+            tensor<fp16, [1, 384, 1, 1]> var_2561_to_fp16 = const()[name = string("op_2561_to_fp16"), val = tensor<fp16, [1, 384, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(108825408)))];
+            tensor<fp16, [1, 384, 1, 755]> var_2565_cast_fp16 = mul(x = var_2561_to_fp16, y = var_2564_cast_fp16)[name = string("op_2565_cast_fp16")];
+            tensor<fp16, [1, 384, 1, 755]> x_125_cast_fp16 = add(x = x_123_cast_fp16, y = var_2565_cast_fp16)[name = string("x_125_cast_fp16")];
+            tensor<int32, [8]> input_157_pad_0 = const()[name = string("input_157_pad_0"), val = tensor<int32, [8]>([0, 0, 0, 0, 0, 0, 6, 0])];
+            string input_157_mode_0 = const()[name = string("input_157_mode_0"), val = string("constant")];
+            fp16 const_98_to_fp16 = const()[name = string("const_98_to_fp16"), val = fp16(0x0p+0)];
+            tensor<fp16, [1, 384, 1, 761]> input_157_cast_fp16 = pad(constant_val = const_98_to_fp16, mode = input_157_mode_0, pad = input_157_pad_0, x = x_125_cast_fp16)[name = string("input_157_cast_fp16")];
+            string x_127_pad_type_0 = const()[name = string("x_127_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> x_127_strides_0 = const()[name = string("x_127_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> x_127_pad_0 = const()[name = string("x_127_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> x_127_dilations_0 = const()[name = string("x_127_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 x_127_groups_0 = const()[name = string("x_127_groups_0"), val = int32(1)];
+            tensor<fp16, [384, 384, 1, 7]> decoder_decoder_2_block_2_conv1_conv_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [384, 384, 1, 7]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(108826240))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(109858496))))[name = string("decoder_decoder_2_block_2_conv1_conv_weight_to_fp16_palettized")];
+            tensor<fp16, [384]> decoder_decoder_2_block_2_conv1_conv_bias_to_fp16 = const()[name = string("decoder_decoder_2_block_2_conv1_conv_bias_to_fp16"), val = tensor<fp16, [384]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(109859072)))];
+            tensor<fp16, [1, 384, 1, 755]> x_127_cast_fp16 = conv(bias = decoder_decoder_2_block_2_conv1_conv_bias_to_fp16, dilations = x_127_dilations_0, groups = x_127_groups_0, pad = x_127_pad_0, pad_type = x_127_pad_type_0, strides = x_127_strides_0, weight = decoder_decoder_2_block_2_conv1_conv_weight_to_fp16_palettized, x = input_157_cast_fp16)[name = string("x_127_cast_fp16")];
+            tensor<fp16, [1, 384, 1, 1]> alpha_19_to_fp16 = const()[name = string("alpha_19_to_fp16"), val = tensor<fp16, [1, 384, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(109859904)))];
+            tensor<fp16, [1, 384, 1, 755]> var_2585_cast_fp16 = mul(x = x_127_cast_fp16, y = alpha_19_to_fp16)[name = string("op_2585_cast_fp16")];
+            tensor<fp16, [1, 384, 1, 755]> sin_val_19_cast_fp16 = sin(x = var_2585_cast_fp16)[name = string("sin_val_19_cast_fp16")];
+            tensor<fp16, [1, 384, 1, 755]> var_2592_cast_fp16 = mul(x = sin_val_19_cast_fp16, y = sin_val_19_cast_fp16)[name = string("op_2592_cast_fp16")];
+            tensor<fp16, [1, 384, 1, 1]> var_2589_to_fp16 = const()[name = string("op_2589_to_fp16"), val = tensor<fp16, [1, 384, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(109860736)))];
+            tensor<fp16, [1, 384, 1, 755]> var_2593_cast_fp16 = mul(x = var_2589_to_fp16, y = var_2592_cast_fp16)[name = string("op_2593_cast_fp16")];
+            tensor<fp16, [1, 384, 1, 755]> x_129_cast_fp16 = add(x = x_127_cast_fp16, y = var_2593_cast_fp16)[name = string("x_129_cast_fp16")];
+            string x_131_pad_type_0 = const()[name = string("x_131_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> x_131_strides_0 = const()[name = string("x_131_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> x_131_pad_0 = const()[name = string("x_131_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> x_131_dilations_0 = const()[name = string("x_131_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 x_131_groups_0 = const()[name = string("x_131_groups_0"), val = int32(1)];
+            tensor<fp16, [384, 384, 1, 1]> decoder_decoder_2_block_2_conv2_conv_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [384, 384, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(109861568))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(110009088))))[name = string("decoder_decoder_2_block_2_conv2_conv_weight_to_fp16_palettized")];
+            tensor<fp16, [384]> decoder_decoder_2_block_2_conv2_conv_bias_to_fp16 = const()[name = string("decoder_decoder_2_block_2_conv2_conv_bias_to_fp16"), val = tensor<fp16, [384]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(110009664)))];
+            tensor<fp16, [1, 384, 1, 755]> x_131_cast_fp16 = conv(bias = decoder_decoder_2_block_2_conv2_conv_bias_to_fp16, dilations = x_131_dilations_0, groups = x_131_groups_0, pad = x_131_pad_0, pad_type = x_131_pad_type_0, strides = x_131_strides_0, weight = decoder_decoder_2_block_2_conv2_conv_weight_to_fp16_palettized, x = x_129_cast_fp16)[name = string("x_131_cast_fp16")];
+            tensor<fp16, [1, 384, 1, 755]> x_133_cast_fp16 = add(x = x_131_cast_fp16, y = x_123_cast_fp16)[name = string("x_133_cast_fp16")];
+            tensor<fp16, [1, 384, 1, 1]> alpha_21_to_fp16 = const()[name = string("alpha_21_to_fp16"), val = tensor<fp16, [1, 384, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(110010496)))];
+            tensor<fp16, [1, 384, 1, 755]> var_2618_cast_fp16 = mul(x = x_133_cast_fp16, y = alpha_21_to_fp16)[name = string("op_2618_cast_fp16")];
+            tensor<fp16, [1, 384, 1, 755]> sin_val_21_cast_fp16 = sin(x = var_2618_cast_fp16)[name = string("sin_val_21_cast_fp16")];
+            tensor<fp16, [1, 384, 1, 755]> var_2625_cast_fp16 = mul(x = sin_val_21_cast_fp16, y = sin_val_21_cast_fp16)[name = string("op_2625_cast_fp16")];
+            tensor<fp16, [1, 384, 1, 1]> var_2622_to_fp16 = const()[name = string("op_2622_to_fp16"), val = tensor<fp16, [1, 384, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(110011328)))];
+            tensor<fp16, [1, 384, 1, 755]> var_2626_cast_fp16 = mul(x = var_2622_to_fp16, y = var_2625_cast_fp16)[name = string("op_2626_cast_fp16")];
+            tensor<fp16, [1, 384, 1, 755]> x_135_cast_fp16 = add(x = x_133_cast_fp16, y = var_2626_cast_fp16)[name = string("x_135_cast_fp16")];
+            tensor<int32, [8]> input_161_pad_0 = const()[name = string("input_161_pad_0"), val = tensor<int32, [8]>([0, 0, 0, 0, 0, 0, 18, 0])];
+            string input_161_mode_0 = const()[name = string("input_161_mode_0"), val = string("constant")];
+            fp16 const_100_to_fp16 = const()[name = string("const_100_to_fp16"), val = fp16(0x0p+0)];
+            tensor<fp16, [1, 384, 1, 773]> input_161_cast_fp16 = pad(constant_val = const_100_to_fp16, mode = input_161_mode_0, pad = input_161_pad_0, x = x_135_cast_fp16)[name = string("input_161_cast_fp16")];
+            string x_137_pad_type_0 = const()[name = string("x_137_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> x_137_dilations_0 = const()[name = string("x_137_dilations_0"), val = tensor<int32, [2]>([1, 3])];
+            tensor<int32, [2]> x_137_strides_0 = const()[name = string("x_137_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> x_137_pad_0 = const()[name = string("x_137_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            int32 x_137_groups_0 = const()[name = string("x_137_groups_0"), val = int32(1)];
+            tensor<fp16, [384, 384, 1, 7]> decoder_decoder_2_block_3_conv1_conv_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [384, 384, 1, 7]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(110012160))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(111044416))))[name = string("decoder_decoder_2_block_3_conv1_conv_weight_to_fp16_palettized")];
+            tensor<fp16, [384]> decoder_decoder_2_block_3_conv1_conv_bias_to_fp16 = const()[name = string("decoder_decoder_2_block_3_conv1_conv_bias_to_fp16"), val = tensor<fp16, [384]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(111044992)))];
+            tensor<fp16, [1, 384, 1, 755]> x_137_cast_fp16 = conv(bias = decoder_decoder_2_block_3_conv1_conv_bias_to_fp16, dilations = x_137_dilations_0, groups = x_137_groups_0, pad = x_137_pad_0, pad_type = x_137_pad_type_0, strides = x_137_strides_0, weight = decoder_decoder_2_block_3_conv1_conv_weight_to_fp16_palettized, x = input_161_cast_fp16)[name = string("x_137_cast_fp16")];
+            tensor<fp16, [1, 384, 1, 1]> alpha_23_to_fp16 = const()[name = string("alpha_23_to_fp16"), val = tensor<fp16, [1, 384, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(111045824)))];
+            tensor<fp16, [1, 384, 1, 755]> var_2646_cast_fp16 = mul(x = x_137_cast_fp16, y = alpha_23_to_fp16)[name = string("op_2646_cast_fp16")];
+            tensor<fp16, [1, 384, 1, 755]> sin_val_23_cast_fp16 = sin(x = var_2646_cast_fp16)[name = string("sin_val_23_cast_fp16")];
+            tensor<fp16, [1, 384, 1, 755]> var_2653_cast_fp16 = mul(x = sin_val_23_cast_fp16, y = sin_val_23_cast_fp16)[name = string("op_2653_cast_fp16")];
+            tensor<fp16, [1, 384, 1, 1]> var_2650_to_fp16 = const()[name = string("op_2650_to_fp16"), val = tensor<fp16, [1, 384, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(111046656)))];
+            tensor<fp16, [1, 384, 1, 755]> var_2654_cast_fp16 = mul(x = var_2650_to_fp16, y = var_2653_cast_fp16)[name = string("op_2654_cast_fp16")];
+            tensor<fp16, [1, 384, 1, 755]> x_139_cast_fp16 = add(x = x_137_cast_fp16, y = var_2654_cast_fp16)[name = string("x_139_cast_fp16")];
+            string x_141_pad_type_0 = const()[name = string("x_141_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> x_141_strides_0 = const()[name = string("x_141_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> x_141_pad_0 = const()[name = string("x_141_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> x_141_dilations_0 = const()[name = string("x_141_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 x_141_groups_0 = const()[name = string("x_141_groups_0"), val = int32(1)];
+            tensor<fp16, [384, 384, 1, 1]> decoder_decoder_2_block_3_conv2_conv_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [384, 384, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(111047488))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(111195008))))[name = string("decoder_decoder_2_block_3_conv2_conv_weight_to_fp16_palettized")];
+            tensor<fp16, [384]> decoder_decoder_2_block_3_conv2_conv_bias_to_fp16 = const()[name = string("decoder_decoder_2_block_3_conv2_conv_bias_to_fp16"), val = tensor<fp16, [384]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(111195584)))];
+            tensor<fp16, [1, 384, 1, 755]> x_141_cast_fp16 = conv(bias = decoder_decoder_2_block_3_conv2_conv_bias_to_fp16, dilations = x_141_dilations_0, groups = x_141_groups_0, pad = x_141_pad_0, pad_type = x_141_pad_type_0, strides = x_141_strides_0, weight = decoder_decoder_2_block_3_conv2_conv_weight_to_fp16_palettized, x = x_139_cast_fp16)[name = string("x_141_cast_fp16")];
+            tensor<fp16, [1, 384, 1, 755]> x_143_cast_fp16 = add(x = x_141_cast_fp16, y = x_133_cast_fp16)[name = string("x_143_cast_fp16")];
+            tensor<fp16, [1, 384, 1, 1]> alpha_25_to_fp16 = const()[name = string("alpha_25_to_fp16"), val = tensor<fp16, [1, 384, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(111196416)))];
+            tensor<fp16, [1, 384, 1, 755]> var_2679_cast_fp16 = mul(x = x_143_cast_fp16, y = alpha_25_to_fp16)[name = string("op_2679_cast_fp16")];
+            tensor<fp16, [1, 384, 1, 755]> sin_val_25_cast_fp16 = sin(x = var_2679_cast_fp16)[name = string("sin_val_25_cast_fp16")];
+            tensor<fp16, [1, 384, 1, 755]> var_2686_cast_fp16 = mul(x = sin_val_25_cast_fp16, y = sin_val_25_cast_fp16)[name = string("op_2686_cast_fp16")];
+            tensor<fp16, [1, 384, 1, 1]> var_2683_to_fp16 = const()[name = string("op_2683_to_fp16"), val = tensor<fp16, [1, 384, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(111197248)))];
+            tensor<fp16, [1, 384, 1, 755]> var_2687_cast_fp16 = mul(x = var_2683_to_fp16, y = var_2686_cast_fp16)[name = string("op_2687_cast_fp16")];
+            tensor<fp16, [1, 384, 1, 755]> x_145_cast_fp16 = add(x = x_143_cast_fp16, y = var_2687_cast_fp16)[name = string("x_145_cast_fp16")];
+            tensor<int32, [8]> input_165_pad_0 = const()[name = string("input_165_pad_0"), val = tensor<int32, [8]>([0, 0, 0, 0, 0, 0, 54, 0])];
+            string input_165_mode_0 = const()[name = string("input_165_mode_0"), val = string("constant")];
+            fp16 const_102_to_fp16 = const()[name = string("const_102_to_fp16"), val = fp16(0x0p+0)];
+            tensor<fp16, [1, 384, 1, 809]> input_165_cast_fp16 = pad(constant_val = const_102_to_fp16, mode = input_165_mode_0, pad = input_165_pad_0, x = x_145_cast_fp16)[name = string("input_165_cast_fp16")];
+            string x_147_pad_type_0 = const()[name = string("x_147_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> x_147_dilations_0 = const()[name = string("x_147_dilations_0"), val = tensor<int32, [2]>([1, 9])];
+            tensor<int32, [2]> x_147_strides_0 = const()[name = string("x_147_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> x_147_pad_0 = const()[name = string("x_147_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            int32 x_147_groups_0 = const()[name = string("x_147_groups_0"), val = int32(1)];
+            tensor<fp16, [384, 384, 1, 7]> decoder_decoder_2_block_4_conv1_conv_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [384, 384, 1, 7]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(111198080))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(112230336))))[name = string("decoder_decoder_2_block_4_conv1_conv_weight_to_fp16_palettized")];
+            tensor<fp16, [384]> decoder_decoder_2_block_4_conv1_conv_bias_to_fp16 = const()[name = string("decoder_decoder_2_block_4_conv1_conv_bias_to_fp16"), val = tensor<fp16, [384]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(112230912)))];
+            tensor<fp16, [1, 384, 1, 755]> x_147_cast_fp16 = conv(bias = decoder_decoder_2_block_4_conv1_conv_bias_to_fp16, dilations = x_147_dilations_0, groups = x_147_groups_0, pad = x_147_pad_0, pad_type = x_147_pad_type_0, strides = x_147_strides_0, weight = decoder_decoder_2_block_4_conv1_conv_weight_to_fp16_palettized, x = input_165_cast_fp16)[name = string("x_147_cast_fp16")];
+            tensor<fp16, [1, 384, 1, 1]> alpha_27_to_fp16 = const()[name = string("alpha_27_to_fp16"), val = tensor<fp16, [1, 384, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(112231744)))];
+            tensor<fp16, [1, 384, 1, 755]> var_2707_cast_fp16 = mul(x = x_147_cast_fp16, y = alpha_27_to_fp16)[name = string("op_2707_cast_fp16")];
+            tensor<fp16, [1, 384, 1, 755]> sin_val_27_cast_fp16 = sin(x = var_2707_cast_fp16)[name = string("sin_val_27_cast_fp16")];
+            tensor<fp16, [1, 384, 1, 755]> var_2714_cast_fp16 = mul(x = sin_val_27_cast_fp16, y = sin_val_27_cast_fp16)[name = string("op_2714_cast_fp16")];
+            tensor<fp16, [1, 384, 1, 1]> var_2711_to_fp16 = const()[name = string("op_2711_to_fp16"), val = tensor<fp16, [1, 384, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(112232576)))];
+            tensor<fp16, [1, 384, 1, 755]> var_2715_cast_fp16 = mul(x = var_2711_to_fp16, y = var_2714_cast_fp16)[name = string("op_2715_cast_fp16")];
+            tensor<fp16, [1, 384, 1, 755]> x_149_cast_fp16 = add(x = x_147_cast_fp16, y = var_2715_cast_fp16)[name = string("x_149_cast_fp16")];
+            string x_151_pad_type_0 = const()[name = string("x_151_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> x_151_strides_0 = const()[name = string("x_151_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> x_151_pad_0 = const()[name = string("x_151_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> x_151_dilations_0 = const()[name = string("x_151_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 x_151_groups_0 = const()[name = string("x_151_groups_0"), val = int32(1)];
+            tensor<fp16, [384, 384, 1, 1]> decoder_decoder_2_block_4_conv2_conv_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [384, 384, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(112233408))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(112380928))))[name = string("decoder_decoder_2_block_4_conv2_conv_weight_to_fp16_palettized")];
+            tensor<fp16, [384]> decoder_decoder_2_block_4_conv2_conv_bias_to_fp16 = const()[name = string("decoder_decoder_2_block_4_conv2_conv_bias_to_fp16"), val = tensor<fp16, [384]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(112381504)))];
+            tensor<fp16, [1, 384, 1, 755]> x_151_cast_fp16 = conv(bias = decoder_decoder_2_block_4_conv2_conv_bias_to_fp16, dilations = x_151_dilations_0, groups = x_151_groups_0, pad = x_151_pad_0, pad_type = x_151_pad_type_0, strides = x_151_strides_0, weight = decoder_decoder_2_block_4_conv2_conv_weight_to_fp16_palettized, x = x_149_cast_fp16)[name = string("x_151_cast_fp16")];
+            tensor<fp16, [1, 384, 1, 755]> x_153_cast_fp16 = add(x = x_151_cast_fp16, y = x_143_cast_fp16)[name = string("x_153_cast_fp16")];
+            tensor<fp16, [1, 384, 1, 1]> alpha_29_to_fp16 = const()[name = string("alpha_29_to_fp16"), val = tensor<fp16, [1, 384, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(112382336)))];
+            tensor<fp16, [1, 384, 1, 755]> var_2746_cast_fp16 = mul(x = x_153_cast_fp16, y = alpha_29_to_fp16)[name = string("op_2746_cast_fp16")];
+            tensor<fp16, [1, 384, 1, 755]> sin_val_29_cast_fp16 = sin(x = var_2746_cast_fp16)[name = string("sin_val_29_cast_fp16")];
+            tensor<fp16, [1, 384, 1, 755]> var_2753_cast_fp16 = mul(x = sin_val_29_cast_fp16, y = sin_val_29_cast_fp16)[name = string("op_2753_cast_fp16")];
+            tensor<fp16, [1, 384, 1, 1]> var_2750_to_fp16 = const()[name = string("op_2750_to_fp16"), val = tensor<fp16, [1, 384, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(112383168)))];
+            tensor<fp16, [1, 384, 1, 755]> var_2754_cast_fp16 = mul(x = var_2750_to_fp16, y = var_2753_cast_fp16)[name = string("op_2754_cast_fp16")];
+            tensor<fp16, [1, 384, 1, 755]> x_155_cast_fp16 = add(x = x_153_cast_fp16, y = var_2754_cast_fp16)[name = string("x_155_cast_fp16")];
+            string x_157_pad_type_0 = const()[name = string("x_157_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> x_157_strides_0 = const()[name = string("x_157_strides_0"), val = tensor<int32, [2]>([1, 4])];
+            tensor<int32, [4]> x_157_pad_0 = const()[name = string("x_157_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> x_157_dilations_0 = const()[name = string("x_157_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 x_157_groups_0 = const()[name = string("x_157_groups_0"), val = int32(1)];
+            tensor<int32, [4]> x_157_has_output_shape_output_shape_0 = const()[name = string("x_157_has_output_shape_output_shape_0"), val = tensor<int32, [4]>([1, 192, 1, 3024])];
+            tensor<fp16, [384, 192, 1, 8]> decoder_decoder_3_block_1_conv_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [384, 192, 1, 8]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(112384000))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(112973888))))[name = string("decoder_decoder_3_block_1_conv_weight_to_fp16_palettized")];
+            tensor<fp16, [192]> decoder_decoder_3_block_1_conv_bias_to_fp16 = const()[name = string("decoder_decoder_3_block_1_conv_bias_to_fp16"), val = tensor<fp16, [192]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(112974464)))];
+            tensor<fp16, [1, 192, 1, 3024]> x_157_has_output_shape_cast_fp16 = conv_transpose(bias = decoder_decoder_3_block_1_conv_bias_to_fp16, dilations = x_157_dilations_0, groups = x_157_groups_0, output_shape = x_157_has_output_shape_output_shape_0, pad = x_157_pad_0, pad_type = x_157_pad_type_0, strides = x_157_strides_0, weight = decoder_decoder_3_block_1_conv_weight_to_fp16_palettized, x = x_155_cast_fp16)[name = string("x_157_has_output_shape_cast_fp16")];
+            tensor<int32, [4]> x_159_begin_0 = const()[name = string("x_159_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 4])];
+            tensor<int32, [4]> x_159_end_0 = const()[name = string("x_159_end_0"), val = tensor<int32, [4]>([1, 192, 1, 3020])];
+            tensor<bool, [4]> x_159_end_mask_0 = const()[name = string("x_159_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 192, 1, 3016]> x_159_cast_fp16 = slice_by_index(begin = x_159_begin_0, end = x_159_end_0, end_mask = x_159_end_mask_0, x = x_157_has_output_shape_cast_fp16)[name = string("x_159_cast_fp16")];
+            tensor<fp16, [1, 192, 1, 1]> alpha_31_to_fp16 = const()[name = string("alpha_31_to_fp16"), val = tensor<fp16, [1, 192, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(112974912)))];
+            tensor<fp16, [1, 192, 1, 3016]> var_2786_cast_fp16 = mul(x = x_159_cast_fp16, y = alpha_31_to_fp16)[name = string("op_2786_cast_fp16")];
+            tensor<fp16, [1, 192, 1, 3016]> sin_val_31_cast_fp16 = sin(x = var_2786_cast_fp16)[name = string("sin_val_31_cast_fp16")];
+            tensor<fp16, [1, 192, 1, 3016]> var_2793_cast_fp16 = mul(x = sin_val_31_cast_fp16, y = sin_val_31_cast_fp16)[name = string("op_2793_cast_fp16")];
+            tensor<fp16, [1, 192, 1, 1]> var_2790_to_fp16 = const()[name = string("op_2790_to_fp16"), val = tensor<fp16, [1, 192, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(112975360)))];
+            tensor<fp16, [1, 192, 1, 3016]> var_2794_cast_fp16 = mul(x = var_2790_to_fp16, y = var_2793_cast_fp16)[name = string("op_2794_cast_fp16")];
+            tensor<fp16, [1, 192, 1, 3016]> x_161_cast_fp16 = add(x = x_159_cast_fp16, y = var_2794_cast_fp16)[name = string("x_161_cast_fp16")];
+            tensor<int32, [8]> input_169_pad_0 = const()[name = string("input_169_pad_0"), val = tensor<int32, [8]>([0, 0, 0, 0, 0, 0, 6, 0])];
+            string input_169_mode_0 = const()[name = string("input_169_mode_0"), val = string("constant")];
+            fp16 const_105_to_fp16 = const()[name = string("const_105_to_fp16"), val = fp16(0x0p+0)];
+            tensor<fp16, [1, 192, 1, 3022]> input_169_cast_fp16 = pad(constant_val = const_105_to_fp16, mode = input_169_mode_0, pad = input_169_pad_0, x = x_161_cast_fp16)[name = string("input_169_cast_fp16")];
+            string x_163_pad_type_0 = const()[name = string("x_163_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> x_163_strides_0 = const()[name = string("x_163_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> x_163_pad_0 = const()[name = string("x_163_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> x_163_dilations_0 = const()[name = string("x_163_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 x_163_groups_0 = const()[name = string("x_163_groups_0"), val = int32(1)];
+            tensor<fp16, [192, 192, 1, 7]> decoder_decoder_3_block_2_conv1_conv_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [192, 192, 1, 7]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(112975808))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(113233920))))[name = string("decoder_decoder_3_block_2_conv1_conv_weight_to_fp16_palettized")];
+            tensor<fp16, [192]> decoder_decoder_3_block_2_conv1_conv_bias_to_fp16 = const()[name = string("decoder_decoder_3_block_2_conv1_conv_bias_to_fp16"), val = tensor<fp16, [192]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(113234496)))];
+            tensor<fp16, [1, 192, 1, 3016]> x_163_cast_fp16 = conv(bias = decoder_decoder_3_block_2_conv1_conv_bias_to_fp16, dilations = x_163_dilations_0, groups = x_163_groups_0, pad = x_163_pad_0, pad_type = x_163_pad_type_0, strides = x_163_strides_0, weight = decoder_decoder_3_block_2_conv1_conv_weight_to_fp16_palettized, x = input_169_cast_fp16)[name = string("x_163_cast_fp16")];
+            tensor<fp16, [1, 192, 1, 1]> alpha_33_to_fp16 = const()[name = string("alpha_33_to_fp16"), val = tensor<fp16, [1, 192, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(113234944)))];
+            tensor<fp16, [1, 192, 1, 3016]> var_2814_cast_fp16 = mul(x = x_163_cast_fp16, y = alpha_33_to_fp16)[name = string("op_2814_cast_fp16")];
+            tensor<fp16, [1, 192, 1, 3016]> sin_val_33_cast_fp16 = sin(x = var_2814_cast_fp16)[name = string("sin_val_33_cast_fp16")];
+            tensor<fp16, [1, 192, 1, 3016]> var_2821_cast_fp16 = mul(x = sin_val_33_cast_fp16, y = sin_val_33_cast_fp16)[name = string("op_2821_cast_fp16")];
+            tensor<fp16, [1, 192, 1, 1]> var_2818_to_fp16 = const()[name = string("op_2818_to_fp16"), val = tensor<fp16, [1, 192, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(113235392)))];
+            tensor<fp16, [1, 192, 1, 3016]> var_2822_cast_fp16 = mul(x = var_2818_to_fp16, y = var_2821_cast_fp16)[name = string("op_2822_cast_fp16")];
+            tensor<fp16, [1, 192, 1, 3016]> x_165_cast_fp16 = add(x = x_163_cast_fp16, y = var_2822_cast_fp16)[name = string("x_165_cast_fp16")];
+            string x_167_pad_type_0 = const()[name = string("x_167_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> x_167_strides_0 = const()[name = string("x_167_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> x_167_pad_0 = const()[name = string("x_167_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> x_167_dilations_0 = const()[name = string("x_167_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 x_167_groups_0 = const()[name = string("x_167_groups_0"), val = int32(1)];
+            tensor<fp16, [192, 192, 1, 1]> decoder_decoder_3_block_2_conv2_conv_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [192, 192, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(113235840))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(113272768))))[name = string("decoder_decoder_3_block_2_conv2_conv_weight_to_fp16_palettized")];
+            tensor<fp16, [192]> decoder_decoder_3_block_2_conv2_conv_bias_to_fp16 = const()[name = string("decoder_decoder_3_block_2_conv2_conv_bias_to_fp16"), val = tensor<fp16, [192]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(113273344)))];
+            tensor<fp16, [1, 192, 1, 3016]> x_167_cast_fp16 = conv(bias = decoder_decoder_3_block_2_conv2_conv_bias_to_fp16, dilations = x_167_dilations_0, groups = x_167_groups_0, pad = x_167_pad_0, pad_type = x_167_pad_type_0, strides = x_167_strides_0, weight = decoder_decoder_3_block_2_conv2_conv_weight_to_fp16_palettized, x = x_165_cast_fp16)[name = string("x_167_cast_fp16")];
+            tensor<fp16, [1, 192, 1, 3016]> x_169_cast_fp16 = add(x = x_167_cast_fp16, y = x_159_cast_fp16)[name = string("x_169_cast_fp16")];
+            tensor<fp16, [1, 192, 1, 1]> alpha_35_to_fp16 = const()[name = string("alpha_35_to_fp16"), val = tensor<fp16, [1, 192, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(113273792)))];
+            tensor<fp16, [1, 192, 1, 3016]> var_2847_cast_fp16 = mul(x = x_169_cast_fp16, y = alpha_35_to_fp16)[name = string("op_2847_cast_fp16")];
+            tensor<fp16, [1, 192, 1, 3016]> sin_val_35_cast_fp16 = sin(x = var_2847_cast_fp16)[name = string("sin_val_35_cast_fp16")];
+            tensor<fp16, [1, 192, 1, 3016]> var_2854_cast_fp16 = mul(x = sin_val_35_cast_fp16, y = sin_val_35_cast_fp16)[name = string("op_2854_cast_fp16")];
+            tensor<fp16, [1, 192, 1, 1]> var_2851_to_fp16 = const()[name = string("op_2851_to_fp16"), val = tensor<fp16, [1, 192, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(113274240)))];
+            tensor<fp16, [1, 192, 1, 3016]> var_2855_cast_fp16 = mul(x = var_2851_to_fp16, y = var_2854_cast_fp16)[name = string("op_2855_cast_fp16")];
+            tensor<fp16, [1, 192, 1, 3016]> x_171_cast_fp16 = add(x = x_169_cast_fp16, y = var_2855_cast_fp16)[name = string("x_171_cast_fp16")];
+            tensor<int32, [8]> input_173_pad_0 = const()[name = string("input_173_pad_0"), val = tensor<int32, [8]>([0, 0, 0, 0, 0, 0, 18, 0])];
+            string input_173_mode_0 = const()[name = string("input_173_mode_0"), val = string("constant")];
+            fp16 const_107_to_fp16 = const()[name = string("const_107_to_fp16"), val = fp16(0x0p+0)];
+            tensor<fp16, [1, 192, 1, 3034]> input_173_cast_fp16 = pad(constant_val = const_107_to_fp16, mode = input_173_mode_0, pad = input_173_pad_0, x = x_171_cast_fp16)[name = string("input_173_cast_fp16")];
+            string x_173_pad_type_0 = const()[name = string("x_173_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> x_173_dilations_0 = const()[name = string("x_173_dilations_0"), val = tensor<int32, [2]>([1, 3])];
+            tensor<int32, [2]> x_173_strides_0 = const()[name = string("x_173_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> x_173_pad_0 = const()[name = string("x_173_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            int32 x_173_groups_0 = const()[name = string("x_173_groups_0"), val = int32(1)];
+            tensor<fp16, [192, 192, 1, 7]> decoder_decoder_3_block_3_conv1_conv_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [192, 192, 1, 7]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(113274688))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(113532800))))[name = string("decoder_decoder_3_block_3_conv1_conv_weight_to_fp16_palettized")];
+            tensor<fp16, [192]> decoder_decoder_3_block_3_conv1_conv_bias_to_fp16 = const()[name = string("decoder_decoder_3_block_3_conv1_conv_bias_to_fp16"), val = tensor<fp16, [192]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(113533376)))];
+            tensor<fp16, [1, 192, 1, 3016]> x_173_cast_fp16 = conv(bias = decoder_decoder_3_block_3_conv1_conv_bias_to_fp16, dilations = x_173_dilations_0, groups = x_173_groups_0, pad = x_173_pad_0, pad_type = x_173_pad_type_0, strides = x_173_strides_0, weight = decoder_decoder_3_block_3_conv1_conv_weight_to_fp16_palettized, x = input_173_cast_fp16)[name = string("x_173_cast_fp16")];
+            tensor<fp16, [1, 192, 1, 1]> alpha_37_to_fp16 = const()[name = string("alpha_37_to_fp16"), val = tensor<fp16, [1, 192, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(113533824)))];
+            tensor<fp16, [1, 192, 1, 3016]> var_2875_cast_fp16 = mul(x = x_173_cast_fp16, y = alpha_37_to_fp16)[name = string("op_2875_cast_fp16")];
+            tensor<fp16, [1, 192, 1, 3016]> sin_val_37_cast_fp16 = sin(x = var_2875_cast_fp16)[name = string("sin_val_37_cast_fp16")];
+            tensor<fp16, [1, 192, 1, 3016]> var_2882_cast_fp16 = mul(x = sin_val_37_cast_fp16, y = sin_val_37_cast_fp16)[name = string("op_2882_cast_fp16")];
+            tensor<fp16, [1, 192, 1, 1]> var_2879_to_fp16 = const()[name = string("op_2879_to_fp16"), val = tensor<fp16, [1, 192, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(113534272)))];
+            tensor<fp16, [1, 192, 1, 3016]> var_2883_cast_fp16 = mul(x = var_2879_to_fp16, y = var_2882_cast_fp16)[name = string("op_2883_cast_fp16")];
+            tensor<fp16, [1, 192, 1, 3016]> x_175_cast_fp16 = add(x = x_173_cast_fp16, y = var_2883_cast_fp16)[name = string("x_175_cast_fp16")];
+            string x_177_pad_type_0 = const()[name = string("x_177_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> x_177_strides_0 = const()[name = string("x_177_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> x_177_pad_0 = const()[name = string("x_177_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> x_177_dilations_0 = const()[name = string("x_177_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 x_177_groups_0 = const()[name = string("x_177_groups_0"), val = int32(1)];
+            tensor<fp16, [192, 192, 1, 1]> decoder_decoder_3_block_3_conv2_conv_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [192, 192, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(113534720))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(113571648))))[name = string("decoder_decoder_3_block_3_conv2_conv_weight_to_fp16_palettized")];
+            tensor<fp16, [192]> decoder_decoder_3_block_3_conv2_conv_bias_to_fp16 = const()[name = string("decoder_decoder_3_block_3_conv2_conv_bias_to_fp16"), val = tensor<fp16, [192]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(113572224)))];
+            tensor<fp16, [1, 192, 1, 3016]> x_177_cast_fp16 = conv(bias = decoder_decoder_3_block_3_conv2_conv_bias_to_fp16, dilations = x_177_dilations_0, groups = x_177_groups_0, pad = x_177_pad_0, pad_type = x_177_pad_type_0, strides = x_177_strides_0, weight = decoder_decoder_3_block_3_conv2_conv_weight_to_fp16_palettized, x = x_175_cast_fp16)[name = string("x_177_cast_fp16")];
+            tensor<fp16, [1, 192, 1, 3016]> x_179_cast_fp16 = add(x = x_177_cast_fp16, y = x_169_cast_fp16)[name = string("x_179_cast_fp16")];
+            tensor<fp16, [1, 192, 1, 1]> alpha_39_to_fp16 = const()[name = string("alpha_39_to_fp16"), val = tensor<fp16, [1, 192, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(113572672)))];
+            tensor<fp16, [1, 192, 1, 3016]> var_2908_cast_fp16 = mul(x = x_179_cast_fp16, y = alpha_39_to_fp16)[name = string("op_2908_cast_fp16")];
+            tensor<fp16, [1, 192, 1, 3016]> sin_val_39_cast_fp16 = sin(x = var_2908_cast_fp16)[name = string("sin_val_39_cast_fp16")];
+            tensor<fp16, [1, 192, 1, 3016]> var_2915_cast_fp16 = mul(x = sin_val_39_cast_fp16, y = sin_val_39_cast_fp16)[name = string("op_2915_cast_fp16")];
+            tensor<fp16, [1, 192, 1, 1]> var_2912_to_fp16 = const()[name = string("op_2912_to_fp16"), val = tensor<fp16, [1, 192, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(113573120)))];
+            tensor<fp16, [1, 192, 1, 3016]> var_2916_cast_fp16 = mul(x = var_2912_to_fp16, y = var_2915_cast_fp16)[name = string("op_2916_cast_fp16")];
+            tensor<fp16, [1, 192, 1, 3016]> x_181_cast_fp16 = add(x = x_179_cast_fp16, y = var_2916_cast_fp16)[name = string("x_181_cast_fp16")];
+            tensor<int32, [8]> input_177_pad_0 = const()[name = string("input_177_pad_0"), val = tensor<int32, [8]>([0, 0, 0, 0, 0, 0, 54, 0])];
+            string input_177_mode_0 = const()[name = string("input_177_mode_0"), val = string("constant")];
+            fp16 const_109_to_fp16 = const()[name = string("const_109_to_fp16"), val = fp16(0x0p+0)];
+            tensor<fp16, [1, 192, 1, 3070]> input_177_cast_fp16 = pad(constant_val = const_109_to_fp16, mode = input_177_mode_0, pad = input_177_pad_0, x = x_181_cast_fp16)[name = string("input_177_cast_fp16")];
+            string x_183_pad_type_0 = const()[name = string("x_183_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> x_183_dilations_0 = const()[name = string("x_183_dilations_0"), val = tensor<int32, [2]>([1, 9])];
+            tensor<int32, [2]> x_183_strides_0 = const()[name = string("x_183_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> x_183_pad_0 = const()[name = string("x_183_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            int32 x_183_groups_0 = const()[name = string("x_183_groups_0"), val = int32(1)];
+            tensor<fp16, [192, 192, 1, 7]> decoder_decoder_3_block_4_conv1_conv_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [192, 192, 1, 7]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(113573568))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(113831680))))[name = string("decoder_decoder_3_block_4_conv1_conv_weight_to_fp16_palettized")];
+            tensor<fp16, [192]> decoder_decoder_3_block_4_conv1_conv_bias_to_fp16 = const()[name = string("decoder_decoder_3_block_4_conv1_conv_bias_to_fp16"), val = tensor<fp16, [192]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(113832256)))];
+            tensor<fp16, [1, 192, 1, 3016]> x_183_cast_fp16 = conv(bias = decoder_decoder_3_block_4_conv1_conv_bias_to_fp16, dilations = x_183_dilations_0, groups = x_183_groups_0, pad = x_183_pad_0, pad_type = x_183_pad_type_0, strides = x_183_strides_0, weight = decoder_decoder_3_block_4_conv1_conv_weight_to_fp16_palettized, x = input_177_cast_fp16)[name = string("x_183_cast_fp16")];
+            tensor<fp16, [1, 192, 1, 1]> alpha_41_to_fp16 = const()[name = string("alpha_41_to_fp16"), val = tensor<fp16, [1, 192, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(113832704)))];
+            tensor<fp16, [1, 192, 1, 3016]> var_2936_cast_fp16 = mul(x = x_183_cast_fp16, y = alpha_41_to_fp16)[name = string("op_2936_cast_fp16")];
+            tensor<fp16, [1, 192, 1, 3016]> sin_val_41_cast_fp16 = sin(x = var_2936_cast_fp16)[name = string("sin_val_41_cast_fp16")];
+            tensor<fp16, [1, 192, 1, 3016]> var_2943_cast_fp16 = mul(x = sin_val_41_cast_fp16, y = sin_val_41_cast_fp16)[name = string("op_2943_cast_fp16")];
+            tensor<fp16, [1, 192, 1, 1]> var_2940_to_fp16 = const()[name = string("op_2940_to_fp16"), val = tensor<fp16, [1, 192, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(113833152)))];
+            tensor<fp16, [1, 192, 1, 3016]> var_2944_cast_fp16 = mul(x = var_2940_to_fp16, y = var_2943_cast_fp16)[name = string("op_2944_cast_fp16")];
+            tensor<fp16, [1, 192, 1, 3016]> x_185_cast_fp16 = add(x = x_183_cast_fp16, y = var_2944_cast_fp16)[name = string("x_185_cast_fp16")];
+            string x_187_pad_type_0 = const()[name = string("x_187_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> x_187_strides_0 = const()[name = string("x_187_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> x_187_pad_0 = const()[name = string("x_187_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> x_187_dilations_0 = const()[name = string("x_187_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 x_187_groups_0 = const()[name = string("x_187_groups_0"), val = int32(1)];
+            tensor<fp16, [192, 192, 1, 1]> decoder_decoder_3_block_4_conv2_conv_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [192, 192, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(113833600))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(113870528))))[name = string("decoder_decoder_3_block_4_conv2_conv_weight_to_fp16_palettized")];
+            tensor<fp16, [192]> decoder_decoder_3_block_4_conv2_conv_bias_to_fp16 = const()[name = string("decoder_decoder_3_block_4_conv2_conv_bias_to_fp16"), val = tensor<fp16, [192]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(113871104)))];
+            tensor<fp16, [1, 192, 1, 3016]> x_187_cast_fp16 = conv(bias = decoder_decoder_3_block_4_conv2_conv_bias_to_fp16, dilations = x_187_dilations_0, groups = x_187_groups_0, pad = x_187_pad_0, pad_type = x_187_pad_type_0, strides = x_187_strides_0, weight = decoder_decoder_3_block_4_conv2_conv_weight_to_fp16_palettized, x = x_185_cast_fp16)[name = string("x_187_cast_fp16")];
+            tensor<fp16, [1, 192, 1, 3016]> x_189_cast_fp16 = add(x = x_187_cast_fp16, y = x_179_cast_fp16)[name = string("x_189_cast_fp16")];
+            tensor<fp16, [1, 192, 1, 1]> alpha_43_to_fp16 = const()[name = string("alpha_43_to_fp16"), val = tensor<fp16, [1, 192, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(113871552)))];
+            tensor<fp16, [1, 192, 1, 3016]> var_2975_cast_fp16 = mul(x = x_189_cast_fp16, y = alpha_43_to_fp16)[name = string("op_2975_cast_fp16")];
+            tensor<fp16, [1, 192, 1, 3016]> sin_val_43_cast_fp16 = sin(x = var_2975_cast_fp16)[name = string("sin_val_43_cast_fp16")];
+            tensor<fp16, [1, 192, 1, 3016]> var_2982_cast_fp16 = mul(x = sin_val_43_cast_fp16, y = sin_val_43_cast_fp16)[name = string("op_2982_cast_fp16")];
+            tensor<fp16, [1, 192, 1, 1]> var_2979_to_fp16 = const()[name = string("op_2979_to_fp16"), val = tensor<fp16, [1, 192, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(113872000)))];
+            tensor<fp16, [1, 192, 1, 3016]> var_2983_cast_fp16 = mul(x = var_2979_to_fp16, y = var_2982_cast_fp16)[name = string("op_2983_cast_fp16")];
+            tensor<fp16, [1, 192, 1, 3016]> x_191_cast_fp16 = add(x = x_189_cast_fp16, y = var_2983_cast_fp16)[name = string("x_191_cast_fp16")];
+            string x_193_pad_type_0 = const()[name = string("x_193_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> x_193_strides_0 = const()[name = string("x_193_strides_0"), val = tensor<int32, [2]>([1, 3])];
+            tensor<int32, [4]> x_193_pad_0 = const()[name = string("x_193_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> x_193_dilations_0 = const()[name = string("x_193_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 x_193_groups_0 = const()[name = string("x_193_groups_0"), val = int32(1)];
+            tensor<int32, [4]> x_193_has_output_shape_output_shape_0 = const()[name = string("x_193_has_output_shape_output_shape_0"), val = tensor<int32, [4]>([1, 96, 1, 9051])];
+            tensor<fp16, [192, 96, 1, 6]> decoder_decoder_4_block_1_conv_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [192, 96, 1, 6]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(113872448))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(113983104))))[name = string("decoder_decoder_4_block_1_conv_weight_to_fp16_palettized")];
+            tensor<fp16, [96]> decoder_decoder_4_block_1_conv_bias_to_fp16 = const()[name = string("decoder_decoder_4_block_1_conv_bias_to_fp16"), val = tensor<fp16, [96]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(113983680)))];
+            tensor<fp16, [1, 96, 1, 9051]> x_193_has_output_shape_cast_fp16 = conv_transpose(bias = decoder_decoder_4_block_1_conv_bias_to_fp16, dilations = x_193_dilations_0, groups = x_193_groups_0, output_shape = x_193_has_output_shape_output_shape_0, pad = x_193_pad_0, pad_type = x_193_pad_type_0, strides = x_193_strides_0, weight = decoder_decoder_4_block_1_conv_weight_to_fp16_palettized, x = x_191_cast_fp16)[name = string("x_193_has_output_shape_cast_fp16")];
+            tensor<int32, [4]> x_195_begin_0 = const()[name = string("x_195_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 3])];
+            tensor<int32, [4]> x_195_end_0 = const()[name = string("x_195_end_0"), val = tensor<int32, [4]>([1, 96, 1, 9048])];
+            tensor<bool, [4]> x_195_end_mask_0 = const()[name = string("x_195_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 96, 1, 9045]> x_195_cast_fp16 = slice_by_index(begin = x_195_begin_0, end = x_195_end_0, end_mask = x_195_end_mask_0, x = x_193_has_output_shape_cast_fp16)[name = string("x_195_cast_fp16")];
+            tensor<fp16, [1, 96, 1, 1]> alpha_45_to_fp16 = const()[name = string("alpha_45_to_fp16"), val = tensor<fp16, [1, 96, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(113983936)))];
+            tensor<fp16, [1, 96, 1, 9045]> var_3015_cast_fp16 = mul(x = x_195_cast_fp16, y = alpha_45_to_fp16)[name = string("op_3015_cast_fp16")];
+            tensor<fp16, [1, 96, 1, 9045]> sin_val_45_cast_fp16 = sin(x = var_3015_cast_fp16)[name = string("sin_val_45_cast_fp16")];
+            tensor<fp16, [1, 96, 1, 9045]> var_3022_cast_fp16 = mul(x = sin_val_45_cast_fp16, y = sin_val_45_cast_fp16)[name = string("op_3022_cast_fp16")];
+            tensor<fp16, [1, 96, 1, 1]> var_3019_to_fp16 = const()[name = string("op_3019_to_fp16"), val = tensor<fp16, [1, 96, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(113984192)))];
+            tensor<fp16, [1, 96, 1, 9045]> var_3023_cast_fp16 = mul(x = var_3019_to_fp16, y = var_3022_cast_fp16)[name = string("op_3023_cast_fp16")];
+            tensor<fp16, [1, 96, 1, 9045]> x_197_cast_fp16 = add(x = x_195_cast_fp16, y = var_3023_cast_fp16)[name = string("x_197_cast_fp16")];
+            tensor<int32, [8]> input_181_pad_0 = const()[name = string("input_181_pad_0"), val = tensor<int32, [8]>([0, 0, 0, 0, 0, 0, 6, 0])];
+            string input_181_mode_0 = const()[name = string("input_181_mode_0"), val = string("constant")];
+            fp16 const_112_to_fp16 = const()[name = string("const_112_to_fp16"), val = fp16(0x0p+0)];
+            tensor<fp16, [1, 96, 1, 9051]> input_181_cast_fp16 = pad(constant_val = const_112_to_fp16, mode = input_181_mode_0, pad = input_181_pad_0, x = x_197_cast_fp16)[name = string("input_181_cast_fp16")];
+            string x_199_pad_type_0 = const()[name = string("x_199_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> x_199_strides_0 = const()[name = string("x_199_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> x_199_pad_0 = const()[name = string("x_199_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> x_199_dilations_0 = const()[name = string("x_199_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 x_199_groups_0 = const()[name = string("x_199_groups_0"), val = int32(1)];
+            tensor<fp16, [96, 96, 1, 7]> decoder_decoder_4_block_2_conv1_conv_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [96, 96, 1, 7]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(113984448))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(114049024))))[name = string("decoder_decoder_4_block_2_conv1_conv_weight_to_fp16_palettized")];
+            tensor<fp16, [96]> decoder_decoder_4_block_2_conv1_conv_bias_to_fp16 = const()[name = string("decoder_decoder_4_block_2_conv1_conv_bias_to_fp16"), val = tensor<fp16, [96]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(114049600)))];
+            tensor<fp16, [1, 96, 1, 9045]> x_199_cast_fp16 = conv(bias = decoder_decoder_4_block_2_conv1_conv_bias_to_fp16, dilations = x_199_dilations_0, groups = x_199_groups_0, pad = x_199_pad_0, pad_type = x_199_pad_type_0, strides = x_199_strides_0, weight = decoder_decoder_4_block_2_conv1_conv_weight_to_fp16_palettized, x = input_181_cast_fp16)[name = string("x_199_cast_fp16")];
+            tensor<fp16, [1, 96, 1, 1]> alpha_47_to_fp16 = const()[name = string("alpha_47_to_fp16"), val = tensor<fp16, [1, 96, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(114049856)))];
+            tensor<fp16, [1, 96, 1, 9045]> var_3043_cast_fp16 = mul(x = x_199_cast_fp16, y = alpha_47_to_fp16)[name = string("op_3043_cast_fp16")];
+            tensor<fp16, [1, 96, 1, 9045]> sin_val_47_cast_fp16 = sin(x = var_3043_cast_fp16)[name = string("sin_val_47_cast_fp16")];
+            tensor<fp16, [1, 96, 1, 9045]> var_3050_cast_fp16 = mul(x = sin_val_47_cast_fp16, y = sin_val_47_cast_fp16)[name = string("op_3050_cast_fp16")];
+            tensor<fp16, [1, 96, 1, 1]> var_3047_to_fp16 = const()[name = string("op_3047_to_fp16"), val = tensor<fp16, [1, 96, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(114050112)))];
+            tensor<fp16, [1, 96, 1, 9045]> var_3051_cast_fp16 = mul(x = var_3047_to_fp16, y = var_3050_cast_fp16)[name = string("op_3051_cast_fp16")];
+            tensor<fp16, [1, 96, 1, 9045]> x_201_cast_fp16 = add(x = x_199_cast_fp16, y = var_3051_cast_fp16)[name = string("x_201_cast_fp16")];
+            string x_203_pad_type_0 = const()[name = string("x_203_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> x_203_strides_0 = const()[name = string("x_203_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> x_203_pad_0 = const()[name = string("x_203_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> x_203_dilations_0 = const()[name = string("x_203_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 x_203_groups_0 = const()[name = string("x_203_groups_0"), val = int32(1)];
+            tensor<fp16, [96, 96, 1, 1]> decoder_decoder_4_block_2_conv2_conv_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [96, 96, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(114050368))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(114059648))))[name = string("decoder_decoder_4_block_2_conv2_conv_weight_to_fp16_palettized")];
+            tensor<fp16, [96]> decoder_decoder_4_block_2_conv2_conv_bias_to_fp16 = const()[name = string("decoder_decoder_4_block_2_conv2_conv_bias_to_fp16"), val = tensor<fp16, [96]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(114060224)))];
+            tensor<fp16, [1, 96, 1, 9045]> x_203_cast_fp16 = conv(bias = decoder_decoder_4_block_2_conv2_conv_bias_to_fp16, dilations = x_203_dilations_0, groups = x_203_groups_0, pad = x_203_pad_0, pad_type = x_203_pad_type_0, strides = x_203_strides_0, weight = decoder_decoder_4_block_2_conv2_conv_weight_to_fp16_palettized, x = x_201_cast_fp16)[name = string("x_203_cast_fp16")];
+            tensor<fp16, [1, 96, 1, 9045]> x_205_cast_fp16 = add(x = x_203_cast_fp16, y = x_195_cast_fp16)[name = string("x_205_cast_fp16")];
+            tensor<fp16, [1, 96, 1, 1]> alpha_49_to_fp16 = const()[name = string("alpha_49_to_fp16"), val = tensor<fp16, [1, 96, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(114060480)))];
+            tensor<fp16, [1, 96, 1, 9045]> var_3076_cast_fp16 = mul(x = x_205_cast_fp16, y = alpha_49_to_fp16)[name = string("op_3076_cast_fp16")];
+            tensor<fp16, [1, 96, 1, 9045]> sin_val_49_cast_fp16 = sin(x = var_3076_cast_fp16)[name = string("sin_val_49_cast_fp16")];
+            tensor<fp16, [1, 96, 1, 9045]> var_3083_cast_fp16 = mul(x = sin_val_49_cast_fp16, y = sin_val_49_cast_fp16)[name = string("op_3083_cast_fp16")];
+            tensor<fp16, [1, 96, 1, 1]> var_3080_to_fp16 = const()[name = string("op_3080_to_fp16"), val = tensor<fp16, [1, 96, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(114060736)))];
+            tensor<fp16, [1, 96, 1, 9045]> var_3084_cast_fp16 = mul(x = var_3080_to_fp16, y = var_3083_cast_fp16)[name = string("op_3084_cast_fp16")];
+            tensor<fp16, [1, 96, 1, 9045]> x_207_cast_fp16 = add(x = x_205_cast_fp16, y = var_3084_cast_fp16)[name = string("x_207_cast_fp16")];
+            tensor<int32, [8]> input_185_pad_0 = const()[name = string("input_185_pad_0"), val = tensor<int32, [8]>([0, 0, 0, 0, 0, 0, 18, 0])];
+            string input_185_mode_0 = const()[name = string("input_185_mode_0"), val = string("constant")];
+            fp16 const_114_to_fp16 = const()[name = string("const_114_to_fp16"), val = fp16(0x0p+0)];
+            tensor<fp16, [1, 96, 1, 9063]> input_185_cast_fp16 = pad(constant_val = const_114_to_fp16, mode = input_185_mode_0, pad = input_185_pad_0, x = x_207_cast_fp16)[name = string("input_185_cast_fp16")];
+            string x_209_pad_type_0 = const()[name = string("x_209_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> x_209_dilations_0 = const()[name = string("x_209_dilations_0"), val = tensor<int32, [2]>([1, 3])];
+            tensor<int32, [2]> x_209_strides_0 = const()[name = string("x_209_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> x_209_pad_0 = const()[name = string("x_209_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            int32 x_209_groups_0 = const()[name = string("x_209_groups_0"), val = int32(1)];
+            tensor<fp16, [96, 96, 1, 7]> decoder_decoder_4_block_3_conv1_conv_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [96, 96, 1, 7]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(114060992))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(114125568))))[name = string("decoder_decoder_4_block_3_conv1_conv_weight_to_fp16_palettized")];
+            tensor<fp16, [96]> decoder_decoder_4_block_3_conv1_conv_bias_to_fp16 = const()[name = string("decoder_decoder_4_block_3_conv1_conv_bias_to_fp16"), val = tensor<fp16, [96]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(114126144)))];
+            tensor<fp16, [1, 96, 1, 9045]> x_209_cast_fp16 = conv(bias = decoder_decoder_4_block_3_conv1_conv_bias_to_fp16, dilations = x_209_dilations_0, groups = x_209_groups_0, pad = x_209_pad_0, pad_type = x_209_pad_type_0, strides = x_209_strides_0, weight = decoder_decoder_4_block_3_conv1_conv_weight_to_fp16_palettized, x = input_185_cast_fp16)[name = string("x_209_cast_fp16")];
+            tensor<fp16, [1, 96, 1, 1]> alpha_51_to_fp16 = const()[name = string("alpha_51_to_fp16"), val = tensor<fp16, [1, 96, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(114126400)))];
+            tensor<fp16, [1, 96, 1, 9045]> var_3104_cast_fp16 = mul(x = x_209_cast_fp16, y = alpha_51_to_fp16)[name = string("op_3104_cast_fp16")];
+            tensor<fp16, [1, 96, 1, 9045]> sin_val_51_cast_fp16 = sin(x = var_3104_cast_fp16)[name = string("sin_val_51_cast_fp16")];
+            tensor<fp16, [1, 96, 1, 9045]> var_3111_cast_fp16 = mul(x = sin_val_51_cast_fp16, y = sin_val_51_cast_fp16)[name = string("op_3111_cast_fp16")];
+            tensor<fp16, [1, 96, 1, 1]> var_3108_to_fp16 = const()[name = string("op_3108_to_fp16"), val = tensor<fp16, [1, 96, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(114126656)))];
+            tensor<fp16, [1, 96, 1, 9045]> var_3112_cast_fp16 = mul(x = var_3108_to_fp16, y = var_3111_cast_fp16)[name = string("op_3112_cast_fp16")];
+            tensor<fp16, [1, 96, 1, 9045]> x_211_cast_fp16 = add(x = x_209_cast_fp16, y = var_3112_cast_fp16)[name = string("x_211_cast_fp16")];
+            string x_213_pad_type_0 = const()[name = string("x_213_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> x_213_strides_0 = const()[name = string("x_213_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> x_213_pad_0 = const()[name = string("x_213_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> x_213_dilations_0 = const()[name = string("x_213_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 x_213_groups_0 = const()[name = string("x_213_groups_0"), val = int32(1)];
+            tensor<fp16, [96, 96, 1, 1]> decoder_decoder_4_block_3_conv2_conv_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [96, 96, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(114126912))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(114136192))))[name = string("decoder_decoder_4_block_3_conv2_conv_weight_to_fp16_palettized")];
+            tensor<fp16, [96]> decoder_decoder_4_block_3_conv2_conv_bias_to_fp16 = const()[name = string("decoder_decoder_4_block_3_conv2_conv_bias_to_fp16"), val = tensor<fp16, [96]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(114136768)))];
+            tensor<fp16, [1, 96, 1, 9045]> x_213_cast_fp16 = conv(bias = decoder_decoder_4_block_3_conv2_conv_bias_to_fp16, dilations = x_213_dilations_0, groups = x_213_groups_0, pad = x_213_pad_0, pad_type = x_213_pad_type_0, strides = x_213_strides_0, weight = decoder_decoder_4_block_3_conv2_conv_weight_to_fp16_palettized, x = x_211_cast_fp16)[name = string("x_213_cast_fp16")];
+            tensor<fp16, [1, 96, 1, 9045]> x_215_cast_fp16 = add(x = x_213_cast_fp16, y = x_205_cast_fp16)[name = string("x_215_cast_fp16")];
+            tensor<fp16, [1, 96, 1, 1]> alpha_53_to_fp16 = const()[name = string("alpha_53_to_fp16"), val = tensor<fp16, [1, 96, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(114137024)))];
+            tensor<fp16, [1, 96, 1, 9045]> var_3137_cast_fp16 = mul(x = x_215_cast_fp16, y = alpha_53_to_fp16)[name = string("op_3137_cast_fp16")];
+            tensor<fp16, [1, 96, 1, 9045]> sin_val_53_cast_fp16 = sin(x = var_3137_cast_fp16)[name = string("sin_val_53_cast_fp16")];
+            tensor<fp16, [1, 96, 1, 9045]> var_3144_cast_fp16 = mul(x = sin_val_53_cast_fp16, y = sin_val_53_cast_fp16)[name = string("op_3144_cast_fp16")];
+            tensor<fp16, [1, 96, 1, 1]> var_3141_to_fp16 = const()[name = string("op_3141_to_fp16"), val = tensor<fp16, [1, 96, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(114137280)))];
+            tensor<fp16, [1, 96, 1, 9045]> var_3145_cast_fp16 = mul(x = var_3141_to_fp16, y = var_3144_cast_fp16)[name = string("op_3145_cast_fp16")];
+            tensor<fp16, [1, 96, 1, 9045]> x_217_cast_fp16 = add(x = x_215_cast_fp16, y = var_3145_cast_fp16)[name = string("x_217_cast_fp16")];
+            tensor<int32, [8]> input_189_pad_0 = const()[name = string("input_189_pad_0"), val = tensor<int32, [8]>([0, 0, 0, 0, 0, 0, 54, 0])];
+            string input_189_mode_0 = const()[name = string("input_189_mode_0"), val = string("constant")];
+            fp16 const_116_to_fp16 = const()[name = string("const_116_to_fp16"), val = fp16(0x0p+0)];
+            tensor<fp16, [1, 96, 1, 9099]> input_189_cast_fp16 = pad(constant_val = const_116_to_fp16, mode = input_189_mode_0, pad = input_189_pad_0, x = x_217_cast_fp16)[name = string("input_189_cast_fp16")];
+            string x_219_pad_type_0 = const()[name = string("x_219_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> x_219_dilations_0 = const()[name = string("x_219_dilations_0"), val = tensor<int32, [2]>([1, 9])];
+            tensor<int32, [2]> x_219_strides_0 = const()[name = string("x_219_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> x_219_pad_0 = const()[name = string("x_219_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            int32 x_219_groups_0 = const()[name = string("x_219_groups_0"), val = int32(1)];
+            tensor<fp16, [96, 96, 1, 7]> decoder_decoder_4_block_4_conv1_conv_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [96, 96, 1, 7]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(114137536))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(114202112))))[name = string("decoder_decoder_4_block_4_conv1_conv_weight_to_fp16_palettized")];
+            tensor<fp16, [96]> decoder_decoder_4_block_4_conv1_conv_bias_to_fp16 = const()[name = string("decoder_decoder_4_block_4_conv1_conv_bias_to_fp16"), val = tensor<fp16, [96]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(114202688)))];
+            tensor<fp16, [1, 96, 1, 9045]> x_219_cast_fp16 = conv(bias = decoder_decoder_4_block_4_conv1_conv_bias_to_fp16, dilations = x_219_dilations_0, groups = x_219_groups_0, pad = x_219_pad_0, pad_type = x_219_pad_type_0, strides = x_219_strides_0, weight = decoder_decoder_4_block_4_conv1_conv_weight_to_fp16_palettized, x = input_189_cast_fp16)[name = string("x_219_cast_fp16")];
+            tensor<fp16, [1, 96, 1, 1]> alpha_55_to_fp16 = const()[name = string("alpha_55_to_fp16"), val = tensor<fp16, [1, 96, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(114202944)))];
+            tensor<fp16, [1, 96, 1, 9045]> var_3165_cast_fp16 = mul(x = x_219_cast_fp16, y = alpha_55_to_fp16)[name = string("op_3165_cast_fp16")];
+            tensor<fp16, [1, 96, 1, 9045]> sin_val_55_cast_fp16 = sin(x = var_3165_cast_fp16)[name = string("sin_val_55_cast_fp16")];
+            tensor<fp16, [1, 96, 1, 9045]> var_3172_cast_fp16 = mul(x = sin_val_55_cast_fp16, y = sin_val_55_cast_fp16)[name = string("op_3172_cast_fp16")];
+            tensor<fp16, [1, 96, 1, 1]> var_3169_to_fp16 = const()[name = string("op_3169_to_fp16"), val = tensor<fp16, [1, 96, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(114203200)))];
+            tensor<fp16, [1, 96, 1, 9045]> var_3173_cast_fp16 = mul(x = var_3169_to_fp16, y = var_3172_cast_fp16)[name = string("op_3173_cast_fp16")];
+            tensor<fp16, [1, 96, 1, 9045]> x_221_cast_fp16 = add(x = x_219_cast_fp16, y = var_3173_cast_fp16)[name = string("x_221_cast_fp16")];
+            string x_223_pad_type_0 = const()[name = string("x_223_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> x_223_strides_0 = const()[name = string("x_223_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> x_223_pad_0 = const()[name = string("x_223_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> x_223_dilations_0 = const()[name = string("x_223_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 x_223_groups_0 = const()[name = string("x_223_groups_0"), val = int32(1)];
+            tensor<fp16, [96, 96, 1, 1]> decoder_decoder_4_block_4_conv2_conv_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [96, 96, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(114203456))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(114212736))))[name = string("decoder_decoder_4_block_4_conv2_conv_weight_to_fp16_palettized")];
+            tensor<fp16, [96]> decoder_decoder_4_block_4_conv2_conv_bias_to_fp16 = const()[name = string("decoder_decoder_4_block_4_conv2_conv_bias_to_fp16"), val = tensor<fp16, [96]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(114213312)))];
+            tensor<fp16, [1, 96, 1, 9045]> x_223_cast_fp16 = conv(bias = decoder_decoder_4_block_4_conv2_conv_bias_to_fp16, dilations = x_223_dilations_0, groups = x_223_groups_0, pad = x_223_pad_0, pad_type = x_223_pad_type_0, strides = x_223_strides_0, weight = decoder_decoder_4_block_4_conv2_conv_weight_to_fp16_palettized, x = x_221_cast_fp16)[name = string("x_223_cast_fp16")];
+            tensor<fp16, [1, 96, 1, 9045]> x_225_cast_fp16 = add(x = x_223_cast_fp16, y = x_215_cast_fp16)[name = string("x_225_cast_fp16")];
+            tensor<fp16, [1, 96, 1, 1]> alpha_57_to_fp16 = const()[name = string("alpha_57_to_fp16"), val = tensor<fp16, [1, 96, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(114213568)))];
+            tensor<fp16, [1, 96, 1, 9045]> var_3194_cast_fp16 = mul(x = x_225_cast_fp16, y = alpha_57_to_fp16)[name = string("op_3194_cast_fp16")];
+            tensor<fp16, [1, 96, 1, 9045]> sin_val_cast_fp16 = sin(x = var_3194_cast_fp16)[name = string("sin_val_cast_fp16")];
+            tensor<fp16, [1, 96, 1, 9045]> var_3201_cast_fp16 = mul(x = sin_val_cast_fp16, y = sin_val_cast_fp16)[name = string("op_3201_cast_fp16")];
+            tensor<fp16, [1, 96, 1, 1]> var_3198_to_fp16 = const()[name = string("op_3198_to_fp16"), val = tensor<fp16, [1, 96, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(114213824)))];
+            tensor<fp16, [1, 96, 1, 9045]> var_3202_cast_fp16 = mul(x = var_3198_to_fp16, y = var_3201_cast_fp16)[name = string("op_3202_cast_fp16")];
+            tensor<fp16, [1, 96, 1, 9045]> x_cast_fp16 = add(x = x_225_cast_fp16, y = var_3202_cast_fp16)[name = string("x_cast_fp16")];
+            tensor<int32, [8]> input_pad_0 = const()[name = string("input_pad_0"), val = tensor<int32, [8]>([0, 0, 0, 0, 0, 0, 6, 0])];
+            string input_mode_0 = const()[name = string("input_mode_0"), val = string("constant")];
+            fp16 const_118_to_fp16 = const()[name = string("const_118_to_fp16"), val = fp16(0x0p+0)];
+            tensor<fp16, [1, 96, 1, 9051]> input_cast_fp16 = pad(constant_val = const_118_to_fp16, mode = input_mode_0, pad = input_pad_0, x = x_cast_fp16)[name = string("input_cast_fp16")];
+            string h_1_pad_type_0 = const()[name = string("h_1_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> h_1_strides_0 = const()[name = string("h_1_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> h_1_pad_0 = const()[name = string("h_1_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> h_1_dilations_0 = const()[name = string("h_1_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 h_1_groups_0 = const()[name = string("h_1_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 96, 1, 7]> decoder_decoder_6_conv_weight_to_fp16 = const()[name = string("decoder_decoder_6_conv_weight_to_fp16"), val = tensor<fp16, [1, 96, 1, 7]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(114214080)))];
+            tensor<fp16, [1]> decoder_decoder_6_conv_bias_to_fp16 = const()[name = string("decoder_decoder_6_conv_bias_to_fp16"), val = tensor<fp16, [1]>([-0x1.1p-19])];
+            tensor<fp16, [1, 1, 1, 9045]> h_1_cast_fp16 = conv(bias = decoder_decoder_6_conv_bias_to_fp16, dilations = h_1_dilations_0, groups = h_1_groups_0, pad = h_1_pad_0, pad_type = h_1_pad_type_0, strides = h_1_strides_0, weight = decoder_decoder_6_conv_weight_to_fp16, x = input_cast_fp16)[name = string("h_1_cast_fp16")];
+            fp16 var_28_promoted_to_fp16 = const()[name = string("op_28_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            fp16 var_32_promoted_16_to_fp16 = const()[name = string("op_32_promoted_16_to_fp16"), val = fp16(0x1p+0)];
+            tensor<fp16, [1, 1, 1, 9045]> clip_16_cast_fp16 = clip(alpha = var_28_promoted_to_fp16, beta = var_32_promoted_16_to_fp16, x = h_1_cast_fp16)[name = string("clip_16_cast_fp16")];
+            tensor<int32, [4]> var_3215_begin_0 = const()[name = string("op_3215_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 7125])];
+            tensor<int32, [4]> var_3215_end_0 = const()[name = string("op_3215_end_0"), val = tensor<int32, [4]>([1, 1, 1, 9045])];
+            tensor<bool, [4]> var_3215_end_mask_0 = const()[name = string("op_3215_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 1, 1, 1920]> audio = slice_by_index(begin = var_3215_begin_0, end = var_3215_end_0, end_mask = var_3215_end_mask_0, x = clip_16_cast_fp16)[name = string("op_3215_cast_fp16")];
+        } -> (audio, key_cache_updates, value_cache_updates, hidden_context_update);
+}
\ No newline at end of file
diff --git a/qwen3_tts/speech_decoder/12hz-0.6b-customvoice/W8A16/SpeechDecoder.mlmodelc/weights/weight.bin b/qwen3_tts/speech_decoder/12hz-0.6b-customvoice/W8A16/SpeechDecoder.mlmodelc/weights/weight.bin
new file mode 100644
index 0000000000000000000000000000000000000000..5e78ed5b441248fa0238b35c0e6d766940201929
--- /dev/null
+++ b/qwen3_tts/speech_decoder/12hz-0.6b-customvoice/W8A16/SpeechDecoder.mlmodelc/weights/weight.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5a9a352536383e01d53d1d0ab227145e6e02880b2281715bc0ff12a44f7be89f
+size 114215488
diff --git a/qwen3_tts/speech_decoder/12hz-1.7b-customvoice/W8A16/SpeechDecoder.mlmodelc/analytics/coremldata.bin b/qwen3_tts/speech_decoder/12hz-1.7b-customvoice/W8A16/SpeechDecoder.mlmodelc/analytics/coremldata.bin
new file mode 100644
index 0000000000000000000000000000000000000000..7115f2918c612d31a0a1172072f602e4281bbc02
--- /dev/null
+++ b/qwen3_tts/speech_decoder/12hz-1.7b-customvoice/W8A16/SpeechDecoder.mlmodelc/analytics/coremldata.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8a7c001fcef8399080af242e87d12ad44d7fc37fded8007e7fce65461346a52c
+size 243
diff --git a/qwen3_tts/speech_decoder/12hz-1.7b-customvoice/W8A16/SpeechDecoder.mlmodelc/coremldata.bin b/qwen3_tts/speech_decoder/12hz-1.7b-customvoice/W8A16/SpeechDecoder.mlmodelc/coremldata.bin
new file mode 100644
index 0000000000000000000000000000000000000000..1b83bc843cdefb057776d892cab33100b95b8099
--- /dev/null
+++ b/qwen3_tts/speech_decoder/12hz-1.7b-customvoice/W8A16/SpeechDecoder.mlmodelc/coremldata.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:554158c2df351252d84d8295d438b004a9bae5e65ae9dcd802f13de13bacd41a
+size 681
diff --git a/qwen3_tts/speech_decoder/12hz-1.7b-customvoice/W8A16/SpeechDecoder.mlmodelc/metadata.json b/qwen3_tts/speech_decoder/12hz-1.7b-customvoice/W8A16/SpeechDecoder.mlmodelc/metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..e5cfa9e1cf120df1a143135303d9a47cebd61cce
--- /dev/null
+++ b/qwen3_tts/speech_decoder/12hz-1.7b-customvoice/W8A16/SpeechDecoder.mlmodelc/metadata.json
@@ -0,0 +1,177 @@
+[
+  {
+    "metadataOutputVersion" : "3.0",
+    "storagePrecision" : "Mixed (Float16, Int32)",
+    "outputSchema" : [
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 1 × 1 × 1920)",
+        "shortDescription" : "",
+        "shape" : "[1, 1, 1, 1920]",
+        "name" : "audio",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 8192 × 1 × 1)",
+        "shortDescription" : "",
+        "shape" : "[1, 8192, 1, 1]",
+        "name" : "key_cache_updates",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 8192 × 1 × 1)",
+        "shortDescription" : "",
+        "shape" : "[1, 8192, 1, 1]",
+        "name" : "value_cache_updates",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 1024 × 1 × 1)",
+        "shortDescription" : "",
+        "shape" : "[1, 1024, 1, 1]",
+        "name" : "hidden_context_update",
+        "type" : "MultiArray"
+      }
+    ],
+    "modelParameters" : [
+
+    ],
+    "specificationVersion" : 9,
+    "mlProgramOperationTypeHistogram" : {
+      "Ios18.expandDims" : 26,
+      "Ios18.mul" : 234,
+      "Ios18.softmax" : 8,
+      "Ios18.matmul" : 16,
+      "Ios18.rsqrt" : 17,
+      "Ios16.reduceMean" : 17,
+      "Split" : 3,
+      "Ios18.greaterEqual" : 1,
+      "Select" : 1,
+      "Ios18.sin" : 29,
+      "Tile" : 2,
+      "Ios18.gather" : 18,
+      "Ios18.add" : 132,
+      "Ios18.layerNorm" : 2,
+      "Ios18.reshape" : 56,
+      "Pad" : 17,
+      "Ios18.conv" : 93,
+      "Ios18.concat" : 19,
+      "Ios18.transpose" : 30,
+      "Ios18.sub" : 1,
+      "Ios18.cast" : 19,
+      "Ios18.silu" : 8,
+      "Ios18.gelu" : 2,
+      "Ios18.clip" : 1,
+      "Ios18.convTranspose" : 6,
+      "Ios18.sliceByIndex" : 39,
+      "Ios18.squeeze" : 18
+    },
+    "computePrecision" : "Mixed (Float16, Float32, Int16, Int32, UInt16)",
+    "isUpdatable" : "0",
+    "stateSchema" : [
+
+    ],
+    "availability" : {
+      "macOS" : "15.0",
+      "tvOS" : "18.0",
+      "visionOS" : "2.0",
+      "watchOS" : "11.0",
+      "iOS" : "18.0",
+      "macCatalyst" : "18.0"
+    },
+    "modelType" : {
+      "name" : "MLModelType_mlProgram"
+    },
+    "userDefinedMetadata" : {
+      "com.github.apple.coremltools.conversion_date" : "2026-02-12",
+      "com.github.apple.coremltools.source" : "torch==2.8.0",
+      "com.github.apple.coremltools.version" : "9.0",
+      "com.github.apple.coremltools.source_dialect" : "TorchScript"
+    },
+    "inputSchema" : [
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Int32",
+        "formattedType" : "MultiArray (Int32 1 × 16 × 1)",
+        "shortDescription" : "",
+        "shape" : "[1, 16, 1]",
+        "name" : "audio_codes",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Int32",
+        "formattedType" : "MultiArray (Int32 1)",
+        "shortDescription" : "",
+        "shape" : "[1]",
+        "name" : "cache_length",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 8192 × 1 × 256)",
+        "shortDescription" : "",
+        "shape" : "[1, 8192, 1, 256]",
+        "name" : "key_cache",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 8192 × 1 × 256)",
+        "shortDescription" : "",
+        "shape" : "[1, 8192, 1, 256]",
+        "name" : "value_cache",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 256)",
+        "shortDescription" : "",
+        "shape" : "[1, 256]",
+        "name" : "kv_cache_update_mask",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 256)",
+        "shortDescription" : "",
+        "shape" : "[1, 256]",
+        "name" : "key_padding_mask",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 1024 × 1 × 4)",
+        "shortDescription" : "",
+        "shape" : "[1, 1024, 1, 4]",
+        "name" : "hidden_context",
+        "type" : "MultiArray"
+      }
+    ],
+    "generatedClassName" : "SpeechDecoder",
+    "method" : "predict"
+  }
+]
\ No newline at end of file
diff --git a/qwen3_tts/speech_decoder/12hz-1.7b-customvoice/W8A16/SpeechDecoder.mlmodelc/model.mil b/qwen3_tts/speech_decoder/12hz-1.7b-customvoice/W8A16/SpeechDecoder.mlmodelc/model.mil
new file mode 100644
index 0000000000000000000000000000000000000000..5781a754daf41969cf19852a37aba79fb499b1c5
--- /dev/null
+++ b/qwen3_tts/speech_decoder/12hz-1.7b-customvoice/W8A16/SpeechDecoder.mlmodelc/model.mil
@@ -0,0 +1,2109 @@
+program(1.3)
+[buildInfo = dict<string, string>({{"coremlc-component-MIL", "3510.2.1"}, {"coremlc-version", "3500.32.1"}, {"coremltools-component-torch", "2.8.0"}, {"coremltools-source-dialect", "TorchScript"}, {"coremltools-version", "9.0"}})]
+{
+    func main<ios18>(tensor<int32, [1, 16, 1]> audio_codes, tensor<int32, [1]> cache_length, tensor<fp16, [1, 1024, 1, 4]> hidden_context, tensor<fp16, [1, 8192, 1, 256]> key_cache, tensor<fp16, [1, 256]> key_padding_mask, tensor<fp16, [1, 256]> kv_cache_update_mask, tensor<fp16, [1, 8192, 1, 256]> value_cache) {
+            int32 var_28 = const()[name = string("op_28"), val = int32(-1)];
+            int32 var_32 = const()[name = string("op_32"), val = int32(1)];
+            tensor<int32, [3]> codes_1_begin_0 = const()[name = string("codes_1_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
+            tensor<int32, [3]> codes_1_end_0 = const()[name = string("codes_1_end_0"), val = tensor<int32, [3]>([1, 1, 1])];
+            tensor<bool, [3]> codes_1_end_mask_0 = const()[name = string("codes_1_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
+            tensor<int32, [1, 1, 1]> codes_1 = slice_by_index(begin = codes_1_begin_0, end = codes_1_end_0, end_mask = codes_1_end_mask_0, x = audio_codes)[name = string("codes_1")];
+            tensor<int32, [3]> var_295 = const()[name = string("op_295"), val = tensor<int32, [3]>([1, 0, 2])];
+            tensor<int32, [1]> squeeze_0_axes_0 = const()[name = string("squeeze_0_axes_0"), val = tensor<int32, [1]>([0])];
+            string var_296_to_int16_dtype_0 = const()[name = string("op_296_to_int16_dtype_0"), val = string("int16")];
+            tensor<int32, [1, 1, 1]> var_296 = transpose(perm = var_295, x = codes_1)[name = string("transpose_29")];
+            tensor<int16, [1, 1, 1]> var_296_to_int16 = cast(dtype = var_296_to_int16_dtype_0, x = var_296)[name = string("cast_235")];
+            tensor<int16, [1, 1]> squeeze_0_cast_uint16 = squeeze(axes = squeeze_0_axes_0, x = var_296_to_int16)[name = string("squeeze_0_cast_uint16")];
+            int32 quantized_1_batch_dims_0 = const()[name = string("quantized_1_batch_dims_0"), val = int32(0)];
+            bool quantized_1_validate_indices_0 = const()[name = string("quantized_1_validate_indices_0"), val = bool(false)];
+            tensor<fp16, [2048, 256]> weight_1_to_fp16 = const()[name = string("weight_1_to_fp16"), val = tensor<fp16, [2048, 256]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(64)))];
+            string cast_216_dtype_0 = const()[name = string("cast_216_dtype_0"), val = string("int32")];
+            int32 greater_equal_0_y_0 = const()[name = string("greater_equal_0_y_0"), val = int32(0)];
+            tensor<int32, [1, 1]> cast_216 = cast(dtype = cast_216_dtype_0, x = squeeze_0_cast_uint16)[name = string("cast_234")];
+            tensor<bool, [1, 1]> greater_equal_0 = greater_equal(x = cast_216, y = greater_equal_0_y_0)[name = string("greater_equal_0")];
+            int32 slice_by_index_0 = const()[name = string("slice_by_index_0"), val = int32(2048)];
+            tensor<int32, [1, 1]> add_0 = add(x = cast_216, y = slice_by_index_0)[name = string("add_0")];
+            tensor<int32, [1, 1]> select_0 = select(a = cast_216, b = add_0, cond = greater_equal_0)[name = string("select_0")];
+            int32 quantized_1_cast_fp16_cast_uint16_axis_0 = const()[name = string("quantized_1_cast_fp16_cast_uint16_axis_0"), val = int32(0)];
+            string select_0_to_uint16_dtype_0 = const()[name = string("select_0_to_uint16_dtype_0"), val = string("uint16")];
+            tensor<uint16, [1, 1]> select_0_to_uint16 = cast(dtype = select_0_to_uint16_dtype_0, x = select_0)[name = string("cast_233")];
+            tensor<fp16, [1, 1, 256]> quantized_1_cast_fp16_cast_uint16_cast_uint16 = gather(axis = quantized_1_cast_fp16_cast_uint16_axis_0, batch_dims = quantized_1_batch_dims_0, indices = select_0_to_uint16, validate_indices = quantized_1_validate_indices_0, x = weight_1_to_fp16)[name = string("quantized_1_cast_fp16_cast_uint16_cast_uint16")];
+            tensor<int32, [3]> var_304 = const()[name = string("op_304"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_3_axes_0 = const()[name = string("input_3_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 256, 1]> var_305_cast_fp16 = transpose(perm = var_304, x = quantized_1_cast_fp16_cast_uint16_cast_uint16)[name = string("transpose_28")];
+            tensor<fp16, [1, 256, 1, 1]> input_3_cast_fp16 = expand_dims(axes = input_3_axes_0, x = var_305_cast_fp16)[name = string("input_3_cast_fp16")];
+            string quantized_pad_type_0 = const()[name = string("quantized_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> quantized_strides_0 = const()[name = string("quantized_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> quantized_pad_0 = const()[name = string("quantized_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> quantized_dilations_0 = const()[name = string("quantized_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 quantized_groups_0 = const()[name = string("quantized_groups_0"), val = int32(1)];
+            tensor<fp16, [512, 256, 1, 1]> decoder_quantizer_rvq_first_output_proj_weight_to_fp16 = const()[name = string("decoder_quantizer_rvq_first_output_proj_weight_to_fp16"), val = tensor<fp16, [512, 256, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(1048704)))];
+            tensor<fp16, [1, 512, 1, 1]> quantized_cast_fp16 = conv(dilations = quantized_dilations_0, groups = quantized_groups_0, pad = quantized_pad_0, pad_type = quantized_pad_type_0, strides = quantized_strides_0, weight = decoder_quantizer_rvq_first_output_proj_weight_to_fp16, x = input_3_cast_fp16)[name = string("quantized_cast_fp16")];
+            tensor<int32, [3]> codes_begin_0 = const()[name = string("codes_begin_0"), val = tensor<int32, [3]>([0, 1, 0])];
+            tensor<int32, [3]> codes_end_0 = const()[name = string("codes_end_0"), val = tensor<int32, [3]>([1, 16, 1])];
+            tensor<bool, [3]> codes_end_mask_0 = const()[name = string("codes_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<int32, [1, 15, 1]> codes = slice_by_index(begin = codes_begin_0, end = codes_end_0, end_mask = codes_end_mask_0, x = audio_codes)[name = string("codes")];
+            tensor<int32, [3]> var_315 = const()[name = string("op_315"), val = tensor<int32, [3]>([1, 0, 2])];
+            tensor<int32, [15]> var_317_split_sizes_0 = const()[name = string("op_317_split_sizes_0"), val = tensor<int32, [15]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(1310912)))];
+            int32 var_317_axis_0 = const()[name = string("op_317_axis_0"), val = int32(0)];
+            tensor<int32, [15, 1, 1]> var_316 = transpose(perm = var_315, x = codes)[name = string("transpose_27")];
+            tensor<int32, [1, 1, 1]> var_317_0, tensor<int32, [1, 1, 1]> var_317_1, tensor<int32, [1, 1, 1]> var_317_2, tensor<int32, [1, 1, 1]> var_317_3, tensor<int32, [1, 1, 1]> var_317_4, tensor<int32, [1, 1, 1]> var_317_5, tensor<int32, [1, 1, 1]> var_317_6, tensor<int32, [1, 1, 1]> var_317_7, tensor<int32, [1, 1, 1]> var_317_8, tensor<int32, [1, 1, 1]> var_317_9, tensor<int32, [1, 1, 1]> var_317_10, tensor<int32, [1, 1, 1]> var_317_11, tensor<int32, [1, 1, 1]> var_317_12, tensor<int32, [1, 1, 1]> var_317_13, tensor<int32, [1, 1, 1]> var_317_14 = split(axis = var_317_axis_0, split_sizes = var_317_split_sizes_0, x = var_316)[name = string("op_317")];
+            tensor<int32, [1]> squeeze_1_axes_0 = const()[name = string("squeeze_1_axes_0"), val = tensor<int32, [1]>([0])];
+            string var_317_0_to_uint16_dtype_0 = const()[name = string("op_317_0_to_uint16_dtype_0"), val = string("uint16")];
+            tensor<uint16, [1, 1, 1]> var_317_0_to_uint16 = cast(dtype = var_317_0_to_uint16_dtype_0, x = var_317_0)[name = string("cast_232")];
+            tensor<uint16, [1, 1]> squeeze_1_cast_uint16 = squeeze(axes = squeeze_1_axes_0, x = var_317_0_to_uint16)[name = string("squeeze_1_cast_uint16")];
+            tensor<int32, [1]> squeeze_2_axes_0 = const()[name = string("squeeze_2_axes_0"), val = tensor<int32, [1]>([0])];
+            string var_317_1_to_uint16_dtype_0 = const()[name = string("op_317_1_to_uint16_dtype_0"), val = string("uint16")];
+            tensor<uint16, [1, 1, 1]> var_317_1_to_uint16 = cast(dtype = var_317_1_to_uint16_dtype_0, x = var_317_1)[name = string("cast_231")];
+            tensor<uint16, [1, 1]> squeeze_2_cast_uint16 = squeeze(axes = squeeze_2_axes_0, x = var_317_1_to_uint16)[name = string("squeeze_2_cast_uint16")];
+            tensor<int32, [1]> squeeze_3_axes_0 = const()[name = string("squeeze_3_axes_0"), val = tensor<int32, [1]>([0])];
+            string var_317_2_to_uint16_dtype_0 = const()[name = string("op_317_2_to_uint16_dtype_0"), val = string("uint16")];
+            tensor<uint16, [1, 1, 1]> var_317_2_to_uint16 = cast(dtype = var_317_2_to_uint16_dtype_0, x = var_317_2)[name = string("cast_230")];
+            tensor<uint16, [1, 1]> squeeze_3_cast_uint16 = squeeze(axes = squeeze_3_axes_0, x = var_317_2_to_uint16)[name = string("squeeze_3_cast_uint16")];
+            tensor<int32, [1]> squeeze_4_axes_0 = const()[name = string("squeeze_4_axes_0"), val = tensor<int32, [1]>([0])];
+            string var_317_3_to_uint16_dtype_0 = const()[name = string("op_317_3_to_uint16_dtype_0"), val = string("uint16")];
+            tensor<uint16, [1, 1, 1]> var_317_3_to_uint16 = cast(dtype = var_317_3_to_uint16_dtype_0, x = var_317_3)[name = string("cast_229")];
+            tensor<uint16, [1, 1]> squeeze_4_cast_uint16 = squeeze(axes = squeeze_4_axes_0, x = var_317_3_to_uint16)[name = string("squeeze_4_cast_uint16")];
+            tensor<int32, [1]> squeeze_5_axes_0 = const()[name = string("squeeze_5_axes_0"), val = tensor<int32, [1]>([0])];
+            string var_317_4_to_uint16_dtype_0 = const()[name = string("op_317_4_to_uint16_dtype_0"), val = string("uint16")];
+            tensor<uint16, [1, 1, 1]> var_317_4_to_uint16 = cast(dtype = var_317_4_to_uint16_dtype_0, x = var_317_4)[name = string("cast_228")];
+            tensor<uint16, [1, 1]> squeeze_5_cast_uint16 = squeeze(axes = squeeze_5_axes_0, x = var_317_4_to_uint16)[name = string("squeeze_5_cast_uint16")];
+            tensor<int32, [1]> squeeze_6_axes_0 = const()[name = string("squeeze_6_axes_0"), val = tensor<int32, [1]>([0])];
+            string var_317_5_to_uint16_dtype_0 = const()[name = string("op_317_5_to_uint16_dtype_0"), val = string("uint16")];
+            tensor<uint16, [1, 1, 1]> var_317_5_to_uint16 = cast(dtype = var_317_5_to_uint16_dtype_0, x = var_317_5)[name = string("cast_227")];
+            tensor<uint16, [1, 1]> squeeze_6_cast_uint16 = squeeze(axes = squeeze_6_axes_0, x = var_317_5_to_uint16)[name = string("squeeze_6_cast_uint16")];
+            tensor<int32, [1]> squeeze_7_axes_0 = const()[name = string("squeeze_7_axes_0"), val = tensor<int32, [1]>([0])];
+            string var_317_6_to_uint16_dtype_0 = const()[name = string("op_317_6_to_uint16_dtype_0"), val = string("uint16")];
+            tensor<uint16, [1, 1, 1]> var_317_6_to_uint16 = cast(dtype = var_317_6_to_uint16_dtype_0, x = var_317_6)[name = string("cast_226")];
+            tensor<uint16, [1, 1]> squeeze_7_cast_uint16 = squeeze(axes = squeeze_7_axes_0, x = var_317_6_to_uint16)[name = string("squeeze_7_cast_uint16")];
+            tensor<int32, [1]> squeeze_8_axes_0 = const()[name = string("squeeze_8_axes_0"), val = tensor<int32, [1]>([0])];
+            string var_317_7_to_uint16_dtype_0 = const()[name = string("op_317_7_to_uint16_dtype_0"), val = string("uint16")];
+            tensor<uint16, [1, 1, 1]> var_317_7_to_uint16 = cast(dtype = var_317_7_to_uint16_dtype_0, x = var_317_7)[name = string("cast_225")];
+            tensor<uint16, [1, 1]> squeeze_8_cast_uint16 = squeeze(axes = squeeze_8_axes_0, x = var_317_7_to_uint16)[name = string("squeeze_8_cast_uint16")];
+            tensor<int32, [1]> squeeze_9_axes_0 = const()[name = string("squeeze_9_axes_0"), val = tensor<int32, [1]>([0])];
+            string var_317_8_to_uint16_dtype_0 = const()[name = string("op_317_8_to_uint16_dtype_0"), val = string("uint16")];
+            tensor<uint16, [1, 1, 1]> var_317_8_to_uint16 = cast(dtype = var_317_8_to_uint16_dtype_0, x = var_317_8)[name = string("cast_224")];
+            tensor<uint16, [1, 1]> squeeze_9_cast_uint16 = squeeze(axes = squeeze_9_axes_0, x = var_317_8_to_uint16)[name = string("squeeze_9_cast_uint16")];
+            tensor<int32, [1]> squeeze_10_axes_0 = const()[name = string("squeeze_10_axes_0"), val = tensor<int32, [1]>([0])];
+            string var_317_9_to_uint16_dtype_0 = const()[name = string("op_317_9_to_uint16_dtype_0"), val = string("uint16")];
+            tensor<uint16, [1, 1, 1]> var_317_9_to_uint16 = cast(dtype = var_317_9_to_uint16_dtype_0, x = var_317_9)[name = string("cast_223")];
+            tensor<uint16, [1, 1]> squeeze_10_cast_uint16 = squeeze(axes = squeeze_10_axes_0, x = var_317_9_to_uint16)[name = string("squeeze_10_cast_uint16")];
+            tensor<int32, [1]> squeeze_11_axes_0 = const()[name = string("squeeze_11_axes_0"), val = tensor<int32, [1]>([0])];
+            string var_317_10_to_uint16_dtype_0 = const()[name = string("op_317_10_to_uint16_dtype_0"), val = string("uint16")];
+            tensor<uint16, [1, 1, 1]> var_317_10_to_uint16 = cast(dtype = var_317_10_to_uint16_dtype_0, x = var_317_10)[name = string("cast_222")];
+            tensor<uint16, [1, 1]> squeeze_11_cast_uint16 = squeeze(axes = squeeze_11_axes_0, x = var_317_10_to_uint16)[name = string("squeeze_11_cast_uint16")];
+            tensor<int32, [1]> squeeze_12_axes_0 = const()[name = string("squeeze_12_axes_0"), val = tensor<int32, [1]>([0])];
+            string var_317_11_to_uint16_dtype_0 = const()[name = string("op_317_11_to_uint16_dtype_0"), val = string("uint16")];
+            tensor<uint16, [1, 1, 1]> var_317_11_to_uint16 = cast(dtype = var_317_11_to_uint16_dtype_0, x = var_317_11)[name = string("cast_221")];
+            tensor<uint16, [1, 1]> squeeze_12_cast_uint16 = squeeze(axes = squeeze_12_axes_0, x = var_317_11_to_uint16)[name = string("squeeze_12_cast_uint16")];
+            tensor<int32, [1]> squeeze_13_axes_0 = const()[name = string("squeeze_13_axes_0"), val = tensor<int32, [1]>([0])];
+            string var_317_12_to_uint16_dtype_0 = const()[name = string("op_317_12_to_uint16_dtype_0"), val = string("uint16")];
+            tensor<uint16, [1, 1, 1]> var_317_12_to_uint16 = cast(dtype = var_317_12_to_uint16_dtype_0, x = var_317_12)[name = string("cast_220")];
+            tensor<uint16, [1, 1]> squeeze_13_cast_uint16 = squeeze(axes = squeeze_13_axes_0, x = var_317_12_to_uint16)[name = string("squeeze_13_cast_uint16")];
+            tensor<int32, [1]> squeeze_14_axes_0 = const()[name = string("squeeze_14_axes_0"), val = tensor<int32, [1]>([0])];
+            string var_317_13_to_uint16_dtype_0 = const()[name = string("op_317_13_to_uint16_dtype_0"), val = string("uint16")];
+            tensor<uint16, [1, 1, 1]> var_317_13_to_uint16 = cast(dtype = var_317_13_to_uint16_dtype_0, x = var_317_13)[name = string("cast_219")];
+            tensor<uint16, [1, 1]> squeeze_14_cast_uint16 = squeeze(axes = squeeze_14_axes_0, x = var_317_13_to_uint16)[name = string("squeeze_14_cast_uint16")];
+            tensor<int32, [1]> squeeze_15_axes_0 = const()[name = string("squeeze_15_axes_0"), val = tensor<int32, [1]>([0])];
+            string var_317_14_to_uint16_dtype_0 = const()[name = string("op_317_14_to_uint16_dtype_0"), val = string("uint16")];
+            tensor<uint16, [1, 1, 1]> var_317_14_to_uint16 = cast(dtype = var_317_14_to_uint16_dtype_0, x = var_317_14)[name = string("cast_218")];
+            tensor<uint16, [1, 1]> squeeze_15_cast_uint16 = squeeze(axes = squeeze_15_axes_0, x = var_317_14_to_uint16)[name = string("squeeze_15_cast_uint16")];
+            int32 quantized_3_axis_0 = const()[name = string("quantized_3_axis_0"), val = int32(0)];
+            int32 quantized_3_batch_dims_0 = const()[name = string("quantized_3_batch_dims_0"), val = int32(0)];
+            bool quantized_3_validate_indices_0 = const()[name = string("quantized_3_validate_indices_0"), val = bool(false)];
+            tensor<fp16, [2048, 256]> weight_5_to_fp16 = const()[name = string("weight_5_to_fp16"), val = tensor<fp16, [2048, 256]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(1311040)))];
+            tensor<fp16, [1, 1, 256]> quantized_3_cast_fp16_cast_uint16 = gather(axis = quantized_3_axis_0, batch_dims = quantized_3_batch_dims_0, indices = squeeze_1_cast_uint16, validate_indices = quantized_3_validate_indices_0, x = weight_5_to_fp16)[name = string("quantized_3_cast_fp16_cast_uint16")];
+            tensor<int32, [3]> var_338 = const()[name = string("op_338"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> quantized_7_axes_0 = const()[name = string("quantized_7_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 256, 1]> var_339_cast_fp16 = transpose(perm = var_338, x = quantized_3_cast_fp16_cast_uint16)[name = string("transpose_26")];
+            tensor<fp16, [1, 256, 1, 1]> quantized_7_cast_fp16 = expand_dims(axes = quantized_7_axes_0, x = var_339_cast_fp16)[name = string("quantized_7_cast_fp16")];
+            int32 quantized_5_axis_0 = const()[name = string("quantized_5_axis_0"), val = int32(0)];
+            int32 quantized_5_batch_dims_0 = const()[name = string("quantized_5_batch_dims_0"), val = int32(0)];
+            bool quantized_5_validate_indices_0 = const()[name = string("quantized_5_validate_indices_0"), val = bool(false)];
+            tensor<fp16, [2048, 256]> weight_7_to_fp16 = const()[name = string("weight_7_to_fp16"), val = tensor<fp16, [2048, 256]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(2359680)))];
+            tensor<fp16, [1, 1, 256]> quantized_5_cast_fp16_cast_uint16 = gather(axis = quantized_5_axis_0, batch_dims = quantized_5_batch_dims_0, indices = squeeze_2_cast_uint16, validate_indices = quantized_5_validate_indices_0, x = weight_7_to_fp16)[name = string("quantized_5_cast_fp16_cast_uint16")];
+            tensor<int32, [3]> var_346 = const()[name = string("op_346"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> layer_out_1_axes_0 = const()[name = string("layer_out_1_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 256, 1]> var_347_cast_fp16 = transpose(perm = var_346, x = quantized_5_cast_fp16_cast_uint16)[name = string("transpose_25")];
+            tensor<fp16, [1, 256, 1, 1]> layer_out_1_cast_fp16 = expand_dims(axes = layer_out_1_axes_0, x = var_347_cast_fp16)[name = string("layer_out_1_cast_fp16")];
+            tensor<fp16, [1, 256, 1, 1]> quantized_11_cast_fp16 = add(x = quantized_7_cast_fp16, y = layer_out_1_cast_fp16)[name = string("quantized_11_cast_fp16")];
+            int32 quantized_9_axis_0 = const()[name = string("quantized_9_axis_0"), val = int32(0)];
+            int32 quantized_9_batch_dims_0 = const()[name = string("quantized_9_batch_dims_0"), val = int32(0)];
+            bool quantized_9_validate_indices_0 = const()[name = string("quantized_9_validate_indices_0"), val = bool(false)];
+            tensor<fp16, [2048, 256]> weight_9_to_fp16 = const()[name = string("weight_9_to_fp16"), val = tensor<fp16, [2048, 256]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(3408320)))];
+            tensor<fp16, [1, 1, 256]> quantized_9_cast_fp16_cast_uint16 = gather(axis = quantized_9_axis_0, batch_dims = quantized_9_batch_dims_0, indices = squeeze_3_cast_uint16, validate_indices = quantized_9_validate_indices_0, x = weight_9_to_fp16)[name = string("quantized_9_cast_fp16_cast_uint16")];
+            tensor<int32, [3]> var_355 = const()[name = string("op_355"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> layer_out_3_axes_0 = const()[name = string("layer_out_3_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 256, 1]> var_356_cast_fp16 = transpose(perm = var_355, x = quantized_9_cast_fp16_cast_uint16)[name = string("transpose_24")];
+            tensor<fp16, [1, 256, 1, 1]> layer_out_3_cast_fp16 = expand_dims(axes = layer_out_3_axes_0, x = var_356_cast_fp16)[name = string("layer_out_3_cast_fp16")];
+            tensor<fp16, [1, 256, 1, 1]> quantized_15_cast_fp16 = add(x = quantized_11_cast_fp16, y = layer_out_3_cast_fp16)[name = string("quantized_15_cast_fp16")];
+            int32 quantized_13_axis_0 = const()[name = string("quantized_13_axis_0"), val = int32(0)];
+            int32 quantized_13_batch_dims_0 = const()[name = string("quantized_13_batch_dims_0"), val = int32(0)];
+            bool quantized_13_validate_indices_0 = const()[name = string("quantized_13_validate_indices_0"), val = bool(false)];
+            tensor<fp16, [2048, 256]> weight_11_to_fp16 = const()[name = string("weight_11_to_fp16"), val = tensor<fp16, [2048, 256]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(4456960)))];
+            tensor<fp16, [1, 1, 256]> quantized_13_cast_fp16_cast_uint16 = gather(axis = quantized_13_axis_0, batch_dims = quantized_13_batch_dims_0, indices = squeeze_4_cast_uint16, validate_indices = quantized_13_validate_indices_0, x = weight_11_to_fp16)[name = string("quantized_13_cast_fp16_cast_uint16")];
+            tensor<int32, [3]> var_364 = const()[name = string("op_364"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> layer_out_5_axes_0 = const()[name = string("layer_out_5_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 256, 1]> var_365_cast_fp16 = transpose(perm = var_364, x = quantized_13_cast_fp16_cast_uint16)[name = string("transpose_23")];
+            tensor<fp16, [1, 256, 1, 1]> layer_out_5_cast_fp16 = expand_dims(axes = layer_out_5_axes_0, x = var_365_cast_fp16)[name = string("layer_out_5_cast_fp16")];
+            tensor<fp16, [1, 256, 1, 1]> quantized_19_cast_fp16 = add(x = quantized_15_cast_fp16, y = layer_out_5_cast_fp16)[name = string("quantized_19_cast_fp16")];
+            int32 quantized_17_axis_0 = const()[name = string("quantized_17_axis_0"), val = int32(0)];
+            int32 quantized_17_batch_dims_0 = const()[name = string("quantized_17_batch_dims_0"), val = int32(0)];
+            bool quantized_17_validate_indices_0 = const()[name = string("quantized_17_validate_indices_0"), val = bool(false)];
+            tensor<fp16, [2048, 256]> weight_13_to_fp16 = const()[name = string("weight_13_to_fp16"), val = tensor<fp16, [2048, 256]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(5505600)))];
+            tensor<fp16, [1, 1, 256]> quantized_17_cast_fp16_cast_uint16 = gather(axis = quantized_17_axis_0, batch_dims = quantized_17_batch_dims_0, indices = squeeze_5_cast_uint16, validate_indices = quantized_17_validate_indices_0, x = weight_13_to_fp16)[name = string("quantized_17_cast_fp16_cast_uint16")];
+            tensor<int32, [3]> var_373 = const()[name = string("op_373"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> layer_out_7_axes_0 = const()[name = string("layer_out_7_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 256, 1]> var_374_cast_fp16 = transpose(perm = var_373, x = quantized_17_cast_fp16_cast_uint16)[name = string("transpose_22")];
+            tensor<fp16, [1, 256, 1, 1]> layer_out_7_cast_fp16 = expand_dims(axes = layer_out_7_axes_0, x = var_374_cast_fp16)[name = string("layer_out_7_cast_fp16")];
+            tensor<fp16, [1, 256, 1, 1]> quantized_23_cast_fp16 = add(x = quantized_19_cast_fp16, y = layer_out_7_cast_fp16)[name = string("quantized_23_cast_fp16")];
+            int32 quantized_21_axis_0 = const()[name = string("quantized_21_axis_0"), val = int32(0)];
+            int32 quantized_21_batch_dims_0 = const()[name = string("quantized_21_batch_dims_0"), val = int32(0)];
+            bool quantized_21_validate_indices_0 = const()[name = string("quantized_21_validate_indices_0"), val = bool(false)];
+            tensor<fp16, [2048, 256]> weight_15_to_fp16 = const()[name = string("weight_15_to_fp16"), val = tensor<fp16, [2048, 256]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(6554240)))];
+            tensor<fp16, [1, 1, 256]> quantized_21_cast_fp16_cast_uint16 = gather(axis = quantized_21_axis_0, batch_dims = quantized_21_batch_dims_0, indices = squeeze_6_cast_uint16, validate_indices = quantized_21_validate_indices_0, x = weight_15_to_fp16)[name = string("quantized_21_cast_fp16_cast_uint16")];
+            tensor<int32, [3]> var_382 = const()[name = string("op_382"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> layer_out_9_axes_0 = const()[name = string("layer_out_9_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 256, 1]> var_383_cast_fp16 = transpose(perm = var_382, x = quantized_21_cast_fp16_cast_uint16)[name = string("transpose_21")];
+            tensor<fp16, [1, 256, 1, 1]> layer_out_9_cast_fp16 = expand_dims(axes = layer_out_9_axes_0, x = var_383_cast_fp16)[name = string("layer_out_9_cast_fp16")];
+            tensor<fp16, [1, 256, 1, 1]> quantized_27_cast_fp16 = add(x = quantized_23_cast_fp16, y = layer_out_9_cast_fp16)[name = string("quantized_27_cast_fp16")];
+            int32 quantized_25_axis_0 = const()[name = string("quantized_25_axis_0"), val = int32(0)];
+            int32 quantized_25_batch_dims_0 = const()[name = string("quantized_25_batch_dims_0"), val = int32(0)];
+            bool quantized_25_validate_indices_0 = const()[name = string("quantized_25_validate_indices_0"), val = bool(false)];
+            tensor<fp16, [2048, 256]> weight_17_to_fp16 = const()[name = string("weight_17_to_fp16"), val = tensor<fp16, [2048, 256]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(7602880)))];
+            tensor<fp16, [1, 1, 256]> quantized_25_cast_fp16_cast_uint16 = gather(axis = quantized_25_axis_0, batch_dims = quantized_25_batch_dims_0, indices = squeeze_7_cast_uint16, validate_indices = quantized_25_validate_indices_0, x = weight_17_to_fp16)[name = string("quantized_25_cast_fp16_cast_uint16")];
+            tensor<int32, [3]> var_391 = const()[name = string("op_391"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> layer_out_11_axes_0 = const()[name = string("layer_out_11_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 256, 1]> var_392_cast_fp16 = transpose(perm = var_391, x = quantized_25_cast_fp16_cast_uint16)[name = string("transpose_20")];
+            tensor<fp16, [1, 256, 1, 1]> layer_out_11_cast_fp16 = expand_dims(axes = layer_out_11_axes_0, x = var_392_cast_fp16)[name = string("layer_out_11_cast_fp16")];
+            tensor<fp16, [1, 256, 1, 1]> quantized_31_cast_fp16 = add(x = quantized_27_cast_fp16, y = layer_out_11_cast_fp16)[name = string("quantized_31_cast_fp16")];
+            int32 quantized_29_axis_0 = const()[name = string("quantized_29_axis_0"), val = int32(0)];
+            int32 quantized_29_batch_dims_0 = const()[name = string("quantized_29_batch_dims_0"), val = int32(0)];
+            bool quantized_29_validate_indices_0 = const()[name = string("quantized_29_validate_indices_0"), val = bool(false)];
+            tensor<fp16, [2048, 256]> weight_19_to_fp16 = const()[name = string("weight_19_to_fp16"), val = tensor<fp16, [2048, 256]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(8651520)))];
+            tensor<fp16, [1, 1, 256]> quantized_29_cast_fp16_cast_uint16 = gather(axis = quantized_29_axis_0, batch_dims = quantized_29_batch_dims_0, indices = squeeze_8_cast_uint16, validate_indices = quantized_29_validate_indices_0, x = weight_19_to_fp16)[name = string("quantized_29_cast_fp16_cast_uint16")];
+            tensor<int32, [3]> var_400 = const()[name = string("op_400"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> layer_out_13_axes_0 = const()[name = string("layer_out_13_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 256, 1]> var_401_cast_fp16 = transpose(perm = var_400, x = quantized_29_cast_fp16_cast_uint16)[name = string("transpose_19")];
+            tensor<fp16, [1, 256, 1, 1]> layer_out_13_cast_fp16 = expand_dims(axes = layer_out_13_axes_0, x = var_401_cast_fp16)[name = string("layer_out_13_cast_fp16")];
+            tensor<fp16, [1, 256, 1, 1]> quantized_35_cast_fp16 = add(x = quantized_31_cast_fp16, y = layer_out_13_cast_fp16)[name = string("quantized_35_cast_fp16")];
+            int32 quantized_33_axis_0 = const()[name = string("quantized_33_axis_0"), val = int32(0)];
+            int32 quantized_33_batch_dims_0 = const()[name = string("quantized_33_batch_dims_0"), val = int32(0)];
+            bool quantized_33_validate_indices_0 = const()[name = string("quantized_33_validate_indices_0"), val = bool(false)];
+            tensor<fp16, [2048, 256]> weight_21_to_fp16 = const()[name = string("weight_21_to_fp16"), val = tensor<fp16, [2048, 256]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(9700160)))];
+            tensor<fp16, [1, 1, 256]> quantized_33_cast_fp16_cast_uint16 = gather(axis = quantized_33_axis_0, batch_dims = quantized_33_batch_dims_0, indices = squeeze_9_cast_uint16, validate_indices = quantized_33_validate_indices_0, x = weight_21_to_fp16)[name = string("quantized_33_cast_fp16_cast_uint16")];
+            tensor<int32, [3]> var_409 = const()[name = string("op_409"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> layer_out_15_axes_0 = const()[name = string("layer_out_15_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 256, 1]> var_410_cast_fp16 = transpose(perm = var_409, x = quantized_33_cast_fp16_cast_uint16)[name = string("transpose_18")];
+            tensor<fp16, [1, 256, 1, 1]> layer_out_15_cast_fp16 = expand_dims(axes = layer_out_15_axes_0, x = var_410_cast_fp16)[name = string("layer_out_15_cast_fp16")];
+            tensor<fp16, [1, 256, 1, 1]> quantized_39_cast_fp16 = add(x = quantized_35_cast_fp16, y = layer_out_15_cast_fp16)[name = string("quantized_39_cast_fp16")];
+            int32 quantized_37_axis_0 = const()[name = string("quantized_37_axis_0"), val = int32(0)];
+            int32 quantized_37_batch_dims_0 = const()[name = string("quantized_37_batch_dims_0"), val = int32(0)];
+            bool quantized_37_validate_indices_0 = const()[name = string("quantized_37_validate_indices_0"), val = bool(false)];
+            tensor<fp16, [2048, 256]> weight_23_to_fp16 = const()[name = string("weight_23_to_fp16"), val = tensor<fp16, [2048, 256]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(10748800)))];
+            tensor<fp16, [1, 1, 256]> quantized_37_cast_fp16_cast_uint16 = gather(axis = quantized_37_axis_0, batch_dims = quantized_37_batch_dims_0, indices = squeeze_10_cast_uint16, validate_indices = quantized_37_validate_indices_0, x = weight_23_to_fp16)[name = string("quantized_37_cast_fp16_cast_uint16")];
+            tensor<int32, [3]> var_418 = const()[name = string("op_418"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> layer_out_17_axes_0 = const()[name = string("layer_out_17_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 256, 1]> var_419_cast_fp16 = transpose(perm = var_418, x = quantized_37_cast_fp16_cast_uint16)[name = string("transpose_17")];
+            tensor<fp16, [1, 256, 1, 1]> layer_out_17_cast_fp16 = expand_dims(axes = layer_out_17_axes_0, x = var_419_cast_fp16)[name = string("layer_out_17_cast_fp16")];
+            tensor<fp16, [1, 256, 1, 1]> quantized_43_cast_fp16 = add(x = quantized_39_cast_fp16, y = layer_out_17_cast_fp16)[name = string("quantized_43_cast_fp16")];
+            int32 quantized_41_axis_0 = const()[name = string("quantized_41_axis_0"), val = int32(0)];
+            int32 quantized_41_batch_dims_0 = const()[name = string("quantized_41_batch_dims_0"), val = int32(0)];
+            bool quantized_41_validate_indices_0 = const()[name = string("quantized_41_validate_indices_0"), val = bool(false)];
+            tensor<fp16, [2048, 256]> weight_25_to_fp16 = const()[name = string("weight_25_to_fp16"), val = tensor<fp16, [2048, 256]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(11797440)))];
+            tensor<fp16, [1, 1, 256]> quantized_41_cast_fp16_cast_uint16 = gather(axis = quantized_41_axis_0, batch_dims = quantized_41_batch_dims_0, indices = squeeze_11_cast_uint16, validate_indices = quantized_41_validate_indices_0, x = weight_25_to_fp16)[name = string("quantized_41_cast_fp16_cast_uint16")];
+            tensor<int32, [3]> var_427 = const()[name = string("op_427"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> layer_out_19_axes_0 = const()[name = string("layer_out_19_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 256, 1]> var_428_cast_fp16 = transpose(perm = var_427, x = quantized_41_cast_fp16_cast_uint16)[name = string("transpose_16")];
+            tensor<fp16, [1, 256, 1, 1]> layer_out_19_cast_fp16 = expand_dims(axes = layer_out_19_axes_0, x = var_428_cast_fp16)[name = string("layer_out_19_cast_fp16")];
+            tensor<fp16, [1, 256, 1, 1]> quantized_47_cast_fp16 = add(x = quantized_43_cast_fp16, y = layer_out_19_cast_fp16)[name = string("quantized_47_cast_fp16")];
+            int32 quantized_45_axis_0 = const()[name = string("quantized_45_axis_0"), val = int32(0)];
+            int32 quantized_45_batch_dims_0 = const()[name = string("quantized_45_batch_dims_0"), val = int32(0)];
+            bool quantized_45_validate_indices_0 = const()[name = string("quantized_45_validate_indices_0"), val = bool(false)];
+            tensor<fp16, [2048, 256]> weight_27_to_fp16 = const()[name = string("weight_27_to_fp16"), val = tensor<fp16, [2048, 256]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(12846080)))];
+            tensor<fp16, [1, 1, 256]> quantized_45_cast_fp16_cast_uint16 = gather(axis = quantized_45_axis_0, batch_dims = quantized_45_batch_dims_0, indices = squeeze_12_cast_uint16, validate_indices = quantized_45_validate_indices_0, x = weight_27_to_fp16)[name = string("quantized_45_cast_fp16_cast_uint16")];
+            tensor<int32, [3]> var_436 = const()[name = string("op_436"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> layer_out_21_axes_0 = const()[name = string("layer_out_21_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 256, 1]> var_437_cast_fp16 = transpose(perm = var_436, x = quantized_45_cast_fp16_cast_uint16)[name = string("transpose_15")];
+            tensor<fp16, [1, 256, 1, 1]> layer_out_21_cast_fp16 = expand_dims(axes = layer_out_21_axes_0, x = var_437_cast_fp16)[name = string("layer_out_21_cast_fp16")];
+            tensor<fp16, [1, 256, 1, 1]> quantized_51_cast_fp16 = add(x = quantized_47_cast_fp16, y = layer_out_21_cast_fp16)[name = string("quantized_51_cast_fp16")];
+            int32 quantized_49_axis_0 = const()[name = string("quantized_49_axis_0"), val = int32(0)];
+            int32 quantized_49_batch_dims_0 = const()[name = string("quantized_49_batch_dims_0"), val = int32(0)];
+            bool quantized_49_validate_indices_0 = const()[name = string("quantized_49_validate_indices_0"), val = bool(false)];
+            tensor<fp16, [2048, 256]> weight_29_to_fp16 = const()[name = string("weight_29_to_fp16"), val = tensor<fp16, [2048, 256]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(13894720)))];
+            tensor<fp16, [1, 1, 256]> quantized_49_cast_fp16_cast_uint16 = gather(axis = quantized_49_axis_0, batch_dims = quantized_49_batch_dims_0, indices = squeeze_13_cast_uint16, validate_indices = quantized_49_validate_indices_0, x = weight_29_to_fp16)[name = string("quantized_49_cast_fp16_cast_uint16")];
+            tensor<int32, [3]> var_445 = const()[name = string("op_445"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> layer_out_23_axes_0 = const()[name = string("layer_out_23_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 256, 1]> var_446_cast_fp16 = transpose(perm = var_445, x = quantized_49_cast_fp16_cast_uint16)[name = string("transpose_14")];
+            tensor<fp16, [1, 256, 1, 1]> layer_out_23_cast_fp16 = expand_dims(axes = layer_out_23_axes_0, x = var_446_cast_fp16)[name = string("layer_out_23_cast_fp16")];
+            tensor<fp16, [1, 256, 1, 1]> quantized_55_cast_fp16 = add(x = quantized_51_cast_fp16, y = layer_out_23_cast_fp16)[name = string("quantized_55_cast_fp16")];
+            int32 quantized_53_axis_0 = const()[name = string("quantized_53_axis_0"), val = int32(0)];
+            int32 quantized_53_batch_dims_0 = const()[name = string("quantized_53_batch_dims_0"), val = int32(0)];
+            bool quantized_53_validate_indices_0 = const()[name = string("quantized_53_validate_indices_0"), val = bool(false)];
+            tensor<fp16, [2048, 256]> weight_31_to_fp16 = const()[name = string("weight_31_to_fp16"), val = tensor<fp16, [2048, 256]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(14943360)))];
+            tensor<fp16, [1, 1, 256]> quantized_53_cast_fp16_cast_uint16 = gather(axis = quantized_53_axis_0, batch_dims = quantized_53_batch_dims_0, indices = squeeze_14_cast_uint16, validate_indices = quantized_53_validate_indices_0, x = weight_31_to_fp16)[name = string("quantized_53_cast_fp16_cast_uint16")];
+            tensor<int32, [3]> var_454 = const()[name = string("op_454"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> layer_out_25_axes_0 = const()[name = string("layer_out_25_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 256, 1]> var_455_cast_fp16 = transpose(perm = var_454, x = quantized_53_cast_fp16_cast_uint16)[name = string("transpose_13")];
+            tensor<fp16, [1, 256, 1, 1]> layer_out_25_cast_fp16 = expand_dims(axes = layer_out_25_axes_0, x = var_455_cast_fp16)[name = string("layer_out_25_cast_fp16")];
+            tensor<fp16, [1, 256, 1, 1]> quantized_59_cast_fp16 = add(x = quantized_55_cast_fp16, y = layer_out_25_cast_fp16)[name = string("quantized_59_cast_fp16")];
+            int32 quantized_57_axis_0 = const()[name = string("quantized_57_axis_0"), val = int32(0)];
+            int32 quantized_57_batch_dims_0 = const()[name = string("quantized_57_batch_dims_0"), val = int32(0)];
+            bool quantized_57_validate_indices_0 = const()[name = string("quantized_57_validate_indices_0"), val = bool(false)];
+            tensor<fp16, [2048, 256]> weight_33_to_fp16 = const()[name = string("weight_33_to_fp16"), val = tensor<fp16, [2048, 256]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(15992000)))];
+            tensor<fp16, [1, 1, 256]> quantized_57_cast_fp16_cast_uint16 = gather(axis = quantized_57_axis_0, batch_dims = quantized_57_batch_dims_0, indices = squeeze_15_cast_uint16, validate_indices = quantized_57_validate_indices_0, x = weight_33_to_fp16)[name = string("quantized_57_cast_fp16_cast_uint16")];
+            tensor<int32, [3]> var_463 = const()[name = string("op_463"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> layer_out_axes_0 = const()[name = string("layer_out_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 256, 1]> var_464_cast_fp16 = transpose(perm = var_463, x = quantized_57_cast_fp16_cast_uint16)[name = string("transpose_12")];
+            tensor<fp16, [1, 256, 1, 1]> layer_out_cast_fp16 = expand_dims(axes = layer_out_axes_0, x = var_464_cast_fp16)[name = string("layer_out_cast_fp16")];
+            tensor<fp16, [1, 256, 1, 1]> input_35_cast_fp16 = add(x = quantized_59_cast_fp16, y = layer_out_cast_fp16)[name = string("input_35_cast_fp16")];
+            string var_472_pad_type_0 = const()[name = string("op_472_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_472_strides_0 = const()[name = string("op_472_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_472_pad_0 = const()[name = string("op_472_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_472_dilations_0 = const()[name = string("op_472_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_472_groups_0 = const()[name = string("op_472_groups_0"), val = int32(1)];
+            tensor<fp16, [512, 256, 1, 1]> decoder_quantizer_rvq_rest_output_proj_weight_to_fp16 = const()[name = string("decoder_quantizer_rvq_rest_output_proj_weight_to_fp16"), val = tensor<fp16, [512, 256, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(17040640)))];
+            tensor<fp16, [1, 512, 1, 1]> var_472_cast_fp16 = conv(dilations = var_472_dilations_0, groups = var_472_groups_0, pad = var_472_pad_0, pad_type = var_472_pad_type_0, strides = var_472_strides_0, weight = decoder_quantizer_rvq_rest_output_proj_weight_to_fp16, x = input_35_cast_fp16)[name = string("op_472_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 1]> x_1_cast_fp16 = add(x = quantized_cast_fp16, y = var_472_cast_fp16)[name = string("x_1_cast_fp16")];
+            tensor<int32, [8]> input_37_pad_0 = const()[name = string("input_37_pad_0"), val = tensor<int32, [8]>([0, 0, 0, 0, 0, 0, 2, 0])];
+            string input_37_mode_0 = const()[name = string("input_37_mode_0"), val = string("constant")];
+            fp16 const_16_to_fp16 = const()[name = string("const_16_to_fp16"), val = fp16(0x0p+0)];
+            tensor<fp16, [1, 512, 1, 3]> input_37_cast_fp16 = pad(constant_val = const_16_to_fp16, mode = input_37_mode_0, pad = input_37_pad_0, x = x_1_cast_fp16)[name = string("input_37_cast_fp16")];
+            string input_39_pad_type_0 = const()[name = string("input_39_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> input_39_strides_0 = const()[name = string("input_39_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> input_39_pad_0 = const()[name = string("input_39_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> input_39_dilations_0 = const()[name = string("input_39_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 input_39_groups_0 = const()[name = string("input_39_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 512, 1, 3]> decoder_pre_conv_conv_weight_to_fp16 = const()[name = string("decoder_pre_conv_conv_weight_to_fp16"), val = tensor<fp16, [1024, 512, 1, 3]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(17302848)))];
+            tensor<fp16, [1024]> decoder_pre_conv_conv_bias_to_fp16 = const()[name = string("decoder_pre_conv_conv_bias_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(20448640)))];
+            tensor<fp16, [1, 1024, 1, 1]> input_39_cast_fp16 = conv(bias = decoder_pre_conv_conv_bias_to_fp16, dilations = input_39_dilations_0, groups = input_39_groups_0, pad = input_39_pad_0, pad_type = input_39_pad_type_0, strides = input_39_strides_0, weight = decoder_pre_conv_conv_weight_to_fp16, x = input_37_cast_fp16)[name = string("input_39_cast_fp16")];
+            string inputs_1_pad_type_0 = const()[name = string("inputs_1_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> inputs_1_strides_0 = const()[name = string("inputs_1_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> inputs_1_pad_0 = const()[name = string("inputs_1_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> inputs_1_dilations_0 = const()[name = string("inputs_1_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 inputs_1_groups_0 = const()[name = string("inputs_1_groups_0"), val = int32(1)];
+            tensor<fp16, [512, 1024, 1, 1]> decoder_pre_transformer_input_proj_weight_to_fp16 = const()[name = string("decoder_pre_transformer_input_proj_weight_to_fp16"), val = tensor<fp16, [512, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(20450752)))];
+            tensor<fp16, [512]> decoder_pre_transformer_input_proj_bias_to_fp16 = const()[name = string("decoder_pre_transformer_input_proj_bias_to_fp16"), val = tensor<fp16, [512]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(21499392)))];
+            tensor<fp16, [1, 512, 1, 1]> inputs_1_cast_fp16 = conv(bias = decoder_pre_transformer_input_proj_bias_to_fp16, dilations = inputs_1_dilations_0, groups = inputs_1_groups_0, pad = inputs_1_pad_0, pad_type = inputs_1_pad_type_0, strides = inputs_1_strides_0, weight = decoder_pre_transformer_input_proj_weight_to_fp16, x = input_39_cast_fp16)[name = string("inputs_1_cast_fp16")];
+            int32 pos_cos_1_axis_0 = const()[name = string("pos_cos_1_axis_0"), val = int32(0)];
+            int32 pos_cos_1_batch_dims_0 = const()[name = string("pos_cos_1_batch_dims_0"), val = int32(0)];
+            bool pos_cos_1_validate_indices_0 = const()[name = string("pos_cos_1_validate_indices_0"), val = bool(false)];
+            tensor<fp16, [256, 64]> decoder_pre_transformer_position_embeddings_cos_weight_to_fp16 = const()[name = string("decoder_pre_transformer_position_embeddings_cos_weight_to_fp16"), val = tensor<fp16, [256, 64]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(21500480)))];
+            string cache_length_to_uint16_dtype_0 = const()[name = string("cache_length_to_uint16_dtype_0"), val = string("uint16")];
+            tensor<uint16, [1]> cache_length_to_uint16 = cast(dtype = cache_length_to_uint16_dtype_0, x = cache_length)[name = string("cast_217")];
+            tensor<fp16, [1, 64]> pos_cos_1_cast_fp16_cast_uint16 = gather(axis = pos_cos_1_axis_0, batch_dims = pos_cos_1_batch_dims_0, indices = cache_length_to_uint16, validate_indices = pos_cos_1_validate_indices_0, x = decoder_pre_transformer_position_embeddings_cos_weight_to_fp16)[name = string("pos_cos_1_cast_fp16_cast_uint16")];
+            tensor<int32, [1]> pos_cos_axes_0 = const()[name = string("pos_cos_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 64, 1]> pos_cos_cast_fp16 = expand_dims(axes = pos_cos_axes_0, x = pos_cos_1_cast_fp16_cast_uint16)[name = string("pos_cos_cast_fp16")];
+            int32 pos_sin_1_axis_0 = const()[name = string("pos_sin_1_axis_0"), val = int32(0)];
+            int32 pos_sin_1_batch_dims_0 = const()[name = string("pos_sin_1_batch_dims_0"), val = int32(0)];
+            bool pos_sin_1_validate_indices_0 = const()[name = string("pos_sin_1_validate_indices_0"), val = bool(false)];
+            tensor<fp16, [256, 64]> decoder_pre_transformer_position_embeddings_sin_weight_to_fp16 = const()[name = string("decoder_pre_transformer_position_embeddings_sin_weight_to_fp16"), val = tensor<fp16, [256, 64]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(21533312)))];
+            tensor<fp16, [1, 64]> pos_sin_1_cast_fp16_cast_uint16 = gather(axis = pos_sin_1_axis_0, batch_dims = pos_sin_1_batch_dims_0, indices = cache_length_to_uint16, validate_indices = pos_sin_1_validate_indices_0, x = decoder_pre_transformer_position_embeddings_sin_weight_to_fp16)[name = string("pos_sin_1_cast_fp16_cast_uint16")];
+            tensor<int32, [1]> pos_sin_axes_0 = const()[name = string("pos_sin_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 64, 1]> pos_sin_cast_fp16 = expand_dims(axes = pos_sin_axes_0, x = pos_sin_1_cast_fp16_cast_uint16)[name = string("pos_sin_cast_fp16")];
+            tensor<int32, [8]> tile_0 = const()[name = string("tile_0"), val = tensor<int32, [8]>([1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024])];
+            int32 var_522_axis_0 = const()[name = string("op_522_axis_0"), val = int32(1)];
+            tensor<fp16, [1, 1024, 1, 256]> var_522_cast_fp16_0, tensor<fp16, [1, 1024, 1, 256]> var_522_cast_fp16_1, tensor<fp16, [1, 1024, 1, 256]> var_522_cast_fp16_2, tensor<fp16, [1, 1024, 1, 256]> var_522_cast_fp16_3, tensor<fp16, [1, 1024, 1, 256]> var_522_cast_fp16_4, tensor<fp16, [1, 1024, 1, 256]> var_522_cast_fp16_5, tensor<fp16, [1, 1024, 1, 256]> var_522_cast_fp16_6, tensor<fp16, [1, 1024, 1, 256]> var_522_cast_fp16_7 = split(axis = var_522_axis_0, split_sizes = tile_0, x = key_cache)[name = string("op_522_cast_fp16")];
+            tensor<int32, [8]> tile_1 = const()[name = string("tile_1"), val = tensor<int32, [8]>([1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024])];
+            int32 var_531_axis_0 = const()[name = string("op_531_axis_0"), val = int32(1)];
+            tensor<fp16, [1, 1024, 1, 256]> var_531_cast_fp16_0, tensor<fp16, [1, 1024, 1, 256]> var_531_cast_fp16_1, tensor<fp16, [1, 1024, 1, 256]> var_531_cast_fp16_2, tensor<fp16, [1, 1024, 1, 256]> var_531_cast_fp16_3, tensor<fp16, [1, 1024, 1, 256]> var_531_cast_fp16_4, tensor<fp16, [1, 1024, 1, 256]> var_531_cast_fp16_5, tensor<fp16, [1, 1024, 1, 256]> var_531_cast_fp16_6, tensor<fp16, [1, 1024, 1, 256]> var_531_cast_fp16_7 = split(axis = var_531_axis_0, split_sizes = tile_1, x = value_cache)[name = string("op_531_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 1]> inputs_sq_1_cast_fp16 = mul(x = inputs_1_cast_fp16, y = inputs_1_cast_fp16)[name = string("inputs_sq_1_cast_fp16")];
+            tensor<int32, [1]> variance_1_axes_0 = const()[name = string("variance_1_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_1_keep_dims_0 = const()[name = string("variance_1_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 1, 1, 1]> variance_1_cast_fp16 = reduce_mean(axes = variance_1_axes_0, keep_dims = variance_1_keep_dims_0, x = inputs_sq_1_cast_fp16)[name = string("variance_1_cast_fp16")];
+            fp16 var_550_to_fp16 = const()[name = string("op_550_to_fp16"), val = fp16(0x1.5p-17)];
+            tensor<fp16, [1, 1, 1, 1]> var_551_cast_fp16 = add(x = variance_1_cast_fp16, y = var_550_to_fp16)[name = string("op_551_cast_fp16")];
+            fp32 var_552_epsilon_0 = const()[name = string("op_552_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 1, 1, 1]> var_552_cast_fp16 = rsqrt(epsilon = var_552_epsilon_0, x = var_551_cast_fp16)[name = string("op_552_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 1]> hidden_states_1_cast_fp16 = mul(x = inputs_1_cast_fp16, y = var_552_cast_fp16)[name = string("hidden_states_1_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 1]> w_1_to_fp16 = const()[name = string("w_1_to_fp16"), val = tensor<fp16, [1, 512, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(21566144)))];
+            tensor<fp16, [1, 512, 1, 1]> x_3_cast_fp16 = mul(x = w_1_to_fp16, y = hidden_states_1_cast_fp16)[name = string("x_3_cast_fp16")];
+            string q_1_pad_type_0 = const()[name = string("q_1_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> q_1_strides_0 = const()[name = string("q_1_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> q_1_pad_0 = const()[name = string("q_1_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> q_1_dilations_0 = const()[name = string("q_1_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 q_1_groups_0 = const()[name = string("q_1_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 512, 1, 1]> decoder_pre_transformer_layers_0_self_attn_q_proj_weight_to_fp16 = const()[name = string("decoder_pre_transformer_layers_0_self_attn_q_proj_weight_to_fp16"), val = tensor<fp16, [1024, 512, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(21567232)))];
+            tensor<fp16, [1, 1024, 1, 1]> q_1_cast_fp16 = conv(dilations = q_1_dilations_0, groups = q_1_groups_0, pad = q_1_pad_0, pad_type = q_1_pad_type_0, strides = q_1_strides_0, weight = decoder_pre_transformer_layers_0_self_attn_q_proj_weight_to_fp16, x = x_3_cast_fp16)[name = string("q_1_cast_fp16")];
+            string k_1_pad_type_0 = const()[name = string("k_1_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> k_1_strides_0 = const()[name = string("k_1_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> k_1_pad_0 = const()[name = string("k_1_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> k_1_dilations_0 = const()[name = string("k_1_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 k_1_groups_0 = const()[name = string("k_1_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 512, 1, 1]> decoder_pre_transformer_layers_0_self_attn_k_proj_weight_to_fp16 = const()[name = string("decoder_pre_transformer_layers_0_self_attn_k_proj_weight_to_fp16"), val = tensor<fp16, [1024, 512, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(22615872)))];
+            tensor<fp16, [1, 1024, 1, 1]> k_1_cast_fp16 = conv(dilations = k_1_dilations_0, groups = k_1_groups_0, pad = k_1_pad_0, pad_type = k_1_pad_type_0, strides = k_1_strides_0, weight = decoder_pre_transformer_layers_0_self_attn_k_proj_weight_to_fp16, x = x_3_cast_fp16)[name = string("k_1_cast_fp16")];
+            string v_1_pad_type_0 = const()[name = string("v_1_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> v_1_strides_0 = const()[name = string("v_1_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> v_1_pad_0 = const()[name = string("v_1_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> v_1_dilations_0 = const()[name = string("v_1_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 v_1_groups_0 = const()[name = string("v_1_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 512, 1, 1]> decoder_pre_transformer_layers_0_self_attn_v_proj_weight_to_fp16 = const()[name = string("decoder_pre_transformer_layers_0_self_attn_v_proj_weight_to_fp16"), val = tensor<fp16, [1024, 512, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(23664512)))];
+            tensor<fp16, [1, 1024, 1, 1]> v_1_cast_fp16 = conv(dilations = v_1_dilations_0, groups = v_1_groups_0, pad = v_1_pad_0, pad_type = v_1_pad_type_0, strides = v_1_strides_0, weight = decoder_pre_transformer_layers_0_self_attn_v_proj_weight_to_fp16, x = x_3_cast_fp16)[name = string("v_1_cast_fp16")];
+            tensor<int32, [4]> var_584 = const()[name = string("op_584"), val = tensor<int32, [4]>([16, 64, 1, 1])];
+            tensor<fp16, [16, 64, 1, 1]> q_3_cast_fp16 = reshape(shape = var_584, x = q_1_cast_fp16)[name = string("q_3_cast_fp16")];
+            tensor<int32, [4]> var_589 = const()[name = string("op_589"), val = tensor<int32, [4]>([16, 64, 1, 1])];
+            tensor<fp16, [16, 64, 1, 1]> k_3_cast_fp16 = reshape(shape = var_589, x = k_1_cast_fp16)[name = string("k_3_cast_fp16")];
+            tensor<int32, [1]> cos_expanded_1_axes_0 = const()[name = string("cos_expanded_1_axes_0"), val = tensor<int32, [1]>([3])];
+            tensor<fp16, [1, 64, 1, 1]> cos_expanded_1_cast_fp16 = expand_dims(axes = cos_expanded_1_axes_0, x = pos_cos_cast_fp16)[name = string("cos_expanded_1_cast_fp16")];
+            tensor<int32, [1]> sin_expanded_1_axes_0 = const()[name = string("sin_expanded_1_axes_0"), val = tensor<int32, [1]>([3])];
+            tensor<fp16, [1, 64, 1, 1]> sin_expanded_1_cast_fp16 = expand_dims(axes = sin_expanded_1_axes_0, x = pos_sin_cast_fp16)[name = string("sin_expanded_1_cast_fp16")];
+            tensor<int32, [4]> var_593 = const()[name = string("op_593"), val = tensor<int32, [4]>([16, 1, 1, 1])];
+            tensor<fp16, [16, 64, 1, 1]> cos_1_cast_fp16 = tile(reps = var_593, x = cos_expanded_1_cast_fp16)[name = string("cos_1_cast_fp16")];
+            tensor<int32, [4]> var_595 = const()[name = string("op_595"), val = tensor<int32, [4]>([16, 1, 1, 1])];
+            tensor<fp16, [16, 64, 1, 1]> sin_1_cast_fp16 = tile(reps = var_595, x = sin_expanded_1_cast_fp16)[name = string("sin_1_cast_fp16")];
+            tensor<fp16, [16, 64, 1, 1]> var_601_cast_fp16 = mul(x = q_3_cast_fp16, y = cos_1_cast_fp16)[name = string("op_601_cast_fp16")];
+            tensor<int32, [4]> var_606_begin_0 = const()[name = string("op_606_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_606_end_0 = const()[name = string("op_606_end_0"), val = tensor<int32, [4]>([16, 32, 1, 1])];
+            tensor<bool, [4]> var_606_end_mask_0 = const()[name = string("op_606_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [16, 32, 1, 1]> var_606_cast_fp16 = slice_by_index(begin = var_606_begin_0, end = var_606_end_0, end_mask = var_606_end_mask_0, x = q_3_cast_fp16)[name = string("op_606_cast_fp16")];
+            tensor<int32, [4]> var_613_begin_0 = const()[name = string("op_613_begin_0"), val = tensor<int32, [4]>([0, 32, 0, 0])];
+            tensor<int32, [4]> var_613_end_0 = const()[name = string("op_613_end_0"), val = tensor<int32, [4]>([16, 64, 1, 1])];
+            tensor<bool, [4]> var_613_end_mask_0 = const()[name = string("op_613_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [16, 32, 1, 1]> var_613_cast_fp16 = slice_by_index(begin = var_613_begin_0, end = var_613_end_0, end_mask = var_613_end_mask_0, x = q_3_cast_fp16)[name = string("op_613_cast_fp16")];
+            fp16 const_24_promoted_to_fp16 = const()[name = string("const_24_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [16, 32, 1, 1]> var_616_cast_fp16 = mul(x = var_613_cast_fp16, y = const_24_promoted_to_fp16)[name = string("op_616_cast_fp16")];
+            bool var_618_interleave_0 = const()[name = string("op_618_interleave_0"), val = bool(false)];
+            tensor<fp16, [16, 64, 1, 1]> var_618_cast_fp16 = concat(axis = var_32, interleave = var_618_interleave_0, values = (var_616_cast_fp16, var_606_cast_fp16))[name = string("op_618_cast_fp16")];
+            tensor<fp16, [16, 64, 1, 1]> var_619_cast_fp16 = mul(x = var_618_cast_fp16, y = sin_1_cast_fp16)[name = string("op_619_cast_fp16")];
+            tensor<fp16, [16, 64, 1, 1]> q_rotated_1_cast_fp16 = add(x = var_601_cast_fp16, y = var_619_cast_fp16)[name = string("q_rotated_1_cast_fp16")];
+            tensor<fp16, [16, 64, 1, 1]> var_621_cast_fp16 = mul(x = k_3_cast_fp16, y = cos_1_cast_fp16)[name = string("op_621_cast_fp16")];
+            tensor<int32, [4]> var_626_begin_0 = const()[name = string("op_626_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_626_end_0 = const()[name = string("op_626_end_0"), val = tensor<int32, [4]>([16, 32, 1, 1])];
+            tensor<bool, [4]> var_626_end_mask_0 = const()[name = string("op_626_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [16, 32, 1, 1]> var_626_cast_fp16 = slice_by_index(begin = var_626_begin_0, end = var_626_end_0, end_mask = var_626_end_mask_0, x = k_3_cast_fp16)[name = string("op_626_cast_fp16")];
+            tensor<int32, [4]> var_633_begin_0 = const()[name = string("op_633_begin_0"), val = tensor<int32, [4]>([0, 32, 0, 0])];
+            tensor<int32, [4]> var_633_end_0 = const()[name = string("op_633_end_0"), val = tensor<int32, [4]>([16, 64, 1, 1])];
+            tensor<bool, [4]> var_633_end_mask_0 = const()[name = string("op_633_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [16, 32, 1, 1]> var_633_cast_fp16 = slice_by_index(begin = var_633_begin_0, end = var_633_end_0, end_mask = var_633_end_mask_0, x = k_3_cast_fp16)[name = string("op_633_cast_fp16")];
+            fp16 const_27_promoted_to_fp16 = const()[name = string("const_27_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [16, 32, 1, 1]> var_636_cast_fp16 = mul(x = var_633_cast_fp16, y = const_27_promoted_to_fp16)[name = string("op_636_cast_fp16")];
+            bool var_638_interleave_0 = const()[name = string("op_638_interleave_0"), val = bool(false)];
+            tensor<fp16, [16, 64, 1, 1]> var_638_cast_fp16 = concat(axis = var_32, interleave = var_638_interleave_0, values = (var_636_cast_fp16, var_626_cast_fp16))[name = string("op_638_cast_fp16")];
+            tensor<fp16, [16, 64, 1, 1]> var_639_cast_fp16 = mul(x = var_638_cast_fp16, y = sin_1_cast_fp16)[name = string("op_639_cast_fp16")];
+            tensor<fp16, [16, 64, 1, 1]> k_rotated_1_cast_fp16 = add(x = var_621_cast_fp16, y = var_639_cast_fp16)[name = string("k_rotated_1_cast_fp16")];
+            tensor<int32, [4]> var_643 = const()[name = string("op_643"), val = tensor<int32, [4]>([1, 1024, 1, 1])];
+            tensor<fp16, [1, 1024, 1, 1]> current_key_1_cast_fp16 = reshape(shape = var_643, x = k_rotated_1_cast_fp16)[name = string("current_key_1_cast_fp16")];
+            tensor<int32, [1]> var_645_axes_0 = const()[name = string("op_645_axes_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [1, 1, 256]> var_645_cast_fp16 = expand_dims(axes = var_645_axes_0, x = kv_cache_update_mask)[name = string("op_645_cast_fp16")];
+            tensor<int32, [1]> update_mask_1_axes_0 = const()[name = string("update_mask_1_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 1, 1, 256]> update_mask_1_cast_fp16 = expand_dims(axes = update_mask_1_axes_0, x = var_645_cast_fp16)[name = string("update_mask_1_cast_fp16")];
+            fp16 var_32_promoted_to_fp16 = const()[name = string("op_32_promoted_to_fp16"), val = fp16(0x1p+0)];
+            tensor<fp16, [1, 1, 1, 256]> var_647_cast_fp16 = sub(x = var_32_promoted_to_fp16, y = update_mask_1_cast_fp16)[name = string("op_647_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_648_cast_fp16 = mul(x = var_522_cast_fp16_0, y = var_647_cast_fp16)[name = string("op_648_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_649_cast_fp16 = mul(x = current_key_1_cast_fp16, y = update_mask_1_cast_fp16)[name = string("op_649_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> key_cache_updated_1_cast_fp16 = add(x = var_648_cast_fp16, y = var_649_cast_fp16)[name = string("key_cache_updated_1_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_652_cast_fp16 = mul(x = var_531_cast_fp16_0, y = var_647_cast_fp16)[name = string("op_652_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_653_cast_fp16 = mul(x = v_1_cast_fp16, y = update_mask_1_cast_fp16)[name = string("op_653_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> value_cache_updated_1_cast_fp16 = add(x = var_652_cast_fp16, y = var_653_cast_fp16)[name = string("value_cache_updated_1_cast_fp16")];
+            tensor<int32, [4]> var_655 = const()[name = string("op_655"), val = tensor<int32, [4]>([1, 16, 64, 1])];
+            tensor<fp16, [1, 16, 64, 1]> q_7_cast_fp16 = reshape(shape = var_655, x = q_rotated_1_cast_fp16)[name = string("q_7_cast_fp16")];
+            tensor<int32, [4]> var_658 = const()[name = string("op_658"), val = tensor<int32, [4]>([1, 16, 64, 256])];
+            tensor<fp16, [1, 16, 64, 256]> k_for_attn_1_cast_fp16 = reshape(shape = var_658, x = key_cache_updated_1_cast_fp16)[name = string("k_for_attn_1_cast_fp16")];
+            tensor<int32, [4]> var_660 = const()[name = string("op_660"), val = tensor<int32, [4]>([1, 16, 64, 256])];
+            tensor<fp16, [1, 16, 64, 256]> v_for_attn_1_cast_fp16 = reshape(shape = var_660, x = value_cache_updated_1_cast_fp16)[name = string("v_for_attn_1_cast_fp16")];
+            bool var_664_transpose_x_1 = const()[name = string("op_664_transpose_x_1"), val = bool(true)];
+            bool var_664_transpose_y_1 = const()[name = string("op_664_transpose_y_1"), val = bool(false)];
+            tensor<fp16, [1, 16, 1, 256]> var_664_cast_fp16 = matmul(transpose_x = var_664_transpose_x_1, transpose_y = var_664_transpose_y_1, x = q_7_cast_fp16, y = k_for_attn_1_cast_fp16)[name = string("op_664_cast_fp16")];
+            fp16 var_665_to_fp16 = const()[name = string("op_665_to_fp16"), val = fp16(0x1p-3)];
+            tensor<fp16, [1, 16, 1, 256]> attn_weights_1_cast_fp16 = mul(x = var_664_cast_fp16, y = var_665_to_fp16)[name = string("attn_weights_1_cast_fp16")];
+            tensor<int32, [1]> var_667_axes_0 = const()[name = string("op_667_axes_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [1, 1, 256]> var_667_cast_fp16 = expand_dims(axes = var_667_axes_0, x = key_padding_mask)[name = string("op_667_cast_fp16")];
+            tensor<int32, [1]> attn_mask_1_axes_0 = const()[name = string("attn_mask_1_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 1, 1, 256]> attn_mask_1_cast_fp16 = expand_dims(axes = attn_mask_1_axes_0, x = var_667_cast_fp16)[name = string("attn_mask_1_cast_fp16")];
+            tensor<fp16, [1, 16, 1, 256]> input_41_cast_fp16 = add(x = attn_weights_1_cast_fp16, y = attn_mask_1_cast_fp16)[name = string("input_41_cast_fp16")];
+            tensor<fp16, [1, 16, 1, 256]> attn_weights_3_cast_fp16 = softmax(axis = var_28, x = input_41_cast_fp16)[name = string("attn_weights_3_cast_fp16")];
+            bool attn_output_1_transpose_x_1 = const()[name = string("attn_output_1_transpose_x_1"), val = bool(false)];
+            bool attn_output_1_transpose_y_1 = const()[name = string("attn_output_1_transpose_y_1"), val = bool(true)];
+            tensor<fp16, [1, 16, 1, 64]> attn_output_1_cast_fp16 = matmul(transpose_x = attn_output_1_transpose_x_1, transpose_y = attn_output_1_transpose_y_1, x = attn_weights_3_cast_fp16, y = v_for_attn_1_cast_fp16)[name = string("attn_output_1_cast_fp16")];
+            tensor<int32, [4]> var_674 = const()[name = string("op_674"), val = tensor<int32, [4]>([0, 1, 3, 2])];
+            tensor<int32, [4]> var_676 = const()[name = string("op_676"), val = tensor<int32, [4]>([1, -1, 1, 1])];
+            tensor<fp16, [1, 16, 64, 1]> attn_output_3_cast_fp16 = transpose(perm = var_674, x = attn_output_1_cast_fp16)[name = string("transpose_11")];
+            tensor<fp16, [1, 1024, 1, 1]> input_43_cast_fp16 = reshape(shape = var_676, x = attn_output_3_cast_fp16)[name = string("input_43_cast_fp16")];
+            string x_5_pad_type_0 = const()[name = string("x_5_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> x_5_strides_0 = const()[name = string("x_5_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> x_5_pad_0 = const()[name = string("x_5_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> x_5_dilations_0 = const()[name = string("x_5_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 x_5_groups_0 = const()[name = string("x_5_groups_0"), val = int32(1)];
+            tensor<fp16, [512, 1024, 1, 1]> var_691_weight_0_to_fp16 = const()[name = string("op_691_weight_0_to_fp16"), val = tensor<fp16, [512, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(24713152)))];
+            tensor<fp16, [512]> var_691_bias_0_to_fp16 = const()[name = string("op_691_bias_0_to_fp16"), val = tensor<fp16, [512]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(25761792)))];
+            tensor<fp16, [1, 512, 1, 1]> var_691_cast_fp16 = conv(bias = var_691_bias_0_to_fp16, dilations = x_5_dilations_0, groups = x_5_groups_0, pad = x_5_pad_0, pad_type = x_5_pad_type_0, strides = x_5_strides_0, weight = var_691_weight_0_to_fp16, x = input_43_cast_fp16)[name = string("op_691_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 1]> inputs_3_cast_fp16 = add(x = inputs_1_cast_fp16, y = var_691_cast_fp16)[name = string("inputs_3_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 1]> inputs_sq_3_cast_fp16 = mul(x = inputs_3_cast_fp16, y = inputs_3_cast_fp16)[name = string("inputs_sq_3_cast_fp16")];
+            tensor<int32, [1]> variance_3_axes_0 = const()[name = string("variance_3_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_3_keep_dims_0 = const()[name = string("variance_3_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 1, 1, 1]> variance_3_cast_fp16 = reduce_mean(axes = variance_3_axes_0, keep_dims = variance_3_keep_dims_0, x = inputs_sq_3_cast_fp16)[name = string("variance_3_cast_fp16")];
+            fp16 var_697_to_fp16 = const()[name = string("op_697_to_fp16"), val = fp16(0x1.5p-17)];
+            tensor<fp16, [1, 1, 1, 1]> var_698_cast_fp16 = add(x = variance_3_cast_fp16, y = var_697_to_fp16)[name = string("op_698_cast_fp16")];
+            fp32 var_699_epsilon_0 = const()[name = string("op_699_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 1, 1, 1]> var_699_cast_fp16 = rsqrt(epsilon = var_699_epsilon_0, x = var_698_cast_fp16)[name = string("op_699_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 1]> hidden_states_3_cast_fp16 = mul(x = inputs_3_cast_fp16, y = var_699_cast_fp16)[name = string("hidden_states_3_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 1]> w_3_to_fp16 = const()[name = string("w_3_to_fp16"), val = tensor<fp16, [1, 512, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(25762880)))];
+            tensor<fp16, [1, 512, 1, 1]> input_45_cast_fp16 = mul(x = w_3_to_fp16, y = hidden_states_3_cast_fp16)[name = string("input_45_cast_fp16")];
+            string input_47_pad_type_0 = const()[name = string("input_47_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> input_47_strides_0 = const()[name = string("input_47_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> input_47_pad_0 = const()[name = string("input_47_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> input_47_dilations_0 = const()[name = string("input_47_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 input_47_groups_0 = const()[name = string("input_47_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 512, 1, 1]> decoder_pre_transformer_layers_0_mlp_gate_proj_weight_to_fp16 = const()[name = string("decoder_pre_transformer_layers_0_mlp_gate_proj_weight_to_fp16"), val = tensor<fp16, [1024, 512, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(25763968)))];
+            tensor<fp16, [1, 1024, 1, 1]> input_47_cast_fp16 = conv(dilations = input_47_dilations_0, groups = input_47_groups_0, pad = input_47_pad_0, pad_type = input_47_pad_type_0, strides = input_47_strides_0, weight = decoder_pre_transformer_layers_0_mlp_gate_proj_weight_to_fp16, x = input_45_cast_fp16)[name = string("input_47_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> var_713_cast_fp16 = silu(x = input_47_cast_fp16)[name = string("op_713_cast_fp16")];
+            string var_719_pad_type_0 = const()[name = string("op_719_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_719_strides_0 = const()[name = string("op_719_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_719_pad_0 = const()[name = string("op_719_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_719_dilations_0 = const()[name = string("op_719_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_719_groups_0 = const()[name = string("op_719_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 512, 1, 1]> decoder_pre_transformer_layers_0_mlp_up_proj_weight_to_fp16 = const()[name = string("decoder_pre_transformer_layers_0_mlp_up_proj_weight_to_fp16"), val = tensor<fp16, [1024, 512, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(26812608)))];
+            tensor<fp16, [1, 1024, 1, 1]> var_719_cast_fp16 = conv(dilations = var_719_dilations_0, groups = var_719_groups_0, pad = var_719_pad_0, pad_type = var_719_pad_type_0, strides = var_719_strides_0, weight = decoder_pre_transformer_layers_0_mlp_up_proj_weight_to_fp16, x = input_45_cast_fp16)[name = string("op_719_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> input_49_cast_fp16 = mul(x = var_713_cast_fp16, y = var_719_cast_fp16)[name = string("input_49_cast_fp16")];
+            string x_7_pad_type_0 = const()[name = string("x_7_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> x_7_strides_0 = const()[name = string("x_7_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> x_7_pad_0 = const()[name = string("x_7_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> x_7_dilations_0 = const()[name = string("x_7_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 x_7_groups_0 = const()[name = string("x_7_groups_0"), val = int32(1)];
+            tensor<fp16, [512, 1024, 1, 1]> var_730_weight_0_to_fp16 = const()[name = string("op_730_weight_0_to_fp16"), val = tensor<fp16, [512, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(27861248)))];
+            tensor<fp16, [512]> var_730_bias_0_to_fp16 = const()[name = string("op_730_bias_0_to_fp16"), val = tensor<fp16, [512]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(28909888)))];
+            tensor<fp16, [1, 512, 1, 1]> var_730_cast_fp16 = conv(bias = var_730_bias_0_to_fp16, dilations = x_7_dilations_0, groups = x_7_groups_0, pad = x_7_pad_0, pad_type = x_7_pad_type_0, strides = x_7_strides_0, weight = var_730_weight_0_to_fp16, x = input_49_cast_fp16)[name = string("op_730_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 1]> inputs_5_cast_fp16 = add(x = inputs_3_cast_fp16, y = var_730_cast_fp16)[name = string("inputs_5_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 1]> inputs_sq_5_cast_fp16 = mul(x = inputs_5_cast_fp16, y = inputs_5_cast_fp16)[name = string("inputs_sq_5_cast_fp16")];
+            tensor<int32, [1]> variance_5_axes_0 = const()[name = string("variance_5_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_5_keep_dims_0 = const()[name = string("variance_5_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 1, 1, 1]> variance_5_cast_fp16 = reduce_mean(axes = variance_5_axes_0, keep_dims = variance_5_keep_dims_0, x = inputs_sq_5_cast_fp16)[name = string("variance_5_cast_fp16")];
+            fp16 var_746_to_fp16 = const()[name = string("op_746_to_fp16"), val = fp16(0x1.5p-17)];
+            tensor<fp16, [1, 1, 1, 1]> var_747_cast_fp16 = add(x = variance_5_cast_fp16, y = var_746_to_fp16)[name = string("op_747_cast_fp16")];
+            fp32 var_748_epsilon_0 = const()[name = string("op_748_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 1, 1, 1]> var_748_cast_fp16 = rsqrt(epsilon = var_748_epsilon_0, x = var_747_cast_fp16)[name = string("op_748_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 1]> hidden_states_5_cast_fp16 = mul(x = inputs_5_cast_fp16, y = var_748_cast_fp16)[name = string("hidden_states_5_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 1]> w_5_to_fp16 = const()[name = string("w_5_to_fp16"), val = tensor<fp16, [1, 512, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(28910976)))];
+            tensor<fp16, [1, 512, 1, 1]> x_9_cast_fp16 = mul(x = w_5_to_fp16, y = hidden_states_5_cast_fp16)[name = string("x_9_cast_fp16")];
+            string q_9_pad_type_0 = const()[name = string("q_9_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> q_9_strides_0 = const()[name = string("q_9_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> q_9_pad_0 = const()[name = string("q_9_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> q_9_dilations_0 = const()[name = string("q_9_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 q_9_groups_0 = const()[name = string("q_9_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 512, 1, 1]> decoder_pre_transformer_layers_1_self_attn_q_proj_weight_to_fp16 = const()[name = string("decoder_pre_transformer_layers_1_self_attn_q_proj_weight_to_fp16"), val = tensor<fp16, [1024, 512, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(28912064)))];
+            tensor<fp16, [1, 1024, 1, 1]> q_9_cast_fp16 = conv(dilations = q_9_dilations_0, groups = q_9_groups_0, pad = q_9_pad_0, pad_type = q_9_pad_type_0, strides = q_9_strides_0, weight = decoder_pre_transformer_layers_1_self_attn_q_proj_weight_to_fp16, x = x_9_cast_fp16)[name = string("q_9_cast_fp16")];
+            string k_5_pad_type_0 = const()[name = string("k_5_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> k_5_strides_0 = const()[name = string("k_5_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> k_5_pad_0 = const()[name = string("k_5_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> k_5_dilations_0 = const()[name = string("k_5_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 k_5_groups_0 = const()[name = string("k_5_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 512, 1, 1]> decoder_pre_transformer_layers_1_self_attn_k_proj_weight_to_fp16 = const()[name = string("decoder_pre_transformer_layers_1_self_attn_k_proj_weight_to_fp16"), val = tensor<fp16, [1024, 512, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(29960704)))];
+            tensor<fp16, [1, 1024, 1, 1]> k_5_cast_fp16 = conv(dilations = k_5_dilations_0, groups = k_5_groups_0, pad = k_5_pad_0, pad_type = k_5_pad_type_0, strides = k_5_strides_0, weight = decoder_pre_transformer_layers_1_self_attn_k_proj_weight_to_fp16, x = x_9_cast_fp16)[name = string("k_5_cast_fp16")];
+            string v_3_pad_type_0 = const()[name = string("v_3_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> v_3_strides_0 = const()[name = string("v_3_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> v_3_pad_0 = const()[name = string("v_3_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> v_3_dilations_0 = const()[name = string("v_3_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 v_3_groups_0 = const()[name = string("v_3_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 512, 1, 1]> decoder_pre_transformer_layers_1_self_attn_v_proj_weight_to_fp16 = const()[name = string("decoder_pre_transformer_layers_1_self_attn_v_proj_weight_to_fp16"), val = tensor<fp16, [1024, 512, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(31009344)))];
+            tensor<fp16, [1, 1024, 1, 1]> v_3_cast_fp16 = conv(dilations = v_3_dilations_0, groups = v_3_groups_0, pad = v_3_pad_0, pad_type = v_3_pad_type_0, strides = v_3_strides_0, weight = decoder_pre_transformer_layers_1_self_attn_v_proj_weight_to_fp16, x = x_9_cast_fp16)[name = string("v_3_cast_fp16")];
+            tensor<int32, [4]> var_780 = const()[name = string("op_780"), val = tensor<int32, [4]>([16, 64, 1, 1])];
+            tensor<fp16, [16, 64, 1, 1]> q_11_cast_fp16 = reshape(shape = var_780, x = q_9_cast_fp16)[name = string("q_11_cast_fp16")];
+            tensor<int32, [4]> var_785 = const()[name = string("op_785"), val = tensor<int32, [4]>([16, 64, 1, 1])];
+            tensor<fp16, [16, 64, 1, 1]> k_7_cast_fp16 = reshape(shape = var_785, x = k_5_cast_fp16)[name = string("k_7_cast_fp16")];
+            tensor<fp16, [16, 64, 1, 1]> var_797_cast_fp16 = mul(x = q_11_cast_fp16, y = cos_1_cast_fp16)[name = string("op_797_cast_fp16")];
+            tensor<int32, [4]> var_802_begin_0 = const()[name = string("op_802_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_802_end_0 = const()[name = string("op_802_end_0"), val = tensor<int32, [4]>([16, 32, 1, 1])];
+            tensor<bool, [4]> var_802_end_mask_0 = const()[name = string("op_802_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [16, 32, 1, 1]> var_802_cast_fp16 = slice_by_index(begin = var_802_begin_0, end = var_802_end_0, end_mask = var_802_end_mask_0, x = q_11_cast_fp16)[name = string("op_802_cast_fp16")];
+            tensor<int32, [4]> var_809_begin_0 = const()[name = string("op_809_begin_0"), val = tensor<int32, [4]>([0, 32, 0, 0])];
+            tensor<int32, [4]> var_809_end_0 = const()[name = string("op_809_end_0"), val = tensor<int32, [4]>([16, 64, 1, 1])];
+            tensor<bool, [4]> var_809_end_mask_0 = const()[name = string("op_809_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [16, 32, 1, 1]> var_809_cast_fp16 = slice_by_index(begin = var_809_begin_0, end = var_809_end_0, end_mask = var_809_end_mask_0, x = q_11_cast_fp16)[name = string("op_809_cast_fp16")];
+            fp16 const_32_promoted_to_fp16 = const()[name = string("const_32_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [16, 32, 1, 1]> var_812_cast_fp16 = mul(x = var_809_cast_fp16, y = const_32_promoted_to_fp16)[name = string("op_812_cast_fp16")];
+            bool var_814_interleave_0 = const()[name = string("op_814_interleave_0"), val = bool(false)];
+            tensor<fp16, [16, 64, 1, 1]> var_814_cast_fp16 = concat(axis = var_32, interleave = var_814_interleave_0, values = (var_812_cast_fp16, var_802_cast_fp16))[name = string("op_814_cast_fp16")];
+            tensor<fp16, [16, 64, 1, 1]> var_815_cast_fp16 = mul(x = var_814_cast_fp16, y = sin_1_cast_fp16)[name = string("op_815_cast_fp16")];
+            tensor<fp16, [16, 64, 1, 1]> q_rotated_3_cast_fp16 = add(x = var_797_cast_fp16, y = var_815_cast_fp16)[name = string("q_rotated_3_cast_fp16")];
+            tensor<fp16, [16, 64, 1, 1]> var_817_cast_fp16 = mul(x = k_7_cast_fp16, y = cos_1_cast_fp16)[name = string("op_817_cast_fp16")];
+            tensor<int32, [4]> var_822_begin_0 = const()[name = string("op_822_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_822_end_0 = const()[name = string("op_822_end_0"), val = tensor<int32, [4]>([16, 32, 1, 1])];
+            tensor<bool, [4]> var_822_end_mask_0 = const()[name = string("op_822_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [16, 32, 1, 1]> var_822_cast_fp16 = slice_by_index(begin = var_822_begin_0, end = var_822_end_0, end_mask = var_822_end_mask_0, x = k_7_cast_fp16)[name = string("op_822_cast_fp16")];
+            tensor<int32, [4]> var_829_begin_0 = const()[name = string("op_829_begin_0"), val = tensor<int32, [4]>([0, 32, 0, 0])];
+            tensor<int32, [4]> var_829_end_0 = const()[name = string("op_829_end_0"), val = tensor<int32, [4]>([16, 64, 1, 1])];
+            tensor<bool, [4]> var_829_end_mask_0 = const()[name = string("op_829_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [16, 32, 1, 1]> var_829_cast_fp16 = slice_by_index(begin = var_829_begin_0, end = var_829_end_0, end_mask = var_829_end_mask_0, x = k_7_cast_fp16)[name = string("op_829_cast_fp16")];
+            fp16 const_35_promoted_to_fp16 = const()[name = string("const_35_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [16, 32, 1, 1]> var_832_cast_fp16 = mul(x = var_829_cast_fp16, y = const_35_promoted_to_fp16)[name = string("op_832_cast_fp16")];
+            bool var_834_interleave_0 = const()[name = string("op_834_interleave_0"), val = bool(false)];
+            tensor<fp16, [16, 64, 1, 1]> var_834_cast_fp16 = concat(axis = var_32, interleave = var_834_interleave_0, values = (var_832_cast_fp16, var_822_cast_fp16))[name = string("op_834_cast_fp16")];
+            tensor<fp16, [16, 64, 1, 1]> var_835_cast_fp16 = mul(x = var_834_cast_fp16, y = sin_1_cast_fp16)[name = string("op_835_cast_fp16")];
+            tensor<fp16, [16, 64, 1, 1]> k_rotated_3_cast_fp16 = add(x = var_817_cast_fp16, y = var_835_cast_fp16)[name = string("k_rotated_3_cast_fp16")];
+            tensor<int32, [4]> var_839 = const()[name = string("op_839"), val = tensor<int32, [4]>([1, 1024, 1, 1])];
+            tensor<fp16, [1, 1024, 1, 1]> current_key_3_cast_fp16 = reshape(shape = var_839, x = k_rotated_3_cast_fp16)[name = string("current_key_3_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_844_cast_fp16 = mul(x = var_522_cast_fp16_1, y = var_647_cast_fp16)[name = string("op_844_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_845_cast_fp16 = mul(x = current_key_3_cast_fp16, y = update_mask_1_cast_fp16)[name = string("op_845_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> key_cache_updated_3_cast_fp16 = add(x = var_844_cast_fp16, y = var_845_cast_fp16)[name = string("key_cache_updated_3_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_848_cast_fp16 = mul(x = var_531_cast_fp16_1, y = var_647_cast_fp16)[name = string("op_848_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_849_cast_fp16 = mul(x = v_3_cast_fp16, y = update_mask_1_cast_fp16)[name = string("op_849_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> value_cache_updated_3_cast_fp16 = add(x = var_848_cast_fp16, y = var_849_cast_fp16)[name = string("value_cache_updated_3_cast_fp16")];
+            tensor<int32, [4]> var_851 = const()[name = string("op_851"), val = tensor<int32, [4]>([1, 16, 64, 1])];
+            tensor<fp16, [1, 16, 64, 1]> q_15_cast_fp16 = reshape(shape = var_851, x = q_rotated_3_cast_fp16)[name = string("q_15_cast_fp16")];
+            tensor<int32, [4]> var_854 = const()[name = string("op_854"), val = tensor<int32, [4]>([1, 16, 64, 256])];
+            tensor<fp16, [1, 16, 64, 256]> k_for_attn_3_cast_fp16 = reshape(shape = var_854, x = key_cache_updated_3_cast_fp16)[name = string("k_for_attn_3_cast_fp16")];
+            tensor<int32, [4]> var_856 = const()[name = string("op_856"), val = tensor<int32, [4]>([1, 16, 64, 256])];
+            tensor<fp16, [1, 16, 64, 256]> v_for_attn_3_cast_fp16 = reshape(shape = var_856, x = value_cache_updated_3_cast_fp16)[name = string("v_for_attn_3_cast_fp16")];
+            bool var_860_transpose_x_1 = const()[name = string("op_860_transpose_x_1"), val = bool(true)];
+            bool var_860_transpose_y_1 = const()[name = string("op_860_transpose_y_1"), val = bool(false)];
+            tensor<fp16, [1, 16, 1, 256]> var_860_cast_fp16 = matmul(transpose_x = var_860_transpose_x_1, transpose_y = var_860_transpose_y_1, x = q_15_cast_fp16, y = k_for_attn_3_cast_fp16)[name = string("op_860_cast_fp16")];
+            fp16 var_861_to_fp16 = const()[name = string("op_861_to_fp16"), val = fp16(0x1p-3)];
+            tensor<fp16, [1, 16, 1, 256]> attn_weights_5_cast_fp16 = mul(x = var_860_cast_fp16, y = var_861_to_fp16)[name = string("attn_weights_5_cast_fp16")];
+            tensor<fp16, [1, 16, 1, 256]> input_51_cast_fp16 = add(x = attn_weights_5_cast_fp16, y = attn_mask_1_cast_fp16)[name = string("input_51_cast_fp16")];
+            tensor<fp16, [1, 16, 1, 256]> attn_weights_7_cast_fp16 = softmax(axis = var_28, x = input_51_cast_fp16)[name = string("attn_weights_7_cast_fp16")];
+            bool attn_output_5_transpose_x_1 = const()[name = string("attn_output_5_transpose_x_1"), val = bool(false)];
+            bool attn_output_5_transpose_y_1 = const()[name = string("attn_output_5_transpose_y_1"), val = bool(true)];
+            tensor<fp16, [1, 16, 1, 64]> attn_output_5_cast_fp16 = matmul(transpose_x = attn_output_5_transpose_x_1, transpose_y = attn_output_5_transpose_y_1, x = attn_weights_7_cast_fp16, y = v_for_attn_3_cast_fp16)[name = string("attn_output_5_cast_fp16")];
+            tensor<int32, [4]> var_870 = const()[name = string("op_870"), val = tensor<int32, [4]>([0, 1, 3, 2])];
+            tensor<int32, [4]> var_872 = const()[name = string("op_872"), val = tensor<int32, [4]>([1, -1, 1, 1])];
+            tensor<fp16, [1, 16, 64, 1]> attn_output_7_cast_fp16 = transpose(perm = var_870, x = attn_output_5_cast_fp16)[name = string("transpose_10")];
+            tensor<fp16, [1, 1024, 1, 1]> input_53_cast_fp16 = reshape(shape = var_872, x = attn_output_7_cast_fp16)[name = string("input_53_cast_fp16")];
+            string x_11_pad_type_0 = const()[name = string("x_11_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> x_11_strides_0 = const()[name = string("x_11_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> x_11_pad_0 = const()[name = string("x_11_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> x_11_dilations_0 = const()[name = string("x_11_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 x_11_groups_0 = const()[name = string("x_11_groups_0"), val = int32(1)];
+            tensor<fp16, [512, 1024, 1, 1]> var_887_weight_0_to_fp16 = const()[name = string("op_887_weight_0_to_fp16"), val = tensor<fp16, [512, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(32057984)))];
+            tensor<fp16, [512]> var_887_bias_0_to_fp16 = const()[name = string("op_887_bias_0_to_fp16"), val = tensor<fp16, [512]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(33106624)))];
+            tensor<fp16, [1, 512, 1, 1]> var_887_cast_fp16 = conv(bias = var_887_bias_0_to_fp16, dilations = x_11_dilations_0, groups = x_11_groups_0, pad = x_11_pad_0, pad_type = x_11_pad_type_0, strides = x_11_strides_0, weight = var_887_weight_0_to_fp16, x = input_53_cast_fp16)[name = string("op_887_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 1]> inputs_7_cast_fp16 = add(x = inputs_5_cast_fp16, y = var_887_cast_fp16)[name = string("inputs_7_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 1]> inputs_sq_7_cast_fp16 = mul(x = inputs_7_cast_fp16, y = inputs_7_cast_fp16)[name = string("inputs_sq_7_cast_fp16")];
+            tensor<int32, [1]> variance_7_axes_0 = const()[name = string("variance_7_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_7_keep_dims_0 = const()[name = string("variance_7_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 1, 1, 1]> variance_7_cast_fp16 = reduce_mean(axes = variance_7_axes_0, keep_dims = variance_7_keep_dims_0, x = inputs_sq_7_cast_fp16)[name = string("variance_7_cast_fp16")];
+            fp16 var_893_to_fp16 = const()[name = string("op_893_to_fp16"), val = fp16(0x1.5p-17)];
+            tensor<fp16, [1, 1, 1, 1]> var_894_cast_fp16 = add(x = variance_7_cast_fp16, y = var_893_to_fp16)[name = string("op_894_cast_fp16")];
+            fp32 var_895_epsilon_0 = const()[name = string("op_895_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 1, 1, 1]> var_895_cast_fp16 = rsqrt(epsilon = var_895_epsilon_0, x = var_894_cast_fp16)[name = string("op_895_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 1]> hidden_states_7_cast_fp16 = mul(x = inputs_7_cast_fp16, y = var_895_cast_fp16)[name = string("hidden_states_7_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 1]> w_7_to_fp16 = const()[name = string("w_7_to_fp16"), val = tensor<fp16, [1, 512, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(33107712)))];
+            tensor<fp16, [1, 512, 1, 1]> input_55_cast_fp16 = mul(x = w_7_to_fp16, y = hidden_states_7_cast_fp16)[name = string("input_55_cast_fp16")];
+            string input_57_pad_type_0 = const()[name = string("input_57_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> input_57_strides_0 = const()[name = string("input_57_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> input_57_pad_0 = const()[name = string("input_57_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> input_57_dilations_0 = const()[name = string("input_57_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 input_57_groups_0 = const()[name = string("input_57_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 512, 1, 1]> decoder_pre_transformer_layers_1_mlp_gate_proj_weight_to_fp16 = const()[name = string("decoder_pre_transformer_layers_1_mlp_gate_proj_weight_to_fp16"), val = tensor<fp16, [1024, 512, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(33108800)))];
+            tensor<fp16, [1, 1024, 1, 1]> input_57_cast_fp16 = conv(dilations = input_57_dilations_0, groups = input_57_groups_0, pad = input_57_pad_0, pad_type = input_57_pad_type_0, strides = input_57_strides_0, weight = decoder_pre_transformer_layers_1_mlp_gate_proj_weight_to_fp16, x = input_55_cast_fp16)[name = string("input_57_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> var_909_cast_fp16 = silu(x = input_57_cast_fp16)[name = string("op_909_cast_fp16")];
+            string var_915_pad_type_0 = const()[name = string("op_915_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_915_strides_0 = const()[name = string("op_915_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_915_pad_0 = const()[name = string("op_915_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_915_dilations_0 = const()[name = string("op_915_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_915_groups_0 = const()[name = string("op_915_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 512, 1, 1]> decoder_pre_transformer_layers_1_mlp_up_proj_weight_to_fp16 = const()[name = string("decoder_pre_transformer_layers_1_mlp_up_proj_weight_to_fp16"), val = tensor<fp16, [1024, 512, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(34157440)))];
+            tensor<fp16, [1, 1024, 1, 1]> var_915_cast_fp16 = conv(dilations = var_915_dilations_0, groups = var_915_groups_0, pad = var_915_pad_0, pad_type = var_915_pad_type_0, strides = var_915_strides_0, weight = decoder_pre_transformer_layers_1_mlp_up_proj_weight_to_fp16, x = input_55_cast_fp16)[name = string("op_915_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> input_59_cast_fp16 = mul(x = var_909_cast_fp16, y = var_915_cast_fp16)[name = string("input_59_cast_fp16")];
+            string x_13_pad_type_0 = const()[name = string("x_13_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> x_13_strides_0 = const()[name = string("x_13_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> x_13_pad_0 = const()[name = string("x_13_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> x_13_dilations_0 = const()[name = string("x_13_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 x_13_groups_0 = const()[name = string("x_13_groups_0"), val = int32(1)];
+            tensor<fp16, [512, 1024, 1, 1]> var_926_weight_0_to_fp16 = const()[name = string("op_926_weight_0_to_fp16"), val = tensor<fp16, [512, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(35206080)))];
+            tensor<fp16, [512]> var_926_bias_0_to_fp16 = const()[name = string("op_926_bias_0_to_fp16"), val = tensor<fp16, [512]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(36254720)))];
+            tensor<fp16, [1, 512, 1, 1]> var_926_cast_fp16 = conv(bias = var_926_bias_0_to_fp16, dilations = x_13_dilations_0, groups = x_13_groups_0, pad = x_13_pad_0, pad_type = x_13_pad_type_0, strides = x_13_strides_0, weight = var_926_weight_0_to_fp16, x = input_59_cast_fp16)[name = string("op_926_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 1]> inputs_9_cast_fp16 = add(x = inputs_7_cast_fp16, y = var_926_cast_fp16)[name = string("inputs_9_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 1]> inputs_sq_9_cast_fp16 = mul(x = inputs_9_cast_fp16, y = inputs_9_cast_fp16)[name = string("inputs_sq_9_cast_fp16")];
+            tensor<int32, [1]> variance_9_axes_0 = const()[name = string("variance_9_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_9_keep_dims_0 = const()[name = string("variance_9_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 1, 1, 1]> variance_9_cast_fp16 = reduce_mean(axes = variance_9_axes_0, keep_dims = variance_9_keep_dims_0, x = inputs_sq_9_cast_fp16)[name = string("variance_9_cast_fp16")];
+            fp16 var_942_to_fp16 = const()[name = string("op_942_to_fp16"), val = fp16(0x1.5p-17)];
+            tensor<fp16, [1, 1, 1, 1]> var_943_cast_fp16 = add(x = variance_9_cast_fp16, y = var_942_to_fp16)[name = string("op_943_cast_fp16")];
+            fp32 var_944_epsilon_0 = const()[name = string("op_944_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 1, 1, 1]> var_944_cast_fp16 = rsqrt(epsilon = var_944_epsilon_0, x = var_943_cast_fp16)[name = string("op_944_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 1]> hidden_states_9_cast_fp16 = mul(x = inputs_9_cast_fp16, y = var_944_cast_fp16)[name = string("hidden_states_9_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 1]> w_9_to_fp16 = const()[name = string("w_9_to_fp16"), val = tensor<fp16, [1, 512, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(36255808)))];
+            tensor<fp16, [1, 512, 1, 1]> x_15_cast_fp16 = mul(x = w_9_to_fp16, y = hidden_states_9_cast_fp16)[name = string("x_15_cast_fp16")];
+            string q_17_pad_type_0 = const()[name = string("q_17_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> q_17_strides_0 = const()[name = string("q_17_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> q_17_pad_0 = const()[name = string("q_17_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> q_17_dilations_0 = const()[name = string("q_17_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 q_17_groups_0 = const()[name = string("q_17_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 512, 1, 1]> decoder_pre_transformer_layers_2_self_attn_q_proj_weight_to_fp16 = const()[name = string("decoder_pre_transformer_layers_2_self_attn_q_proj_weight_to_fp16"), val = tensor<fp16, [1024, 512, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(36256896)))];
+            tensor<fp16, [1, 1024, 1, 1]> q_17_cast_fp16 = conv(dilations = q_17_dilations_0, groups = q_17_groups_0, pad = q_17_pad_0, pad_type = q_17_pad_type_0, strides = q_17_strides_0, weight = decoder_pre_transformer_layers_2_self_attn_q_proj_weight_to_fp16, x = x_15_cast_fp16)[name = string("q_17_cast_fp16")];
+            string k_9_pad_type_0 = const()[name = string("k_9_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> k_9_strides_0 = const()[name = string("k_9_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> k_9_pad_0 = const()[name = string("k_9_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> k_9_dilations_0 = const()[name = string("k_9_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 k_9_groups_0 = const()[name = string("k_9_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 512, 1, 1]> decoder_pre_transformer_layers_2_self_attn_k_proj_weight_to_fp16 = const()[name = string("decoder_pre_transformer_layers_2_self_attn_k_proj_weight_to_fp16"), val = tensor<fp16, [1024, 512, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(37305536)))];
+            tensor<fp16, [1, 1024, 1, 1]> k_9_cast_fp16 = conv(dilations = k_9_dilations_0, groups = k_9_groups_0, pad = k_9_pad_0, pad_type = k_9_pad_type_0, strides = k_9_strides_0, weight = decoder_pre_transformer_layers_2_self_attn_k_proj_weight_to_fp16, x = x_15_cast_fp16)[name = string("k_9_cast_fp16")];
+            string v_5_pad_type_0 = const()[name = string("v_5_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> v_5_strides_0 = const()[name = string("v_5_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> v_5_pad_0 = const()[name = string("v_5_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> v_5_dilations_0 = const()[name = string("v_5_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 v_5_groups_0 = const()[name = string("v_5_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 512, 1, 1]> decoder_pre_transformer_layers_2_self_attn_v_proj_weight_to_fp16 = const()[name = string("decoder_pre_transformer_layers_2_self_attn_v_proj_weight_to_fp16"), val = tensor<fp16, [1024, 512, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(38354176)))];
+            tensor<fp16, [1, 1024, 1, 1]> v_5_cast_fp16 = conv(dilations = v_5_dilations_0, groups = v_5_groups_0, pad = v_5_pad_0, pad_type = v_5_pad_type_0, strides = v_5_strides_0, weight = decoder_pre_transformer_layers_2_self_attn_v_proj_weight_to_fp16, x = x_15_cast_fp16)[name = string("v_5_cast_fp16")];
+            tensor<int32, [4]> var_976 = const()[name = string("op_976"), val = tensor<int32, [4]>([16, 64, 1, 1])];
+            tensor<fp16, [16, 64, 1, 1]> q_19_cast_fp16 = reshape(shape = var_976, x = q_17_cast_fp16)[name = string("q_19_cast_fp16")];
+            tensor<int32, [4]> var_981 = const()[name = string("op_981"), val = tensor<int32, [4]>([16, 64, 1, 1])];
+            tensor<fp16, [16, 64, 1, 1]> k_11_cast_fp16 = reshape(shape = var_981, x = k_9_cast_fp16)[name = string("k_11_cast_fp16")];
+            tensor<fp16, [16, 64, 1, 1]> var_993_cast_fp16 = mul(x = q_19_cast_fp16, y = cos_1_cast_fp16)[name = string("op_993_cast_fp16")];
+            tensor<int32, [4]> var_998_begin_0 = const()[name = string("op_998_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_998_end_0 = const()[name = string("op_998_end_0"), val = tensor<int32, [4]>([16, 32, 1, 1])];
+            tensor<bool, [4]> var_998_end_mask_0 = const()[name = string("op_998_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [16, 32, 1, 1]> var_998_cast_fp16 = slice_by_index(begin = var_998_begin_0, end = var_998_end_0, end_mask = var_998_end_mask_0, x = q_19_cast_fp16)[name = string("op_998_cast_fp16")];
+            tensor<int32, [4]> var_1005_begin_0 = const()[name = string("op_1005_begin_0"), val = tensor<int32, [4]>([0, 32, 0, 0])];
+            tensor<int32, [4]> var_1005_end_0 = const()[name = string("op_1005_end_0"), val = tensor<int32, [4]>([16, 64, 1, 1])];
+            tensor<bool, [4]> var_1005_end_mask_0 = const()[name = string("op_1005_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [16, 32, 1, 1]> var_1005_cast_fp16 = slice_by_index(begin = var_1005_begin_0, end = var_1005_end_0, end_mask = var_1005_end_mask_0, x = q_19_cast_fp16)[name = string("op_1005_cast_fp16")];
+            fp16 const_40_promoted_to_fp16 = const()[name = string("const_40_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [16, 32, 1, 1]> var_1008_cast_fp16 = mul(x = var_1005_cast_fp16, y = const_40_promoted_to_fp16)[name = string("op_1008_cast_fp16")];
+            bool var_1010_interleave_0 = const()[name = string("op_1010_interleave_0"), val = bool(false)];
+            tensor<fp16, [16, 64, 1, 1]> var_1010_cast_fp16 = concat(axis = var_32, interleave = var_1010_interleave_0, values = (var_1008_cast_fp16, var_998_cast_fp16))[name = string("op_1010_cast_fp16")];
+            tensor<fp16, [16, 64, 1, 1]> var_1011_cast_fp16 = mul(x = var_1010_cast_fp16, y = sin_1_cast_fp16)[name = string("op_1011_cast_fp16")];
+            tensor<fp16, [16, 64, 1, 1]> q_rotated_5_cast_fp16 = add(x = var_993_cast_fp16, y = var_1011_cast_fp16)[name = string("q_rotated_5_cast_fp16")];
+            tensor<fp16, [16, 64, 1, 1]> var_1013_cast_fp16 = mul(x = k_11_cast_fp16, y = cos_1_cast_fp16)[name = string("op_1013_cast_fp16")];
+            tensor<int32, [4]> var_1018_begin_0 = const()[name = string("op_1018_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_1018_end_0 = const()[name = string("op_1018_end_0"), val = tensor<int32, [4]>([16, 32, 1, 1])];
+            tensor<bool, [4]> var_1018_end_mask_0 = const()[name = string("op_1018_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [16, 32, 1, 1]> var_1018_cast_fp16 = slice_by_index(begin = var_1018_begin_0, end = var_1018_end_0, end_mask = var_1018_end_mask_0, x = k_11_cast_fp16)[name = string("op_1018_cast_fp16")];
+            tensor<int32, [4]> var_1025_begin_0 = const()[name = string("op_1025_begin_0"), val = tensor<int32, [4]>([0, 32, 0, 0])];
+            tensor<int32, [4]> var_1025_end_0 = const()[name = string("op_1025_end_0"), val = tensor<int32, [4]>([16, 64, 1, 1])];
+            tensor<bool, [4]> var_1025_end_mask_0 = const()[name = string("op_1025_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [16, 32, 1, 1]> var_1025_cast_fp16 = slice_by_index(begin = var_1025_begin_0, end = var_1025_end_0, end_mask = var_1025_end_mask_0, x = k_11_cast_fp16)[name = string("op_1025_cast_fp16")];
+            fp16 const_43_promoted_to_fp16 = const()[name = string("const_43_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [16, 32, 1, 1]> var_1028_cast_fp16 = mul(x = var_1025_cast_fp16, y = const_43_promoted_to_fp16)[name = string("op_1028_cast_fp16")];
+            bool var_1030_interleave_0 = const()[name = string("op_1030_interleave_0"), val = bool(false)];
+            tensor<fp16, [16, 64, 1, 1]> var_1030_cast_fp16 = concat(axis = var_32, interleave = var_1030_interleave_0, values = (var_1028_cast_fp16, var_1018_cast_fp16))[name = string("op_1030_cast_fp16")];
+            tensor<fp16, [16, 64, 1, 1]> var_1031_cast_fp16 = mul(x = var_1030_cast_fp16, y = sin_1_cast_fp16)[name = string("op_1031_cast_fp16")];
+            tensor<fp16, [16, 64, 1, 1]> k_rotated_5_cast_fp16 = add(x = var_1013_cast_fp16, y = var_1031_cast_fp16)[name = string("k_rotated_5_cast_fp16")];
+            tensor<int32, [4]> var_1035 = const()[name = string("op_1035"), val = tensor<int32, [4]>([1, 1024, 1, 1])];
+            tensor<fp16, [1, 1024, 1, 1]> current_key_5_cast_fp16 = reshape(shape = var_1035, x = k_rotated_5_cast_fp16)[name = string("current_key_5_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_1040_cast_fp16 = mul(x = var_522_cast_fp16_2, y = var_647_cast_fp16)[name = string("op_1040_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_1041_cast_fp16 = mul(x = current_key_5_cast_fp16, y = update_mask_1_cast_fp16)[name = string("op_1041_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> key_cache_updated_5_cast_fp16 = add(x = var_1040_cast_fp16, y = var_1041_cast_fp16)[name = string("key_cache_updated_5_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_1044_cast_fp16 = mul(x = var_531_cast_fp16_2, y = var_647_cast_fp16)[name = string("op_1044_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_1045_cast_fp16 = mul(x = v_5_cast_fp16, y = update_mask_1_cast_fp16)[name = string("op_1045_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> value_cache_updated_5_cast_fp16 = add(x = var_1044_cast_fp16, y = var_1045_cast_fp16)[name = string("value_cache_updated_5_cast_fp16")];
+            tensor<int32, [4]> var_1047 = const()[name = string("op_1047"), val = tensor<int32, [4]>([1, 16, 64, 1])];
+            tensor<fp16, [1, 16, 64, 1]> q_23_cast_fp16 = reshape(shape = var_1047, x = q_rotated_5_cast_fp16)[name = string("q_23_cast_fp16")];
+            tensor<int32, [4]> var_1050 = const()[name = string("op_1050"), val = tensor<int32, [4]>([1, 16, 64, 256])];
+            tensor<fp16, [1, 16, 64, 256]> k_for_attn_5_cast_fp16 = reshape(shape = var_1050, x = key_cache_updated_5_cast_fp16)[name = string("k_for_attn_5_cast_fp16")];
+            tensor<int32, [4]> var_1052 = const()[name = string("op_1052"), val = tensor<int32, [4]>([1, 16, 64, 256])];
+            tensor<fp16, [1, 16, 64, 256]> v_for_attn_5_cast_fp16 = reshape(shape = var_1052, x = value_cache_updated_5_cast_fp16)[name = string("v_for_attn_5_cast_fp16")];
+            bool var_1056_transpose_x_1 = const()[name = string("op_1056_transpose_x_1"), val = bool(true)];
+            bool var_1056_transpose_y_1 = const()[name = string("op_1056_transpose_y_1"), val = bool(false)];
+            tensor<fp16, [1, 16, 1, 256]> var_1056_cast_fp16 = matmul(transpose_x = var_1056_transpose_x_1, transpose_y = var_1056_transpose_y_1, x = q_23_cast_fp16, y = k_for_attn_5_cast_fp16)[name = string("op_1056_cast_fp16")];
+            fp16 var_1057_to_fp16 = const()[name = string("op_1057_to_fp16"), val = fp16(0x1p-3)];
+            tensor<fp16, [1, 16, 1, 256]> attn_weights_9_cast_fp16 = mul(x = var_1056_cast_fp16, y = var_1057_to_fp16)[name = string("attn_weights_9_cast_fp16")];
+            tensor<fp16, [1, 16, 1, 256]> input_61_cast_fp16 = add(x = attn_weights_9_cast_fp16, y = attn_mask_1_cast_fp16)[name = string("input_61_cast_fp16")];
+            tensor<fp16, [1, 16, 1, 256]> attn_weights_11_cast_fp16 = softmax(axis = var_28, x = input_61_cast_fp16)[name = string("attn_weights_11_cast_fp16")];
+            bool attn_output_9_transpose_x_1 = const()[name = string("attn_output_9_transpose_x_1"), val = bool(false)];
+            bool attn_output_9_transpose_y_1 = const()[name = string("attn_output_9_transpose_y_1"), val = bool(true)];
+            tensor<fp16, [1, 16, 1, 64]> attn_output_9_cast_fp16 = matmul(transpose_x = attn_output_9_transpose_x_1, transpose_y = attn_output_9_transpose_y_1, x = attn_weights_11_cast_fp16, y = v_for_attn_5_cast_fp16)[name = string("attn_output_9_cast_fp16")];
+            tensor<int32, [4]> var_1066 = const()[name = string("op_1066"), val = tensor<int32, [4]>([0, 1, 3, 2])];
+            tensor<int32, [4]> var_1068 = const()[name = string("op_1068"), val = tensor<int32, [4]>([1, -1, 1, 1])];
+            tensor<fp16, [1, 16, 64, 1]> attn_output_11_cast_fp16 = transpose(perm = var_1066, x = attn_output_9_cast_fp16)[name = string("transpose_9")];
+            tensor<fp16, [1, 1024, 1, 1]> input_63_cast_fp16 = reshape(shape = var_1068, x = attn_output_11_cast_fp16)[name = string("input_63_cast_fp16")];
+            string x_17_pad_type_0 = const()[name = string("x_17_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> x_17_strides_0 = const()[name = string("x_17_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> x_17_pad_0 = const()[name = string("x_17_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> x_17_dilations_0 = const()[name = string("x_17_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 x_17_groups_0 = const()[name = string("x_17_groups_0"), val = int32(1)];
+            tensor<fp16, [512, 1024, 1, 1]> var_1083_weight_0_to_fp16 = const()[name = string("op_1083_weight_0_to_fp16"), val = tensor<fp16, [512, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(39402816)))];
+            tensor<fp16, [512]> var_1083_bias_0_to_fp16 = const()[name = string("op_1083_bias_0_to_fp16"), val = tensor<fp16, [512]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(40451456)))];
+            tensor<fp16, [1, 512, 1, 1]> var_1083_cast_fp16 = conv(bias = var_1083_bias_0_to_fp16, dilations = x_17_dilations_0, groups = x_17_groups_0, pad = x_17_pad_0, pad_type = x_17_pad_type_0, strides = x_17_strides_0, weight = var_1083_weight_0_to_fp16, x = input_63_cast_fp16)[name = string("op_1083_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 1]> inputs_11_cast_fp16 = add(x = inputs_9_cast_fp16, y = var_1083_cast_fp16)[name = string("inputs_11_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 1]> inputs_sq_11_cast_fp16 = mul(x = inputs_11_cast_fp16, y = inputs_11_cast_fp16)[name = string("inputs_sq_11_cast_fp16")];
+            tensor<int32, [1]> variance_11_axes_0 = const()[name = string("variance_11_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_11_keep_dims_0 = const()[name = string("variance_11_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 1, 1, 1]> variance_11_cast_fp16 = reduce_mean(axes = variance_11_axes_0, keep_dims = variance_11_keep_dims_0, x = inputs_sq_11_cast_fp16)[name = string("variance_11_cast_fp16")];
+            fp16 var_1089_to_fp16 = const()[name = string("op_1089_to_fp16"), val = fp16(0x1.5p-17)];
+            tensor<fp16, [1, 1, 1, 1]> var_1090_cast_fp16 = add(x = variance_11_cast_fp16, y = var_1089_to_fp16)[name = string("op_1090_cast_fp16")];
+            fp32 var_1091_epsilon_0 = const()[name = string("op_1091_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 1, 1, 1]> var_1091_cast_fp16 = rsqrt(epsilon = var_1091_epsilon_0, x = var_1090_cast_fp16)[name = string("op_1091_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 1]> hidden_states_11_cast_fp16 = mul(x = inputs_11_cast_fp16, y = var_1091_cast_fp16)[name = string("hidden_states_11_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 1]> w_11_to_fp16 = const()[name = string("w_11_to_fp16"), val = tensor<fp16, [1, 512, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(40452544)))];
+            tensor<fp16, [1, 512, 1, 1]> input_65_cast_fp16 = mul(x = w_11_to_fp16, y = hidden_states_11_cast_fp16)[name = string("input_65_cast_fp16")];
+            string input_67_pad_type_0 = const()[name = string("input_67_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> input_67_strides_0 = const()[name = string("input_67_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> input_67_pad_0 = const()[name = string("input_67_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> input_67_dilations_0 = const()[name = string("input_67_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 input_67_groups_0 = const()[name = string("input_67_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 512, 1, 1]> decoder_pre_transformer_layers_2_mlp_gate_proj_weight_to_fp16 = const()[name = string("decoder_pre_transformer_layers_2_mlp_gate_proj_weight_to_fp16"), val = tensor<fp16, [1024, 512, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(40453632)))];
+            tensor<fp16, [1, 1024, 1, 1]> input_67_cast_fp16 = conv(dilations = input_67_dilations_0, groups = input_67_groups_0, pad = input_67_pad_0, pad_type = input_67_pad_type_0, strides = input_67_strides_0, weight = decoder_pre_transformer_layers_2_mlp_gate_proj_weight_to_fp16, x = input_65_cast_fp16)[name = string("input_67_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> var_1105_cast_fp16 = silu(x = input_67_cast_fp16)[name = string("op_1105_cast_fp16")];
+            string var_1111_pad_type_0 = const()[name = string("op_1111_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_1111_strides_0 = const()[name = string("op_1111_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_1111_pad_0 = const()[name = string("op_1111_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_1111_dilations_0 = const()[name = string("op_1111_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_1111_groups_0 = const()[name = string("op_1111_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 512, 1, 1]> decoder_pre_transformer_layers_2_mlp_up_proj_weight_to_fp16 = const()[name = string("decoder_pre_transformer_layers_2_mlp_up_proj_weight_to_fp16"), val = tensor<fp16, [1024, 512, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(41502272)))];
+            tensor<fp16, [1, 1024, 1, 1]> var_1111_cast_fp16 = conv(dilations = var_1111_dilations_0, groups = var_1111_groups_0, pad = var_1111_pad_0, pad_type = var_1111_pad_type_0, strides = var_1111_strides_0, weight = decoder_pre_transformer_layers_2_mlp_up_proj_weight_to_fp16, x = input_65_cast_fp16)[name = string("op_1111_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> input_69_cast_fp16 = mul(x = var_1105_cast_fp16, y = var_1111_cast_fp16)[name = string("input_69_cast_fp16")];
+            string x_19_pad_type_0 = const()[name = string("x_19_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> x_19_strides_0 = const()[name = string("x_19_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> x_19_pad_0 = const()[name = string("x_19_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> x_19_dilations_0 = const()[name = string("x_19_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 x_19_groups_0 = const()[name = string("x_19_groups_0"), val = int32(1)];
+            tensor<fp16, [512, 1024, 1, 1]> var_1122_weight_0_to_fp16 = const()[name = string("op_1122_weight_0_to_fp16"), val = tensor<fp16, [512, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(42550912)))];
+            tensor<fp16, [512]> var_1122_bias_0_to_fp16 = const()[name = string("op_1122_bias_0_to_fp16"), val = tensor<fp16, [512]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(43599552)))];
+            tensor<fp16, [1, 512, 1, 1]> var_1122_cast_fp16 = conv(bias = var_1122_bias_0_to_fp16, dilations = x_19_dilations_0, groups = x_19_groups_0, pad = x_19_pad_0, pad_type = x_19_pad_type_0, strides = x_19_strides_0, weight = var_1122_weight_0_to_fp16, x = input_69_cast_fp16)[name = string("op_1122_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 1]> inputs_13_cast_fp16 = add(x = inputs_11_cast_fp16, y = var_1122_cast_fp16)[name = string("inputs_13_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 1]> inputs_sq_13_cast_fp16 = mul(x = inputs_13_cast_fp16, y = inputs_13_cast_fp16)[name = string("inputs_sq_13_cast_fp16")];
+            tensor<int32, [1]> variance_13_axes_0 = const()[name = string("variance_13_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_13_keep_dims_0 = const()[name = string("variance_13_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 1, 1, 1]> variance_13_cast_fp16 = reduce_mean(axes = variance_13_axes_0, keep_dims = variance_13_keep_dims_0, x = inputs_sq_13_cast_fp16)[name = string("variance_13_cast_fp16")];
+            fp16 var_1138_to_fp16 = const()[name = string("op_1138_to_fp16"), val = fp16(0x1.5p-17)];
+            tensor<fp16, [1, 1, 1, 1]> var_1139_cast_fp16 = add(x = variance_13_cast_fp16, y = var_1138_to_fp16)[name = string("op_1139_cast_fp16")];
+            fp32 var_1140_epsilon_0 = const()[name = string("op_1140_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 1, 1, 1]> var_1140_cast_fp16 = rsqrt(epsilon = var_1140_epsilon_0, x = var_1139_cast_fp16)[name = string("op_1140_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 1]> hidden_states_13_cast_fp16 = mul(x = inputs_13_cast_fp16, y = var_1140_cast_fp16)[name = string("hidden_states_13_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 1]> w_13_to_fp16 = const()[name = string("w_13_to_fp16"), val = tensor<fp16, [1, 512, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(43600640)))];
+            tensor<fp16, [1, 512, 1, 1]> x_21_cast_fp16 = mul(x = w_13_to_fp16, y = hidden_states_13_cast_fp16)[name = string("x_21_cast_fp16")];
+            string q_25_pad_type_0 = const()[name = string("q_25_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> q_25_strides_0 = const()[name = string("q_25_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> q_25_pad_0 = const()[name = string("q_25_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> q_25_dilations_0 = const()[name = string("q_25_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 q_25_groups_0 = const()[name = string("q_25_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 512, 1, 1]> decoder_pre_transformer_layers_3_self_attn_q_proj_weight_to_fp16 = const()[name = string("decoder_pre_transformer_layers_3_self_attn_q_proj_weight_to_fp16"), val = tensor<fp16, [1024, 512, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(43601728)))];
+            tensor<fp16, [1, 1024, 1, 1]> q_25_cast_fp16 = conv(dilations = q_25_dilations_0, groups = q_25_groups_0, pad = q_25_pad_0, pad_type = q_25_pad_type_0, strides = q_25_strides_0, weight = decoder_pre_transformer_layers_3_self_attn_q_proj_weight_to_fp16, x = x_21_cast_fp16)[name = string("q_25_cast_fp16")];
+            string k_13_pad_type_0 = const()[name = string("k_13_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> k_13_strides_0 = const()[name = string("k_13_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> k_13_pad_0 = const()[name = string("k_13_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> k_13_dilations_0 = const()[name = string("k_13_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 k_13_groups_0 = const()[name = string("k_13_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 512, 1, 1]> decoder_pre_transformer_layers_3_self_attn_k_proj_weight_to_fp16 = const()[name = string("decoder_pre_transformer_layers_3_self_attn_k_proj_weight_to_fp16"), val = tensor<fp16, [1024, 512, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(44650368)))];
+            tensor<fp16, [1, 1024, 1, 1]> k_13_cast_fp16 = conv(dilations = k_13_dilations_0, groups = k_13_groups_0, pad = k_13_pad_0, pad_type = k_13_pad_type_0, strides = k_13_strides_0, weight = decoder_pre_transformer_layers_3_self_attn_k_proj_weight_to_fp16, x = x_21_cast_fp16)[name = string("k_13_cast_fp16")];
+            string v_7_pad_type_0 = const()[name = string("v_7_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> v_7_strides_0 = const()[name = string("v_7_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> v_7_pad_0 = const()[name = string("v_7_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> v_7_dilations_0 = const()[name = string("v_7_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 v_7_groups_0 = const()[name = string("v_7_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 512, 1, 1]> decoder_pre_transformer_layers_3_self_attn_v_proj_weight_to_fp16 = const()[name = string("decoder_pre_transformer_layers_3_self_attn_v_proj_weight_to_fp16"), val = tensor<fp16, [1024, 512, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(45699008)))];
+            tensor<fp16, [1, 1024, 1, 1]> v_7_cast_fp16 = conv(dilations = v_7_dilations_0, groups = v_7_groups_0, pad = v_7_pad_0, pad_type = v_7_pad_type_0, strides = v_7_strides_0, weight = decoder_pre_transformer_layers_3_self_attn_v_proj_weight_to_fp16, x = x_21_cast_fp16)[name = string("v_7_cast_fp16")];
+            tensor<int32, [4]> var_1172 = const()[name = string("op_1172"), val = tensor<int32, [4]>([16, 64, 1, 1])];
+            tensor<fp16, [16, 64, 1, 1]> q_27_cast_fp16 = reshape(shape = var_1172, x = q_25_cast_fp16)[name = string("q_27_cast_fp16")];
+            tensor<int32, [4]> var_1177 = const()[name = string("op_1177"), val = tensor<int32, [4]>([16, 64, 1, 1])];
+            tensor<fp16, [16, 64, 1, 1]> k_15_cast_fp16 = reshape(shape = var_1177, x = k_13_cast_fp16)[name = string("k_15_cast_fp16")];
+            tensor<fp16, [16, 64, 1, 1]> var_1189_cast_fp16 = mul(x = q_27_cast_fp16, y = cos_1_cast_fp16)[name = string("op_1189_cast_fp16")];
+            tensor<int32, [4]> var_1194_begin_0 = const()[name = string("op_1194_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_1194_end_0 = const()[name = string("op_1194_end_0"), val = tensor<int32, [4]>([16, 32, 1, 1])];
+            tensor<bool, [4]> var_1194_end_mask_0 = const()[name = string("op_1194_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [16, 32, 1, 1]> var_1194_cast_fp16 = slice_by_index(begin = var_1194_begin_0, end = var_1194_end_0, end_mask = var_1194_end_mask_0, x = q_27_cast_fp16)[name = string("op_1194_cast_fp16")];
+            tensor<int32, [4]> var_1201_begin_0 = const()[name = string("op_1201_begin_0"), val = tensor<int32, [4]>([0, 32, 0, 0])];
+            tensor<int32, [4]> var_1201_end_0 = const()[name = string("op_1201_end_0"), val = tensor<int32, [4]>([16, 64, 1, 1])];
+            tensor<bool, [4]> var_1201_end_mask_0 = const()[name = string("op_1201_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [16, 32, 1, 1]> var_1201_cast_fp16 = slice_by_index(begin = var_1201_begin_0, end = var_1201_end_0, end_mask = var_1201_end_mask_0, x = q_27_cast_fp16)[name = string("op_1201_cast_fp16")];
+            fp16 const_48_promoted_to_fp16 = const()[name = string("const_48_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [16, 32, 1, 1]> var_1204_cast_fp16 = mul(x = var_1201_cast_fp16, y = const_48_promoted_to_fp16)[name = string("op_1204_cast_fp16")];
+            bool var_1206_interleave_0 = const()[name = string("op_1206_interleave_0"), val = bool(false)];
+            tensor<fp16, [16, 64, 1, 1]> var_1206_cast_fp16 = concat(axis = var_32, interleave = var_1206_interleave_0, values = (var_1204_cast_fp16, var_1194_cast_fp16))[name = string("op_1206_cast_fp16")];
+            tensor<fp16, [16, 64, 1, 1]> var_1207_cast_fp16 = mul(x = var_1206_cast_fp16, y = sin_1_cast_fp16)[name = string("op_1207_cast_fp16")];
+            tensor<fp16, [16, 64, 1, 1]> q_rotated_7_cast_fp16 = add(x = var_1189_cast_fp16, y = var_1207_cast_fp16)[name = string("q_rotated_7_cast_fp16")];
+            tensor<fp16, [16, 64, 1, 1]> var_1209_cast_fp16 = mul(x = k_15_cast_fp16, y = cos_1_cast_fp16)[name = string("op_1209_cast_fp16")];
+            tensor<int32, [4]> var_1214_begin_0 = const()[name = string("op_1214_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_1214_end_0 = const()[name = string("op_1214_end_0"), val = tensor<int32, [4]>([16, 32, 1, 1])];
+            tensor<bool, [4]> var_1214_end_mask_0 = const()[name = string("op_1214_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [16, 32, 1, 1]> var_1214_cast_fp16 = slice_by_index(begin = var_1214_begin_0, end = var_1214_end_0, end_mask = var_1214_end_mask_0, x = k_15_cast_fp16)[name = string("op_1214_cast_fp16")];
+            tensor<int32, [4]> var_1221_begin_0 = const()[name = string("op_1221_begin_0"), val = tensor<int32, [4]>([0, 32, 0, 0])];
+            tensor<int32, [4]> var_1221_end_0 = const()[name = string("op_1221_end_0"), val = tensor<int32, [4]>([16, 64, 1, 1])];
+            tensor<bool, [4]> var_1221_end_mask_0 = const()[name = string("op_1221_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [16, 32, 1, 1]> var_1221_cast_fp16 = slice_by_index(begin = var_1221_begin_0, end = var_1221_end_0, end_mask = var_1221_end_mask_0, x = k_15_cast_fp16)[name = string("op_1221_cast_fp16")];
+            fp16 const_51_promoted_to_fp16 = const()[name = string("const_51_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [16, 32, 1, 1]> var_1224_cast_fp16 = mul(x = var_1221_cast_fp16, y = const_51_promoted_to_fp16)[name = string("op_1224_cast_fp16")];
+            bool var_1226_interleave_0 = const()[name = string("op_1226_interleave_0"), val = bool(false)];
+            tensor<fp16, [16, 64, 1, 1]> var_1226_cast_fp16 = concat(axis = var_32, interleave = var_1226_interleave_0, values = (var_1224_cast_fp16, var_1214_cast_fp16))[name = string("op_1226_cast_fp16")];
+            tensor<fp16, [16, 64, 1, 1]> var_1227_cast_fp16 = mul(x = var_1226_cast_fp16, y = sin_1_cast_fp16)[name = string("op_1227_cast_fp16")];
+            tensor<fp16, [16, 64, 1, 1]> k_rotated_7_cast_fp16 = add(x = var_1209_cast_fp16, y = var_1227_cast_fp16)[name = string("k_rotated_7_cast_fp16")];
+            tensor<int32, [4]> var_1231 = const()[name = string("op_1231"), val = tensor<int32, [4]>([1, 1024, 1, 1])];
+            tensor<fp16, [1, 1024, 1, 1]> current_key_7_cast_fp16 = reshape(shape = var_1231, x = k_rotated_7_cast_fp16)[name = string("current_key_7_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_1236_cast_fp16 = mul(x = var_522_cast_fp16_3, y = var_647_cast_fp16)[name = string("op_1236_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_1237_cast_fp16 = mul(x = current_key_7_cast_fp16, y = update_mask_1_cast_fp16)[name = string("op_1237_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> key_cache_updated_7_cast_fp16 = add(x = var_1236_cast_fp16, y = var_1237_cast_fp16)[name = string("key_cache_updated_7_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_1240_cast_fp16 = mul(x = var_531_cast_fp16_3, y = var_647_cast_fp16)[name = string("op_1240_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_1241_cast_fp16 = mul(x = v_7_cast_fp16, y = update_mask_1_cast_fp16)[name = string("op_1241_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> value_cache_updated_7_cast_fp16 = add(x = var_1240_cast_fp16, y = var_1241_cast_fp16)[name = string("value_cache_updated_7_cast_fp16")];
+            tensor<int32, [4]> var_1243 = const()[name = string("op_1243"), val = tensor<int32, [4]>([1, 16, 64, 1])];
+            tensor<fp16, [1, 16, 64, 1]> q_31_cast_fp16 = reshape(shape = var_1243, x = q_rotated_7_cast_fp16)[name = string("q_31_cast_fp16")];
+            tensor<int32, [4]> var_1246 = const()[name = string("op_1246"), val = tensor<int32, [4]>([1, 16, 64, 256])];
+            tensor<fp16, [1, 16, 64, 256]> k_for_attn_7_cast_fp16 = reshape(shape = var_1246, x = key_cache_updated_7_cast_fp16)[name = string("k_for_attn_7_cast_fp16")];
+            tensor<int32, [4]> var_1248 = const()[name = string("op_1248"), val = tensor<int32, [4]>([1, 16, 64, 256])];
+            tensor<fp16, [1, 16, 64, 256]> v_for_attn_7_cast_fp16 = reshape(shape = var_1248, x = value_cache_updated_7_cast_fp16)[name = string("v_for_attn_7_cast_fp16")];
+            bool var_1252_transpose_x_1 = const()[name = string("op_1252_transpose_x_1"), val = bool(true)];
+            bool var_1252_transpose_y_1 = const()[name = string("op_1252_transpose_y_1"), val = bool(false)];
+            tensor<fp16, [1, 16, 1, 256]> var_1252_cast_fp16 = matmul(transpose_x = var_1252_transpose_x_1, transpose_y = var_1252_transpose_y_1, x = q_31_cast_fp16, y = k_for_attn_7_cast_fp16)[name = string("op_1252_cast_fp16")];
+            fp16 var_1253_to_fp16 = const()[name = string("op_1253_to_fp16"), val = fp16(0x1p-3)];
+            tensor<fp16, [1, 16, 1, 256]> attn_weights_13_cast_fp16 = mul(x = var_1252_cast_fp16, y = var_1253_to_fp16)[name = string("attn_weights_13_cast_fp16")];
+            tensor<fp16, [1, 16, 1, 256]> input_71_cast_fp16 = add(x = attn_weights_13_cast_fp16, y = attn_mask_1_cast_fp16)[name = string("input_71_cast_fp16")];
+            tensor<fp16, [1, 16, 1, 256]> attn_weights_15_cast_fp16 = softmax(axis = var_28, x = input_71_cast_fp16)[name = string("attn_weights_15_cast_fp16")];
+            bool attn_output_13_transpose_x_1 = const()[name = string("attn_output_13_transpose_x_1"), val = bool(false)];
+            bool attn_output_13_transpose_y_1 = const()[name = string("attn_output_13_transpose_y_1"), val = bool(true)];
+            tensor<fp16, [1, 16, 1, 64]> attn_output_13_cast_fp16 = matmul(transpose_x = attn_output_13_transpose_x_1, transpose_y = attn_output_13_transpose_y_1, x = attn_weights_15_cast_fp16, y = v_for_attn_7_cast_fp16)[name = string("attn_output_13_cast_fp16")];
+            tensor<int32, [4]> var_1262 = const()[name = string("op_1262"), val = tensor<int32, [4]>([0, 1, 3, 2])];
+            tensor<int32, [4]> var_1264 = const()[name = string("op_1264"), val = tensor<int32, [4]>([1, -1, 1, 1])];
+            tensor<fp16, [1, 16, 64, 1]> attn_output_15_cast_fp16 = transpose(perm = var_1262, x = attn_output_13_cast_fp16)[name = string("transpose_8")];
+            tensor<fp16, [1, 1024, 1, 1]> input_73_cast_fp16 = reshape(shape = var_1264, x = attn_output_15_cast_fp16)[name = string("input_73_cast_fp16")];
+            string x_23_pad_type_0 = const()[name = string("x_23_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> x_23_strides_0 = const()[name = string("x_23_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> x_23_pad_0 = const()[name = string("x_23_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> x_23_dilations_0 = const()[name = string("x_23_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 x_23_groups_0 = const()[name = string("x_23_groups_0"), val = int32(1)];
+            tensor<fp16, [512, 1024, 1, 1]> var_1279_weight_0_to_fp16 = const()[name = string("op_1279_weight_0_to_fp16"), val = tensor<fp16, [512, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(46747648)))];
+            tensor<fp16, [512]> var_1279_bias_0_to_fp16 = const()[name = string("op_1279_bias_0_to_fp16"), val = tensor<fp16, [512]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(47796288)))];
+            tensor<fp16, [1, 512, 1, 1]> var_1279_cast_fp16 = conv(bias = var_1279_bias_0_to_fp16, dilations = x_23_dilations_0, groups = x_23_groups_0, pad = x_23_pad_0, pad_type = x_23_pad_type_0, strides = x_23_strides_0, weight = var_1279_weight_0_to_fp16, x = input_73_cast_fp16)[name = string("op_1279_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 1]> inputs_15_cast_fp16 = add(x = inputs_13_cast_fp16, y = var_1279_cast_fp16)[name = string("inputs_15_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 1]> inputs_sq_15_cast_fp16 = mul(x = inputs_15_cast_fp16, y = inputs_15_cast_fp16)[name = string("inputs_sq_15_cast_fp16")];
+            tensor<int32, [1]> variance_15_axes_0 = const()[name = string("variance_15_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_15_keep_dims_0 = const()[name = string("variance_15_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 1, 1, 1]> variance_15_cast_fp16 = reduce_mean(axes = variance_15_axes_0, keep_dims = variance_15_keep_dims_0, x = inputs_sq_15_cast_fp16)[name = string("variance_15_cast_fp16")];
+            fp16 var_1285_to_fp16 = const()[name = string("op_1285_to_fp16"), val = fp16(0x1.5p-17)];
+            tensor<fp16, [1, 1, 1, 1]> var_1286_cast_fp16 = add(x = variance_15_cast_fp16, y = var_1285_to_fp16)[name = string("op_1286_cast_fp16")];
+            fp32 var_1287_epsilon_0 = const()[name = string("op_1287_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 1, 1, 1]> var_1287_cast_fp16 = rsqrt(epsilon = var_1287_epsilon_0, x = var_1286_cast_fp16)[name = string("op_1287_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 1]> hidden_states_15_cast_fp16 = mul(x = inputs_15_cast_fp16, y = var_1287_cast_fp16)[name = string("hidden_states_15_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 1]> w_15_to_fp16 = const()[name = string("w_15_to_fp16"), val = tensor<fp16, [1, 512, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(47797376)))];
+            tensor<fp16, [1, 512, 1, 1]> input_75_cast_fp16 = mul(x = w_15_to_fp16, y = hidden_states_15_cast_fp16)[name = string("input_75_cast_fp16")];
+            string input_77_pad_type_0 = const()[name = string("input_77_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> input_77_strides_0 = const()[name = string("input_77_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> input_77_pad_0 = const()[name = string("input_77_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> input_77_dilations_0 = const()[name = string("input_77_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 input_77_groups_0 = const()[name = string("input_77_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 512, 1, 1]> decoder_pre_transformer_layers_3_mlp_gate_proj_weight_to_fp16 = const()[name = string("decoder_pre_transformer_layers_3_mlp_gate_proj_weight_to_fp16"), val = tensor<fp16, [1024, 512, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(47798464)))];
+            tensor<fp16, [1, 1024, 1, 1]> input_77_cast_fp16 = conv(dilations = input_77_dilations_0, groups = input_77_groups_0, pad = input_77_pad_0, pad_type = input_77_pad_type_0, strides = input_77_strides_0, weight = decoder_pre_transformer_layers_3_mlp_gate_proj_weight_to_fp16, x = input_75_cast_fp16)[name = string("input_77_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> var_1301_cast_fp16 = silu(x = input_77_cast_fp16)[name = string("op_1301_cast_fp16")];
+            string var_1307_pad_type_0 = const()[name = string("op_1307_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_1307_strides_0 = const()[name = string("op_1307_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_1307_pad_0 = const()[name = string("op_1307_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_1307_dilations_0 = const()[name = string("op_1307_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_1307_groups_0 = const()[name = string("op_1307_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 512, 1, 1]> decoder_pre_transformer_layers_3_mlp_up_proj_weight_to_fp16 = const()[name = string("decoder_pre_transformer_layers_3_mlp_up_proj_weight_to_fp16"), val = tensor<fp16, [1024, 512, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(48847104)))];
+            tensor<fp16, [1, 1024, 1, 1]> var_1307_cast_fp16 = conv(dilations = var_1307_dilations_0, groups = var_1307_groups_0, pad = var_1307_pad_0, pad_type = var_1307_pad_type_0, strides = var_1307_strides_0, weight = decoder_pre_transformer_layers_3_mlp_up_proj_weight_to_fp16, x = input_75_cast_fp16)[name = string("op_1307_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> input_79_cast_fp16 = mul(x = var_1301_cast_fp16, y = var_1307_cast_fp16)[name = string("input_79_cast_fp16")];
+            string x_25_pad_type_0 = const()[name = string("x_25_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> x_25_strides_0 = const()[name = string("x_25_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> x_25_pad_0 = const()[name = string("x_25_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> x_25_dilations_0 = const()[name = string("x_25_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 x_25_groups_0 = const()[name = string("x_25_groups_0"), val = int32(1)];
+            tensor<fp16, [512, 1024, 1, 1]> var_1318_weight_0_to_fp16 = const()[name = string("op_1318_weight_0_to_fp16"), val = tensor<fp16, [512, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(49895744)))];
+            tensor<fp16, [512]> var_1318_bias_0_to_fp16 = const()[name = string("op_1318_bias_0_to_fp16"), val = tensor<fp16, [512]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(50944384)))];
+            tensor<fp16, [1, 512, 1, 1]> var_1318_cast_fp16 = conv(bias = var_1318_bias_0_to_fp16, dilations = x_25_dilations_0, groups = x_25_groups_0, pad = x_25_pad_0, pad_type = x_25_pad_type_0, strides = x_25_strides_0, weight = var_1318_weight_0_to_fp16, x = input_79_cast_fp16)[name = string("op_1318_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 1]> inputs_17_cast_fp16 = add(x = inputs_15_cast_fp16, y = var_1318_cast_fp16)[name = string("inputs_17_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 1]> inputs_sq_17_cast_fp16 = mul(x = inputs_17_cast_fp16, y = inputs_17_cast_fp16)[name = string("inputs_sq_17_cast_fp16")];
+            tensor<int32, [1]> variance_17_axes_0 = const()[name = string("variance_17_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_17_keep_dims_0 = const()[name = string("variance_17_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 1, 1, 1]> variance_17_cast_fp16 = reduce_mean(axes = variance_17_axes_0, keep_dims = variance_17_keep_dims_0, x = inputs_sq_17_cast_fp16)[name = string("variance_17_cast_fp16")];
+            fp16 var_1334_to_fp16 = const()[name = string("op_1334_to_fp16"), val = fp16(0x1.5p-17)];
+            tensor<fp16, [1, 1, 1, 1]> var_1335_cast_fp16 = add(x = variance_17_cast_fp16, y = var_1334_to_fp16)[name = string("op_1335_cast_fp16")];
+            fp32 var_1336_epsilon_0 = const()[name = string("op_1336_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 1, 1, 1]> var_1336_cast_fp16 = rsqrt(epsilon = var_1336_epsilon_0, x = var_1335_cast_fp16)[name = string("op_1336_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 1]> hidden_states_17_cast_fp16 = mul(x = inputs_17_cast_fp16, y = var_1336_cast_fp16)[name = string("hidden_states_17_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 1]> w_17_to_fp16 = const()[name = string("w_17_to_fp16"), val = tensor<fp16, [1, 512, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(50945472)))];
+            tensor<fp16, [1, 512, 1, 1]> x_27_cast_fp16 = mul(x = w_17_to_fp16, y = hidden_states_17_cast_fp16)[name = string("x_27_cast_fp16")];
+            string q_33_pad_type_0 = const()[name = string("q_33_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> q_33_strides_0 = const()[name = string("q_33_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> q_33_pad_0 = const()[name = string("q_33_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> q_33_dilations_0 = const()[name = string("q_33_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 q_33_groups_0 = const()[name = string("q_33_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 512, 1, 1]> decoder_pre_transformer_layers_4_self_attn_q_proj_weight_to_fp16 = const()[name = string("decoder_pre_transformer_layers_4_self_attn_q_proj_weight_to_fp16"), val = tensor<fp16, [1024, 512, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(50946560)))];
+            tensor<fp16, [1, 1024, 1, 1]> q_33_cast_fp16 = conv(dilations = q_33_dilations_0, groups = q_33_groups_0, pad = q_33_pad_0, pad_type = q_33_pad_type_0, strides = q_33_strides_0, weight = decoder_pre_transformer_layers_4_self_attn_q_proj_weight_to_fp16, x = x_27_cast_fp16)[name = string("q_33_cast_fp16")];
+            string k_17_pad_type_0 = const()[name = string("k_17_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> k_17_strides_0 = const()[name = string("k_17_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> k_17_pad_0 = const()[name = string("k_17_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> k_17_dilations_0 = const()[name = string("k_17_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 k_17_groups_0 = const()[name = string("k_17_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 512, 1, 1]> decoder_pre_transformer_layers_4_self_attn_k_proj_weight_to_fp16 = const()[name = string("decoder_pre_transformer_layers_4_self_attn_k_proj_weight_to_fp16"), val = tensor<fp16, [1024, 512, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(51995200)))];
+            tensor<fp16, [1, 1024, 1, 1]> k_17_cast_fp16 = conv(dilations = k_17_dilations_0, groups = k_17_groups_0, pad = k_17_pad_0, pad_type = k_17_pad_type_0, strides = k_17_strides_0, weight = decoder_pre_transformer_layers_4_self_attn_k_proj_weight_to_fp16, x = x_27_cast_fp16)[name = string("k_17_cast_fp16")];
+            string v_9_pad_type_0 = const()[name = string("v_9_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> v_9_strides_0 = const()[name = string("v_9_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> v_9_pad_0 = const()[name = string("v_9_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> v_9_dilations_0 = const()[name = string("v_9_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 v_9_groups_0 = const()[name = string("v_9_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 512, 1, 1]> decoder_pre_transformer_layers_4_self_attn_v_proj_weight_to_fp16 = const()[name = string("decoder_pre_transformer_layers_4_self_attn_v_proj_weight_to_fp16"), val = tensor<fp16, [1024, 512, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(53043840)))];
+            tensor<fp16, [1, 1024, 1, 1]> v_9_cast_fp16 = conv(dilations = v_9_dilations_0, groups = v_9_groups_0, pad = v_9_pad_0, pad_type = v_9_pad_type_0, strides = v_9_strides_0, weight = decoder_pre_transformer_layers_4_self_attn_v_proj_weight_to_fp16, x = x_27_cast_fp16)[name = string("v_9_cast_fp16")];
+            tensor<int32, [4]> var_1368 = const()[name = string("op_1368"), val = tensor<int32, [4]>([16, 64, 1, 1])];
+            tensor<fp16, [16, 64, 1, 1]> q_35_cast_fp16 = reshape(shape = var_1368, x = q_33_cast_fp16)[name = string("q_35_cast_fp16")];
+            tensor<int32, [4]> var_1373 = const()[name = string("op_1373"), val = tensor<int32, [4]>([16, 64, 1, 1])];
+            tensor<fp16, [16, 64, 1, 1]> k_19_cast_fp16 = reshape(shape = var_1373, x = k_17_cast_fp16)[name = string("k_19_cast_fp16")];
+            tensor<fp16, [16, 64, 1, 1]> var_1385_cast_fp16 = mul(x = q_35_cast_fp16, y = cos_1_cast_fp16)[name = string("op_1385_cast_fp16")];
+            tensor<int32, [4]> var_1390_begin_0 = const()[name = string("op_1390_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_1390_end_0 = const()[name = string("op_1390_end_0"), val = tensor<int32, [4]>([16, 32, 1, 1])];
+            tensor<bool, [4]> var_1390_end_mask_0 = const()[name = string("op_1390_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [16, 32, 1, 1]> var_1390_cast_fp16 = slice_by_index(begin = var_1390_begin_0, end = var_1390_end_0, end_mask = var_1390_end_mask_0, x = q_35_cast_fp16)[name = string("op_1390_cast_fp16")];
+            tensor<int32, [4]> var_1397_begin_0 = const()[name = string("op_1397_begin_0"), val = tensor<int32, [4]>([0, 32, 0, 0])];
+            tensor<int32, [4]> var_1397_end_0 = const()[name = string("op_1397_end_0"), val = tensor<int32, [4]>([16, 64, 1, 1])];
+            tensor<bool, [4]> var_1397_end_mask_0 = const()[name = string("op_1397_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [16, 32, 1, 1]> var_1397_cast_fp16 = slice_by_index(begin = var_1397_begin_0, end = var_1397_end_0, end_mask = var_1397_end_mask_0, x = q_35_cast_fp16)[name = string("op_1397_cast_fp16")];
+            fp16 const_56_promoted_to_fp16 = const()[name = string("const_56_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [16, 32, 1, 1]> var_1400_cast_fp16 = mul(x = var_1397_cast_fp16, y = const_56_promoted_to_fp16)[name = string("op_1400_cast_fp16")];
+            bool var_1402_interleave_0 = const()[name = string("op_1402_interleave_0"), val = bool(false)];
+            tensor<fp16, [16, 64, 1, 1]> var_1402_cast_fp16 = concat(axis = var_32, interleave = var_1402_interleave_0, values = (var_1400_cast_fp16, var_1390_cast_fp16))[name = string("op_1402_cast_fp16")];
+            tensor<fp16, [16, 64, 1, 1]> var_1403_cast_fp16 = mul(x = var_1402_cast_fp16, y = sin_1_cast_fp16)[name = string("op_1403_cast_fp16")];
+            tensor<fp16, [16, 64, 1, 1]> q_rotated_9_cast_fp16 = add(x = var_1385_cast_fp16, y = var_1403_cast_fp16)[name = string("q_rotated_9_cast_fp16")];
+            tensor<fp16, [16, 64, 1, 1]> var_1405_cast_fp16 = mul(x = k_19_cast_fp16, y = cos_1_cast_fp16)[name = string("op_1405_cast_fp16")];
+            tensor<int32, [4]> var_1410_begin_0 = const()[name = string("op_1410_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_1410_end_0 = const()[name = string("op_1410_end_0"), val = tensor<int32, [4]>([16, 32, 1, 1])];
+            tensor<bool, [4]> var_1410_end_mask_0 = const()[name = string("op_1410_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [16, 32, 1, 1]> var_1410_cast_fp16 = slice_by_index(begin = var_1410_begin_0, end = var_1410_end_0, end_mask = var_1410_end_mask_0, x = k_19_cast_fp16)[name = string("op_1410_cast_fp16")];
+            tensor<int32, [4]> var_1417_begin_0 = const()[name = string("op_1417_begin_0"), val = tensor<int32, [4]>([0, 32, 0, 0])];
+            tensor<int32, [4]> var_1417_end_0 = const()[name = string("op_1417_end_0"), val = tensor<int32, [4]>([16, 64, 1, 1])];
+            tensor<bool, [4]> var_1417_end_mask_0 = const()[name = string("op_1417_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [16, 32, 1, 1]> var_1417_cast_fp16 = slice_by_index(begin = var_1417_begin_0, end = var_1417_end_0, end_mask = var_1417_end_mask_0, x = k_19_cast_fp16)[name = string("op_1417_cast_fp16")];
+            fp16 const_59_promoted_to_fp16 = const()[name = string("const_59_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [16, 32, 1, 1]> var_1420_cast_fp16 = mul(x = var_1417_cast_fp16, y = const_59_promoted_to_fp16)[name = string("op_1420_cast_fp16")];
+            bool var_1422_interleave_0 = const()[name = string("op_1422_interleave_0"), val = bool(false)];
+            tensor<fp16, [16, 64, 1, 1]> var_1422_cast_fp16 = concat(axis = var_32, interleave = var_1422_interleave_0, values = (var_1420_cast_fp16, var_1410_cast_fp16))[name = string("op_1422_cast_fp16")];
+            tensor<fp16, [16, 64, 1, 1]> var_1423_cast_fp16 = mul(x = var_1422_cast_fp16, y = sin_1_cast_fp16)[name = string("op_1423_cast_fp16")];
+            tensor<fp16, [16, 64, 1, 1]> k_rotated_9_cast_fp16 = add(x = var_1405_cast_fp16, y = var_1423_cast_fp16)[name = string("k_rotated_9_cast_fp16")];
+            tensor<int32, [4]> var_1427 = const()[name = string("op_1427"), val = tensor<int32, [4]>([1, 1024, 1, 1])];
+            tensor<fp16, [1, 1024, 1, 1]> current_key_9_cast_fp16 = reshape(shape = var_1427, x = k_rotated_9_cast_fp16)[name = string("current_key_9_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_1432_cast_fp16 = mul(x = var_522_cast_fp16_4, y = var_647_cast_fp16)[name = string("op_1432_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_1433_cast_fp16 = mul(x = current_key_9_cast_fp16, y = update_mask_1_cast_fp16)[name = string("op_1433_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> key_cache_updated_9_cast_fp16 = add(x = var_1432_cast_fp16, y = var_1433_cast_fp16)[name = string("key_cache_updated_9_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_1436_cast_fp16 = mul(x = var_531_cast_fp16_4, y = var_647_cast_fp16)[name = string("op_1436_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_1437_cast_fp16 = mul(x = v_9_cast_fp16, y = update_mask_1_cast_fp16)[name = string("op_1437_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> value_cache_updated_9_cast_fp16 = add(x = var_1436_cast_fp16, y = var_1437_cast_fp16)[name = string("value_cache_updated_9_cast_fp16")];
+            tensor<int32, [4]> var_1439 = const()[name = string("op_1439"), val = tensor<int32, [4]>([1, 16, 64, 1])];
+            tensor<fp16, [1, 16, 64, 1]> q_39_cast_fp16 = reshape(shape = var_1439, x = q_rotated_9_cast_fp16)[name = string("q_39_cast_fp16")];
+            tensor<int32, [4]> var_1442 = const()[name = string("op_1442"), val = tensor<int32, [4]>([1, 16, 64, 256])];
+            tensor<fp16, [1, 16, 64, 256]> k_for_attn_9_cast_fp16 = reshape(shape = var_1442, x = key_cache_updated_9_cast_fp16)[name = string("k_for_attn_9_cast_fp16")];
+            tensor<int32, [4]> var_1444 = const()[name = string("op_1444"), val = tensor<int32, [4]>([1, 16, 64, 256])];
+            tensor<fp16, [1, 16, 64, 256]> v_for_attn_9_cast_fp16 = reshape(shape = var_1444, x = value_cache_updated_9_cast_fp16)[name = string("v_for_attn_9_cast_fp16")];
+            bool var_1448_transpose_x_1 = const()[name = string("op_1448_transpose_x_1"), val = bool(true)];
+            bool var_1448_transpose_y_1 = const()[name = string("op_1448_transpose_y_1"), val = bool(false)];
+            tensor<fp16, [1, 16, 1, 256]> var_1448_cast_fp16 = matmul(transpose_x = var_1448_transpose_x_1, transpose_y = var_1448_transpose_y_1, x = q_39_cast_fp16, y = k_for_attn_9_cast_fp16)[name = string("op_1448_cast_fp16")];
+            fp16 var_1449_to_fp16 = const()[name = string("op_1449_to_fp16"), val = fp16(0x1p-3)];
+            tensor<fp16, [1, 16, 1, 256]> attn_weights_17_cast_fp16 = mul(x = var_1448_cast_fp16, y = var_1449_to_fp16)[name = string("attn_weights_17_cast_fp16")];
+            tensor<fp16, [1, 16, 1, 256]> input_81_cast_fp16 = add(x = attn_weights_17_cast_fp16, y = attn_mask_1_cast_fp16)[name = string("input_81_cast_fp16")];
+            tensor<fp16, [1, 16, 1, 256]> attn_weights_19_cast_fp16 = softmax(axis = var_28, x = input_81_cast_fp16)[name = string("attn_weights_19_cast_fp16")];
+            bool attn_output_17_transpose_x_1 = const()[name = string("attn_output_17_transpose_x_1"), val = bool(false)];
+            bool attn_output_17_transpose_y_1 = const()[name = string("attn_output_17_transpose_y_1"), val = bool(true)];
+            tensor<fp16, [1, 16, 1, 64]> attn_output_17_cast_fp16 = matmul(transpose_x = attn_output_17_transpose_x_1, transpose_y = attn_output_17_transpose_y_1, x = attn_weights_19_cast_fp16, y = v_for_attn_9_cast_fp16)[name = string("attn_output_17_cast_fp16")];
+            tensor<int32, [4]> var_1458 = const()[name = string("op_1458"), val = tensor<int32, [4]>([0, 1, 3, 2])];
+            tensor<int32, [4]> var_1460 = const()[name = string("op_1460"), val = tensor<int32, [4]>([1, -1, 1, 1])];
+            tensor<fp16, [1, 16, 64, 1]> attn_output_19_cast_fp16 = transpose(perm = var_1458, x = attn_output_17_cast_fp16)[name = string("transpose_7")];
+            tensor<fp16, [1, 1024, 1, 1]> input_83_cast_fp16 = reshape(shape = var_1460, x = attn_output_19_cast_fp16)[name = string("input_83_cast_fp16")];
+            string x_29_pad_type_0 = const()[name = string("x_29_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> x_29_strides_0 = const()[name = string("x_29_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> x_29_pad_0 = const()[name = string("x_29_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> x_29_dilations_0 = const()[name = string("x_29_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 x_29_groups_0 = const()[name = string("x_29_groups_0"), val = int32(1)];
+            tensor<fp16, [512, 1024, 1, 1]> var_1475_weight_0_to_fp16 = const()[name = string("op_1475_weight_0_to_fp16"), val = tensor<fp16, [512, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(54092480)))];
+            tensor<fp16, [512]> var_1475_bias_0_to_fp16 = const()[name = string("op_1475_bias_0_to_fp16"), val = tensor<fp16, [512]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(55141120)))];
+            tensor<fp16, [1, 512, 1, 1]> var_1475_cast_fp16 = conv(bias = var_1475_bias_0_to_fp16, dilations = x_29_dilations_0, groups = x_29_groups_0, pad = x_29_pad_0, pad_type = x_29_pad_type_0, strides = x_29_strides_0, weight = var_1475_weight_0_to_fp16, x = input_83_cast_fp16)[name = string("op_1475_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 1]> inputs_19_cast_fp16 = add(x = inputs_17_cast_fp16, y = var_1475_cast_fp16)[name = string("inputs_19_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 1]> inputs_sq_19_cast_fp16 = mul(x = inputs_19_cast_fp16, y = inputs_19_cast_fp16)[name = string("inputs_sq_19_cast_fp16")];
+            tensor<int32, [1]> variance_19_axes_0 = const()[name = string("variance_19_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_19_keep_dims_0 = const()[name = string("variance_19_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 1, 1, 1]> variance_19_cast_fp16 = reduce_mean(axes = variance_19_axes_0, keep_dims = variance_19_keep_dims_0, x = inputs_sq_19_cast_fp16)[name = string("variance_19_cast_fp16")];
+            fp16 var_1481_to_fp16 = const()[name = string("op_1481_to_fp16"), val = fp16(0x1.5p-17)];
+            tensor<fp16, [1, 1, 1, 1]> var_1482_cast_fp16 = add(x = variance_19_cast_fp16, y = var_1481_to_fp16)[name = string("op_1482_cast_fp16")];
+            fp32 var_1483_epsilon_0 = const()[name = string("op_1483_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 1, 1, 1]> var_1483_cast_fp16 = rsqrt(epsilon = var_1483_epsilon_0, x = var_1482_cast_fp16)[name = string("op_1483_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 1]> hidden_states_19_cast_fp16 = mul(x = inputs_19_cast_fp16, y = var_1483_cast_fp16)[name = string("hidden_states_19_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 1]> w_19_to_fp16 = const()[name = string("w_19_to_fp16"), val = tensor<fp16, [1, 512, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(55142208)))];
+            tensor<fp16, [1, 512, 1, 1]> input_85_cast_fp16 = mul(x = w_19_to_fp16, y = hidden_states_19_cast_fp16)[name = string("input_85_cast_fp16")];
+            string input_87_pad_type_0 = const()[name = string("input_87_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> input_87_strides_0 = const()[name = string("input_87_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> input_87_pad_0 = const()[name = string("input_87_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> input_87_dilations_0 = const()[name = string("input_87_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 input_87_groups_0 = const()[name = string("input_87_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 512, 1, 1]> decoder_pre_transformer_layers_4_mlp_gate_proj_weight_to_fp16 = const()[name = string("decoder_pre_transformer_layers_4_mlp_gate_proj_weight_to_fp16"), val = tensor<fp16, [1024, 512, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(55143296)))];
+            tensor<fp16, [1, 1024, 1, 1]> input_87_cast_fp16 = conv(dilations = input_87_dilations_0, groups = input_87_groups_0, pad = input_87_pad_0, pad_type = input_87_pad_type_0, strides = input_87_strides_0, weight = decoder_pre_transformer_layers_4_mlp_gate_proj_weight_to_fp16, x = input_85_cast_fp16)[name = string("input_87_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> var_1497_cast_fp16 = silu(x = input_87_cast_fp16)[name = string("op_1497_cast_fp16")];
+            string var_1503_pad_type_0 = const()[name = string("op_1503_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_1503_strides_0 = const()[name = string("op_1503_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_1503_pad_0 = const()[name = string("op_1503_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_1503_dilations_0 = const()[name = string("op_1503_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_1503_groups_0 = const()[name = string("op_1503_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 512, 1, 1]> decoder_pre_transformer_layers_4_mlp_up_proj_weight_to_fp16 = const()[name = string("decoder_pre_transformer_layers_4_mlp_up_proj_weight_to_fp16"), val = tensor<fp16, [1024, 512, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(56191936)))];
+            tensor<fp16, [1, 1024, 1, 1]> var_1503_cast_fp16 = conv(dilations = var_1503_dilations_0, groups = var_1503_groups_0, pad = var_1503_pad_0, pad_type = var_1503_pad_type_0, strides = var_1503_strides_0, weight = decoder_pre_transformer_layers_4_mlp_up_proj_weight_to_fp16, x = input_85_cast_fp16)[name = string("op_1503_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> input_89_cast_fp16 = mul(x = var_1497_cast_fp16, y = var_1503_cast_fp16)[name = string("input_89_cast_fp16")];
+            string x_31_pad_type_0 = const()[name = string("x_31_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> x_31_strides_0 = const()[name = string("x_31_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> x_31_pad_0 = const()[name = string("x_31_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> x_31_dilations_0 = const()[name = string("x_31_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 x_31_groups_0 = const()[name = string("x_31_groups_0"), val = int32(1)];
+            tensor<fp16, [512, 1024, 1, 1]> var_1514_weight_0_to_fp16 = const()[name = string("op_1514_weight_0_to_fp16"), val = tensor<fp16, [512, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(57240576)))];
+            tensor<fp16, [512]> var_1514_bias_0_to_fp16 = const()[name = string("op_1514_bias_0_to_fp16"), val = tensor<fp16, [512]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(58289216)))];
+            tensor<fp16, [1, 512, 1, 1]> var_1514_cast_fp16 = conv(bias = var_1514_bias_0_to_fp16, dilations = x_31_dilations_0, groups = x_31_groups_0, pad = x_31_pad_0, pad_type = x_31_pad_type_0, strides = x_31_strides_0, weight = var_1514_weight_0_to_fp16, x = input_89_cast_fp16)[name = string("op_1514_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 1]> inputs_21_cast_fp16 = add(x = inputs_19_cast_fp16, y = var_1514_cast_fp16)[name = string("inputs_21_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 1]> inputs_sq_21_cast_fp16 = mul(x = inputs_21_cast_fp16, y = inputs_21_cast_fp16)[name = string("inputs_sq_21_cast_fp16")];
+            tensor<int32, [1]> variance_21_axes_0 = const()[name = string("variance_21_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_21_keep_dims_0 = const()[name = string("variance_21_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 1, 1, 1]> variance_21_cast_fp16 = reduce_mean(axes = variance_21_axes_0, keep_dims = variance_21_keep_dims_0, x = inputs_sq_21_cast_fp16)[name = string("variance_21_cast_fp16")];
+            fp16 var_1530_to_fp16 = const()[name = string("op_1530_to_fp16"), val = fp16(0x1.5p-17)];
+            tensor<fp16, [1, 1, 1, 1]> var_1531_cast_fp16 = add(x = variance_21_cast_fp16, y = var_1530_to_fp16)[name = string("op_1531_cast_fp16")];
+            fp32 var_1532_epsilon_0 = const()[name = string("op_1532_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 1, 1, 1]> var_1532_cast_fp16 = rsqrt(epsilon = var_1532_epsilon_0, x = var_1531_cast_fp16)[name = string("op_1532_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 1]> hidden_states_21_cast_fp16 = mul(x = inputs_21_cast_fp16, y = var_1532_cast_fp16)[name = string("hidden_states_21_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 1]> w_21_to_fp16 = const()[name = string("w_21_to_fp16"), val = tensor<fp16, [1, 512, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(58290304)))];
+            tensor<fp16, [1, 512, 1, 1]> x_33_cast_fp16 = mul(x = w_21_to_fp16, y = hidden_states_21_cast_fp16)[name = string("x_33_cast_fp16")];
+            string q_41_pad_type_0 = const()[name = string("q_41_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> q_41_strides_0 = const()[name = string("q_41_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> q_41_pad_0 = const()[name = string("q_41_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> q_41_dilations_0 = const()[name = string("q_41_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 q_41_groups_0 = const()[name = string("q_41_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 512, 1, 1]> decoder_pre_transformer_layers_5_self_attn_q_proj_weight_to_fp16 = const()[name = string("decoder_pre_transformer_layers_5_self_attn_q_proj_weight_to_fp16"), val = tensor<fp16, [1024, 512, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(58291392)))];
+            tensor<fp16, [1, 1024, 1, 1]> q_41_cast_fp16 = conv(dilations = q_41_dilations_0, groups = q_41_groups_0, pad = q_41_pad_0, pad_type = q_41_pad_type_0, strides = q_41_strides_0, weight = decoder_pre_transformer_layers_5_self_attn_q_proj_weight_to_fp16, x = x_33_cast_fp16)[name = string("q_41_cast_fp16")];
+            string k_21_pad_type_0 = const()[name = string("k_21_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> k_21_strides_0 = const()[name = string("k_21_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> k_21_pad_0 = const()[name = string("k_21_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> k_21_dilations_0 = const()[name = string("k_21_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 k_21_groups_0 = const()[name = string("k_21_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 512, 1, 1]> decoder_pre_transformer_layers_5_self_attn_k_proj_weight_to_fp16 = const()[name = string("decoder_pre_transformer_layers_5_self_attn_k_proj_weight_to_fp16"), val = tensor<fp16, [1024, 512, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(59340032)))];
+            tensor<fp16, [1, 1024, 1, 1]> k_21_cast_fp16 = conv(dilations = k_21_dilations_0, groups = k_21_groups_0, pad = k_21_pad_0, pad_type = k_21_pad_type_0, strides = k_21_strides_0, weight = decoder_pre_transformer_layers_5_self_attn_k_proj_weight_to_fp16, x = x_33_cast_fp16)[name = string("k_21_cast_fp16")];
+            string v_11_pad_type_0 = const()[name = string("v_11_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> v_11_strides_0 = const()[name = string("v_11_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> v_11_pad_0 = const()[name = string("v_11_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> v_11_dilations_0 = const()[name = string("v_11_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 v_11_groups_0 = const()[name = string("v_11_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 512, 1, 1]> decoder_pre_transformer_layers_5_self_attn_v_proj_weight_to_fp16 = const()[name = string("decoder_pre_transformer_layers_5_self_attn_v_proj_weight_to_fp16"), val = tensor<fp16, [1024, 512, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(60388672)))];
+            tensor<fp16, [1, 1024, 1, 1]> v_11_cast_fp16 = conv(dilations = v_11_dilations_0, groups = v_11_groups_0, pad = v_11_pad_0, pad_type = v_11_pad_type_0, strides = v_11_strides_0, weight = decoder_pre_transformer_layers_5_self_attn_v_proj_weight_to_fp16, x = x_33_cast_fp16)[name = string("v_11_cast_fp16")];
+            tensor<int32, [4]> var_1564 = const()[name = string("op_1564"), val = tensor<int32, [4]>([16, 64, 1, 1])];
+            tensor<fp16, [16, 64, 1, 1]> q_43_cast_fp16 = reshape(shape = var_1564, x = q_41_cast_fp16)[name = string("q_43_cast_fp16")];
+            tensor<int32, [4]> var_1569 = const()[name = string("op_1569"), val = tensor<int32, [4]>([16, 64, 1, 1])];
+            tensor<fp16, [16, 64, 1, 1]> k_23_cast_fp16 = reshape(shape = var_1569, x = k_21_cast_fp16)[name = string("k_23_cast_fp16")];
+            tensor<fp16, [16, 64, 1, 1]> var_1581_cast_fp16 = mul(x = q_43_cast_fp16, y = cos_1_cast_fp16)[name = string("op_1581_cast_fp16")];
+            tensor<int32, [4]> var_1586_begin_0 = const()[name = string("op_1586_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_1586_end_0 = const()[name = string("op_1586_end_0"), val = tensor<int32, [4]>([16, 32, 1, 1])];
+            tensor<bool, [4]> var_1586_end_mask_0 = const()[name = string("op_1586_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [16, 32, 1, 1]> var_1586_cast_fp16 = slice_by_index(begin = var_1586_begin_0, end = var_1586_end_0, end_mask = var_1586_end_mask_0, x = q_43_cast_fp16)[name = string("op_1586_cast_fp16")];
+            tensor<int32, [4]> var_1593_begin_0 = const()[name = string("op_1593_begin_0"), val = tensor<int32, [4]>([0, 32, 0, 0])];
+            tensor<int32, [4]> var_1593_end_0 = const()[name = string("op_1593_end_0"), val = tensor<int32, [4]>([16, 64, 1, 1])];
+            tensor<bool, [4]> var_1593_end_mask_0 = const()[name = string("op_1593_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [16, 32, 1, 1]> var_1593_cast_fp16 = slice_by_index(begin = var_1593_begin_0, end = var_1593_end_0, end_mask = var_1593_end_mask_0, x = q_43_cast_fp16)[name = string("op_1593_cast_fp16")];
+            fp16 const_64_promoted_to_fp16 = const()[name = string("const_64_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [16, 32, 1, 1]> var_1596_cast_fp16 = mul(x = var_1593_cast_fp16, y = const_64_promoted_to_fp16)[name = string("op_1596_cast_fp16")];
+            bool var_1598_interleave_0 = const()[name = string("op_1598_interleave_0"), val = bool(false)];
+            tensor<fp16, [16, 64, 1, 1]> var_1598_cast_fp16 = concat(axis = var_32, interleave = var_1598_interleave_0, values = (var_1596_cast_fp16, var_1586_cast_fp16))[name = string("op_1598_cast_fp16")];
+            tensor<fp16, [16, 64, 1, 1]> var_1599_cast_fp16 = mul(x = var_1598_cast_fp16, y = sin_1_cast_fp16)[name = string("op_1599_cast_fp16")];
+            tensor<fp16, [16, 64, 1, 1]> q_rotated_11_cast_fp16 = add(x = var_1581_cast_fp16, y = var_1599_cast_fp16)[name = string("q_rotated_11_cast_fp16")];
+            tensor<fp16, [16, 64, 1, 1]> var_1601_cast_fp16 = mul(x = k_23_cast_fp16, y = cos_1_cast_fp16)[name = string("op_1601_cast_fp16")];
+            tensor<int32, [4]> var_1606_begin_0 = const()[name = string("op_1606_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_1606_end_0 = const()[name = string("op_1606_end_0"), val = tensor<int32, [4]>([16, 32, 1, 1])];
+            tensor<bool, [4]> var_1606_end_mask_0 = const()[name = string("op_1606_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [16, 32, 1, 1]> var_1606_cast_fp16 = slice_by_index(begin = var_1606_begin_0, end = var_1606_end_0, end_mask = var_1606_end_mask_0, x = k_23_cast_fp16)[name = string("op_1606_cast_fp16")];
+            tensor<int32, [4]> var_1613_begin_0 = const()[name = string("op_1613_begin_0"), val = tensor<int32, [4]>([0, 32, 0, 0])];
+            tensor<int32, [4]> var_1613_end_0 = const()[name = string("op_1613_end_0"), val = tensor<int32, [4]>([16, 64, 1, 1])];
+            tensor<bool, [4]> var_1613_end_mask_0 = const()[name = string("op_1613_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [16, 32, 1, 1]> var_1613_cast_fp16 = slice_by_index(begin = var_1613_begin_0, end = var_1613_end_0, end_mask = var_1613_end_mask_0, x = k_23_cast_fp16)[name = string("op_1613_cast_fp16")];
+            fp16 const_67_promoted_to_fp16 = const()[name = string("const_67_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [16, 32, 1, 1]> var_1616_cast_fp16 = mul(x = var_1613_cast_fp16, y = const_67_promoted_to_fp16)[name = string("op_1616_cast_fp16")];
+            bool var_1618_interleave_0 = const()[name = string("op_1618_interleave_0"), val = bool(false)];
+            tensor<fp16, [16, 64, 1, 1]> var_1618_cast_fp16 = concat(axis = var_32, interleave = var_1618_interleave_0, values = (var_1616_cast_fp16, var_1606_cast_fp16))[name = string("op_1618_cast_fp16")];
+            tensor<fp16, [16, 64, 1, 1]> var_1619_cast_fp16 = mul(x = var_1618_cast_fp16, y = sin_1_cast_fp16)[name = string("op_1619_cast_fp16")];
+            tensor<fp16, [16, 64, 1, 1]> k_rotated_11_cast_fp16 = add(x = var_1601_cast_fp16, y = var_1619_cast_fp16)[name = string("k_rotated_11_cast_fp16")];
+            tensor<int32, [4]> var_1623 = const()[name = string("op_1623"), val = tensor<int32, [4]>([1, 1024, 1, 1])];
+            tensor<fp16, [1, 1024, 1, 1]> current_key_11_cast_fp16 = reshape(shape = var_1623, x = k_rotated_11_cast_fp16)[name = string("current_key_11_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_1628_cast_fp16 = mul(x = var_522_cast_fp16_5, y = var_647_cast_fp16)[name = string("op_1628_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_1629_cast_fp16 = mul(x = current_key_11_cast_fp16, y = update_mask_1_cast_fp16)[name = string("op_1629_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> key_cache_updated_11_cast_fp16 = add(x = var_1628_cast_fp16, y = var_1629_cast_fp16)[name = string("key_cache_updated_11_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_1632_cast_fp16 = mul(x = var_531_cast_fp16_5, y = var_647_cast_fp16)[name = string("op_1632_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_1633_cast_fp16 = mul(x = v_11_cast_fp16, y = update_mask_1_cast_fp16)[name = string("op_1633_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> value_cache_updated_11_cast_fp16 = add(x = var_1632_cast_fp16, y = var_1633_cast_fp16)[name = string("value_cache_updated_11_cast_fp16")];
+            tensor<int32, [4]> var_1635 = const()[name = string("op_1635"), val = tensor<int32, [4]>([1, 16, 64, 1])];
+            tensor<fp16, [1, 16, 64, 1]> q_47_cast_fp16 = reshape(shape = var_1635, x = q_rotated_11_cast_fp16)[name = string("q_47_cast_fp16")];
+            tensor<int32, [4]> var_1638 = const()[name = string("op_1638"), val = tensor<int32, [4]>([1, 16, 64, 256])];
+            tensor<fp16, [1, 16, 64, 256]> k_for_attn_11_cast_fp16 = reshape(shape = var_1638, x = key_cache_updated_11_cast_fp16)[name = string("k_for_attn_11_cast_fp16")];
+            tensor<int32, [4]> var_1640 = const()[name = string("op_1640"), val = tensor<int32, [4]>([1, 16, 64, 256])];
+            tensor<fp16, [1, 16, 64, 256]> v_for_attn_11_cast_fp16 = reshape(shape = var_1640, x = value_cache_updated_11_cast_fp16)[name = string("v_for_attn_11_cast_fp16")];
+            bool var_1644_transpose_x_1 = const()[name = string("op_1644_transpose_x_1"), val = bool(true)];
+            bool var_1644_transpose_y_1 = const()[name = string("op_1644_transpose_y_1"), val = bool(false)];
+            tensor<fp16, [1, 16, 1, 256]> var_1644_cast_fp16 = matmul(transpose_x = var_1644_transpose_x_1, transpose_y = var_1644_transpose_y_1, x = q_47_cast_fp16, y = k_for_attn_11_cast_fp16)[name = string("op_1644_cast_fp16")];
+            fp16 var_1645_to_fp16 = const()[name = string("op_1645_to_fp16"), val = fp16(0x1p-3)];
+            tensor<fp16, [1, 16, 1, 256]> attn_weights_21_cast_fp16 = mul(x = var_1644_cast_fp16, y = var_1645_to_fp16)[name = string("attn_weights_21_cast_fp16")];
+            tensor<fp16, [1, 16, 1, 256]> input_91_cast_fp16 = add(x = attn_weights_21_cast_fp16, y = attn_mask_1_cast_fp16)[name = string("input_91_cast_fp16")];
+            tensor<fp16, [1, 16, 1, 256]> attn_weights_23_cast_fp16 = softmax(axis = var_28, x = input_91_cast_fp16)[name = string("attn_weights_23_cast_fp16")];
+            bool attn_output_21_transpose_x_1 = const()[name = string("attn_output_21_transpose_x_1"), val = bool(false)];
+            bool attn_output_21_transpose_y_1 = const()[name = string("attn_output_21_transpose_y_1"), val = bool(true)];
+            tensor<fp16, [1, 16, 1, 64]> attn_output_21_cast_fp16 = matmul(transpose_x = attn_output_21_transpose_x_1, transpose_y = attn_output_21_transpose_y_1, x = attn_weights_23_cast_fp16, y = v_for_attn_11_cast_fp16)[name = string("attn_output_21_cast_fp16")];
+            tensor<int32, [4]> var_1654 = const()[name = string("op_1654"), val = tensor<int32, [4]>([0, 1, 3, 2])];
+            tensor<int32, [4]> var_1656 = const()[name = string("op_1656"), val = tensor<int32, [4]>([1, -1, 1, 1])];
+            tensor<fp16, [1, 16, 64, 1]> attn_output_23_cast_fp16 = transpose(perm = var_1654, x = attn_output_21_cast_fp16)[name = string("transpose_6")];
+            tensor<fp16, [1, 1024, 1, 1]> input_93_cast_fp16 = reshape(shape = var_1656, x = attn_output_23_cast_fp16)[name = string("input_93_cast_fp16")];
+            string x_35_pad_type_0 = const()[name = string("x_35_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> x_35_strides_0 = const()[name = string("x_35_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> x_35_pad_0 = const()[name = string("x_35_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> x_35_dilations_0 = const()[name = string("x_35_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 x_35_groups_0 = const()[name = string("x_35_groups_0"), val = int32(1)];
+            tensor<fp16, [512, 1024, 1, 1]> var_1671_weight_0_to_fp16 = const()[name = string("op_1671_weight_0_to_fp16"), val = tensor<fp16, [512, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(61437312)))];
+            tensor<fp16, [512]> var_1671_bias_0_to_fp16 = const()[name = string("op_1671_bias_0_to_fp16"), val = tensor<fp16, [512]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(62485952)))];
+            tensor<fp16, [1, 512, 1, 1]> var_1671_cast_fp16 = conv(bias = var_1671_bias_0_to_fp16, dilations = x_35_dilations_0, groups = x_35_groups_0, pad = x_35_pad_0, pad_type = x_35_pad_type_0, strides = x_35_strides_0, weight = var_1671_weight_0_to_fp16, x = input_93_cast_fp16)[name = string("op_1671_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 1]> inputs_23_cast_fp16 = add(x = inputs_21_cast_fp16, y = var_1671_cast_fp16)[name = string("inputs_23_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 1]> inputs_sq_23_cast_fp16 = mul(x = inputs_23_cast_fp16, y = inputs_23_cast_fp16)[name = string("inputs_sq_23_cast_fp16")];
+            tensor<int32, [1]> variance_23_axes_0 = const()[name = string("variance_23_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_23_keep_dims_0 = const()[name = string("variance_23_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 1, 1, 1]> variance_23_cast_fp16 = reduce_mean(axes = variance_23_axes_0, keep_dims = variance_23_keep_dims_0, x = inputs_sq_23_cast_fp16)[name = string("variance_23_cast_fp16")];
+            fp16 var_1677_to_fp16 = const()[name = string("op_1677_to_fp16"), val = fp16(0x1.5p-17)];
+            tensor<fp16, [1, 1, 1, 1]> var_1678_cast_fp16 = add(x = variance_23_cast_fp16, y = var_1677_to_fp16)[name = string("op_1678_cast_fp16")];
+            fp32 var_1679_epsilon_0 = const()[name = string("op_1679_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 1, 1, 1]> var_1679_cast_fp16 = rsqrt(epsilon = var_1679_epsilon_0, x = var_1678_cast_fp16)[name = string("op_1679_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 1]> hidden_states_23_cast_fp16 = mul(x = inputs_23_cast_fp16, y = var_1679_cast_fp16)[name = string("hidden_states_23_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 1]> w_23_to_fp16 = const()[name = string("w_23_to_fp16"), val = tensor<fp16, [1, 512, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(62487040)))];
+            tensor<fp16, [1, 512, 1, 1]> input_95_cast_fp16 = mul(x = w_23_to_fp16, y = hidden_states_23_cast_fp16)[name = string("input_95_cast_fp16")];
+            string input_97_pad_type_0 = const()[name = string("input_97_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> input_97_strides_0 = const()[name = string("input_97_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> input_97_pad_0 = const()[name = string("input_97_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> input_97_dilations_0 = const()[name = string("input_97_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 input_97_groups_0 = const()[name = string("input_97_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 512, 1, 1]> decoder_pre_transformer_layers_5_mlp_gate_proj_weight_to_fp16 = const()[name = string("decoder_pre_transformer_layers_5_mlp_gate_proj_weight_to_fp16"), val = tensor<fp16, [1024, 512, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(62488128)))];
+            tensor<fp16, [1, 1024, 1, 1]> input_97_cast_fp16 = conv(dilations = input_97_dilations_0, groups = input_97_groups_0, pad = input_97_pad_0, pad_type = input_97_pad_type_0, strides = input_97_strides_0, weight = decoder_pre_transformer_layers_5_mlp_gate_proj_weight_to_fp16, x = input_95_cast_fp16)[name = string("input_97_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> var_1693_cast_fp16 = silu(x = input_97_cast_fp16)[name = string("op_1693_cast_fp16")];
+            string var_1699_pad_type_0 = const()[name = string("op_1699_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_1699_strides_0 = const()[name = string("op_1699_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_1699_pad_0 = const()[name = string("op_1699_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_1699_dilations_0 = const()[name = string("op_1699_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_1699_groups_0 = const()[name = string("op_1699_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 512, 1, 1]> decoder_pre_transformer_layers_5_mlp_up_proj_weight_to_fp16 = const()[name = string("decoder_pre_transformer_layers_5_mlp_up_proj_weight_to_fp16"), val = tensor<fp16, [1024, 512, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(63536768)))];
+            tensor<fp16, [1, 1024, 1, 1]> var_1699_cast_fp16 = conv(dilations = var_1699_dilations_0, groups = var_1699_groups_0, pad = var_1699_pad_0, pad_type = var_1699_pad_type_0, strides = var_1699_strides_0, weight = decoder_pre_transformer_layers_5_mlp_up_proj_weight_to_fp16, x = input_95_cast_fp16)[name = string("op_1699_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> input_99_cast_fp16 = mul(x = var_1693_cast_fp16, y = var_1699_cast_fp16)[name = string("input_99_cast_fp16")];
+            string x_37_pad_type_0 = const()[name = string("x_37_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> x_37_strides_0 = const()[name = string("x_37_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> x_37_pad_0 = const()[name = string("x_37_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> x_37_dilations_0 = const()[name = string("x_37_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 x_37_groups_0 = const()[name = string("x_37_groups_0"), val = int32(1)];
+            tensor<fp16, [512, 1024, 1, 1]> var_1710_weight_0_to_fp16 = const()[name = string("op_1710_weight_0_to_fp16"), val = tensor<fp16, [512, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(64585408)))];
+            tensor<fp16, [512]> var_1710_bias_0_to_fp16 = const()[name = string("op_1710_bias_0_to_fp16"), val = tensor<fp16, [512]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(65634048)))];
+            tensor<fp16, [1, 512, 1, 1]> var_1710_cast_fp16 = conv(bias = var_1710_bias_0_to_fp16, dilations = x_37_dilations_0, groups = x_37_groups_0, pad = x_37_pad_0, pad_type = x_37_pad_type_0, strides = x_37_strides_0, weight = var_1710_weight_0_to_fp16, x = input_99_cast_fp16)[name = string("op_1710_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 1]> inputs_25_cast_fp16 = add(x = inputs_23_cast_fp16, y = var_1710_cast_fp16)[name = string("inputs_25_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 1]> inputs_sq_25_cast_fp16 = mul(x = inputs_25_cast_fp16, y = inputs_25_cast_fp16)[name = string("inputs_sq_25_cast_fp16")];
+            tensor<int32, [1]> variance_25_axes_0 = const()[name = string("variance_25_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_25_keep_dims_0 = const()[name = string("variance_25_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 1, 1, 1]> variance_25_cast_fp16 = reduce_mean(axes = variance_25_axes_0, keep_dims = variance_25_keep_dims_0, x = inputs_sq_25_cast_fp16)[name = string("variance_25_cast_fp16")];
+            fp16 var_1726_to_fp16 = const()[name = string("op_1726_to_fp16"), val = fp16(0x1.5p-17)];
+            tensor<fp16, [1, 1, 1, 1]> var_1727_cast_fp16 = add(x = variance_25_cast_fp16, y = var_1726_to_fp16)[name = string("op_1727_cast_fp16")];
+            fp32 var_1728_epsilon_0 = const()[name = string("op_1728_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 1, 1, 1]> var_1728_cast_fp16 = rsqrt(epsilon = var_1728_epsilon_0, x = var_1727_cast_fp16)[name = string("op_1728_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 1]> hidden_states_25_cast_fp16 = mul(x = inputs_25_cast_fp16, y = var_1728_cast_fp16)[name = string("hidden_states_25_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 1]> w_25_to_fp16 = const()[name = string("w_25_to_fp16"), val = tensor<fp16, [1, 512, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(65635136)))];
+            tensor<fp16, [1, 512, 1, 1]> x_39_cast_fp16 = mul(x = w_25_to_fp16, y = hidden_states_25_cast_fp16)[name = string("x_39_cast_fp16")];
+            string q_49_pad_type_0 = const()[name = string("q_49_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> q_49_strides_0 = const()[name = string("q_49_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> q_49_pad_0 = const()[name = string("q_49_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> q_49_dilations_0 = const()[name = string("q_49_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 q_49_groups_0 = const()[name = string("q_49_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 512, 1, 1]> decoder_pre_transformer_layers_6_self_attn_q_proj_weight_to_fp16 = const()[name = string("decoder_pre_transformer_layers_6_self_attn_q_proj_weight_to_fp16"), val = tensor<fp16, [1024, 512, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(65636224)))];
+            tensor<fp16, [1, 1024, 1, 1]> q_49_cast_fp16 = conv(dilations = q_49_dilations_0, groups = q_49_groups_0, pad = q_49_pad_0, pad_type = q_49_pad_type_0, strides = q_49_strides_0, weight = decoder_pre_transformer_layers_6_self_attn_q_proj_weight_to_fp16, x = x_39_cast_fp16)[name = string("q_49_cast_fp16")];
+            string k_25_pad_type_0 = const()[name = string("k_25_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> k_25_strides_0 = const()[name = string("k_25_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> k_25_pad_0 = const()[name = string("k_25_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> k_25_dilations_0 = const()[name = string("k_25_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 k_25_groups_0 = const()[name = string("k_25_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 512, 1, 1]> decoder_pre_transformer_layers_6_self_attn_k_proj_weight_to_fp16 = const()[name = string("decoder_pre_transformer_layers_6_self_attn_k_proj_weight_to_fp16"), val = tensor<fp16, [1024, 512, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(66684864)))];
+            tensor<fp16, [1, 1024, 1, 1]> k_25_cast_fp16 = conv(dilations = k_25_dilations_0, groups = k_25_groups_0, pad = k_25_pad_0, pad_type = k_25_pad_type_0, strides = k_25_strides_0, weight = decoder_pre_transformer_layers_6_self_attn_k_proj_weight_to_fp16, x = x_39_cast_fp16)[name = string("k_25_cast_fp16")];
+            string v_13_pad_type_0 = const()[name = string("v_13_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> v_13_strides_0 = const()[name = string("v_13_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> v_13_pad_0 = const()[name = string("v_13_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> v_13_dilations_0 = const()[name = string("v_13_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 v_13_groups_0 = const()[name = string("v_13_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 512, 1, 1]> decoder_pre_transformer_layers_6_self_attn_v_proj_weight_to_fp16 = const()[name = string("decoder_pre_transformer_layers_6_self_attn_v_proj_weight_to_fp16"), val = tensor<fp16, [1024, 512, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(67733504)))];
+            tensor<fp16, [1, 1024, 1, 1]> v_13_cast_fp16 = conv(dilations = v_13_dilations_0, groups = v_13_groups_0, pad = v_13_pad_0, pad_type = v_13_pad_type_0, strides = v_13_strides_0, weight = decoder_pre_transformer_layers_6_self_attn_v_proj_weight_to_fp16, x = x_39_cast_fp16)[name = string("v_13_cast_fp16")];
+            tensor<int32, [4]> var_1760 = const()[name = string("op_1760"), val = tensor<int32, [4]>([16, 64, 1, 1])];
+            tensor<fp16, [16, 64, 1, 1]> q_51_cast_fp16 = reshape(shape = var_1760, x = q_49_cast_fp16)[name = string("q_51_cast_fp16")];
+            tensor<int32, [4]> var_1765 = const()[name = string("op_1765"), val = tensor<int32, [4]>([16, 64, 1, 1])];
+            tensor<fp16, [16, 64, 1, 1]> k_27_cast_fp16 = reshape(shape = var_1765, x = k_25_cast_fp16)[name = string("k_27_cast_fp16")];
+            tensor<fp16, [16, 64, 1, 1]> var_1777_cast_fp16 = mul(x = q_51_cast_fp16, y = cos_1_cast_fp16)[name = string("op_1777_cast_fp16")];
+            tensor<int32, [4]> var_1782_begin_0 = const()[name = string("op_1782_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_1782_end_0 = const()[name = string("op_1782_end_0"), val = tensor<int32, [4]>([16, 32, 1, 1])];
+            tensor<bool, [4]> var_1782_end_mask_0 = const()[name = string("op_1782_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [16, 32, 1, 1]> var_1782_cast_fp16 = slice_by_index(begin = var_1782_begin_0, end = var_1782_end_0, end_mask = var_1782_end_mask_0, x = q_51_cast_fp16)[name = string("op_1782_cast_fp16")];
+            tensor<int32, [4]> var_1789_begin_0 = const()[name = string("op_1789_begin_0"), val = tensor<int32, [4]>([0, 32, 0, 0])];
+            tensor<int32, [4]> var_1789_end_0 = const()[name = string("op_1789_end_0"), val = tensor<int32, [4]>([16, 64, 1, 1])];
+            tensor<bool, [4]> var_1789_end_mask_0 = const()[name = string("op_1789_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [16, 32, 1, 1]> var_1789_cast_fp16 = slice_by_index(begin = var_1789_begin_0, end = var_1789_end_0, end_mask = var_1789_end_mask_0, x = q_51_cast_fp16)[name = string("op_1789_cast_fp16")];
+            fp16 const_72_promoted_to_fp16 = const()[name = string("const_72_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [16, 32, 1, 1]> var_1792_cast_fp16 = mul(x = var_1789_cast_fp16, y = const_72_promoted_to_fp16)[name = string("op_1792_cast_fp16")];
+            bool var_1794_interleave_0 = const()[name = string("op_1794_interleave_0"), val = bool(false)];
+            tensor<fp16, [16, 64, 1, 1]> var_1794_cast_fp16 = concat(axis = var_32, interleave = var_1794_interleave_0, values = (var_1792_cast_fp16, var_1782_cast_fp16))[name = string("op_1794_cast_fp16")];
+            tensor<fp16, [16, 64, 1, 1]> var_1795_cast_fp16 = mul(x = var_1794_cast_fp16, y = sin_1_cast_fp16)[name = string("op_1795_cast_fp16")];
+            tensor<fp16, [16, 64, 1, 1]> q_rotated_13_cast_fp16 = add(x = var_1777_cast_fp16, y = var_1795_cast_fp16)[name = string("q_rotated_13_cast_fp16")];
+            tensor<fp16, [16, 64, 1, 1]> var_1797_cast_fp16 = mul(x = k_27_cast_fp16, y = cos_1_cast_fp16)[name = string("op_1797_cast_fp16")];
+            tensor<int32, [4]> var_1802_begin_0 = const()[name = string("op_1802_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_1802_end_0 = const()[name = string("op_1802_end_0"), val = tensor<int32, [4]>([16, 32, 1, 1])];
+            tensor<bool, [4]> var_1802_end_mask_0 = const()[name = string("op_1802_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [16, 32, 1, 1]> var_1802_cast_fp16 = slice_by_index(begin = var_1802_begin_0, end = var_1802_end_0, end_mask = var_1802_end_mask_0, x = k_27_cast_fp16)[name = string("op_1802_cast_fp16")];
+            tensor<int32, [4]> var_1809_begin_0 = const()[name = string("op_1809_begin_0"), val = tensor<int32, [4]>([0, 32, 0, 0])];
+            tensor<int32, [4]> var_1809_end_0 = const()[name = string("op_1809_end_0"), val = tensor<int32, [4]>([16, 64, 1, 1])];
+            tensor<bool, [4]> var_1809_end_mask_0 = const()[name = string("op_1809_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [16, 32, 1, 1]> var_1809_cast_fp16 = slice_by_index(begin = var_1809_begin_0, end = var_1809_end_0, end_mask = var_1809_end_mask_0, x = k_27_cast_fp16)[name = string("op_1809_cast_fp16")];
+            fp16 const_75_promoted_to_fp16 = const()[name = string("const_75_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [16, 32, 1, 1]> var_1812_cast_fp16 = mul(x = var_1809_cast_fp16, y = const_75_promoted_to_fp16)[name = string("op_1812_cast_fp16")];
+            bool var_1814_interleave_0 = const()[name = string("op_1814_interleave_0"), val = bool(false)];
+            tensor<fp16, [16, 64, 1, 1]> var_1814_cast_fp16 = concat(axis = var_32, interleave = var_1814_interleave_0, values = (var_1812_cast_fp16, var_1802_cast_fp16))[name = string("op_1814_cast_fp16")];
+            tensor<fp16, [16, 64, 1, 1]> var_1815_cast_fp16 = mul(x = var_1814_cast_fp16, y = sin_1_cast_fp16)[name = string("op_1815_cast_fp16")];
+            tensor<fp16, [16, 64, 1, 1]> k_rotated_13_cast_fp16 = add(x = var_1797_cast_fp16, y = var_1815_cast_fp16)[name = string("k_rotated_13_cast_fp16")];
+            tensor<int32, [4]> var_1819 = const()[name = string("op_1819"), val = tensor<int32, [4]>([1, 1024, 1, 1])];
+            tensor<fp16, [1, 1024, 1, 1]> current_key_13_cast_fp16 = reshape(shape = var_1819, x = k_rotated_13_cast_fp16)[name = string("current_key_13_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_1824_cast_fp16 = mul(x = var_522_cast_fp16_6, y = var_647_cast_fp16)[name = string("op_1824_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_1825_cast_fp16 = mul(x = current_key_13_cast_fp16, y = update_mask_1_cast_fp16)[name = string("op_1825_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> key_cache_updated_13_cast_fp16 = add(x = var_1824_cast_fp16, y = var_1825_cast_fp16)[name = string("key_cache_updated_13_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_1828_cast_fp16 = mul(x = var_531_cast_fp16_6, y = var_647_cast_fp16)[name = string("op_1828_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_1829_cast_fp16 = mul(x = v_13_cast_fp16, y = update_mask_1_cast_fp16)[name = string("op_1829_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> value_cache_updated_13_cast_fp16 = add(x = var_1828_cast_fp16, y = var_1829_cast_fp16)[name = string("value_cache_updated_13_cast_fp16")];
+            tensor<int32, [4]> var_1831 = const()[name = string("op_1831"), val = tensor<int32, [4]>([1, 16, 64, 1])];
+            tensor<fp16, [1, 16, 64, 1]> q_55_cast_fp16 = reshape(shape = var_1831, x = q_rotated_13_cast_fp16)[name = string("q_55_cast_fp16")];
+            tensor<int32, [4]> var_1834 = const()[name = string("op_1834"), val = tensor<int32, [4]>([1, 16, 64, 256])];
+            tensor<fp16, [1, 16, 64, 256]> k_for_attn_13_cast_fp16 = reshape(shape = var_1834, x = key_cache_updated_13_cast_fp16)[name = string("k_for_attn_13_cast_fp16")];
+            tensor<int32, [4]> var_1836 = const()[name = string("op_1836"), val = tensor<int32, [4]>([1, 16, 64, 256])];
+            tensor<fp16, [1, 16, 64, 256]> v_for_attn_13_cast_fp16 = reshape(shape = var_1836, x = value_cache_updated_13_cast_fp16)[name = string("v_for_attn_13_cast_fp16")];
+            bool var_1840_transpose_x_1 = const()[name = string("op_1840_transpose_x_1"), val = bool(true)];
+            bool var_1840_transpose_y_1 = const()[name = string("op_1840_transpose_y_1"), val = bool(false)];
+            tensor<fp16, [1, 16, 1, 256]> var_1840_cast_fp16 = matmul(transpose_x = var_1840_transpose_x_1, transpose_y = var_1840_transpose_y_1, x = q_55_cast_fp16, y = k_for_attn_13_cast_fp16)[name = string("op_1840_cast_fp16")];
+            fp16 var_1841_to_fp16 = const()[name = string("op_1841_to_fp16"), val = fp16(0x1p-3)];
+            tensor<fp16, [1, 16, 1, 256]> attn_weights_25_cast_fp16 = mul(x = var_1840_cast_fp16, y = var_1841_to_fp16)[name = string("attn_weights_25_cast_fp16")];
+            tensor<fp16, [1, 16, 1, 256]> input_101_cast_fp16 = add(x = attn_weights_25_cast_fp16, y = attn_mask_1_cast_fp16)[name = string("input_101_cast_fp16")];
+            tensor<fp16, [1, 16, 1, 256]> attn_weights_27_cast_fp16 = softmax(axis = var_28, x = input_101_cast_fp16)[name = string("attn_weights_27_cast_fp16")];
+            bool attn_output_25_transpose_x_1 = const()[name = string("attn_output_25_transpose_x_1"), val = bool(false)];
+            bool attn_output_25_transpose_y_1 = const()[name = string("attn_output_25_transpose_y_1"), val = bool(true)];
+            tensor<fp16, [1, 16, 1, 64]> attn_output_25_cast_fp16 = matmul(transpose_x = attn_output_25_transpose_x_1, transpose_y = attn_output_25_transpose_y_1, x = attn_weights_27_cast_fp16, y = v_for_attn_13_cast_fp16)[name = string("attn_output_25_cast_fp16")];
+            tensor<int32, [4]> var_1850 = const()[name = string("op_1850"), val = tensor<int32, [4]>([0, 1, 3, 2])];
+            tensor<int32, [4]> var_1852 = const()[name = string("op_1852"), val = tensor<int32, [4]>([1, -1, 1, 1])];
+            tensor<fp16, [1, 16, 64, 1]> attn_output_27_cast_fp16 = transpose(perm = var_1850, x = attn_output_25_cast_fp16)[name = string("transpose_5")];
+            tensor<fp16, [1, 1024, 1, 1]> input_103_cast_fp16 = reshape(shape = var_1852, x = attn_output_27_cast_fp16)[name = string("input_103_cast_fp16")];
+            string x_41_pad_type_0 = const()[name = string("x_41_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> x_41_strides_0 = const()[name = string("x_41_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> x_41_pad_0 = const()[name = string("x_41_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> x_41_dilations_0 = const()[name = string("x_41_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 x_41_groups_0 = const()[name = string("x_41_groups_0"), val = int32(1)];
+            tensor<fp16, [512, 1024, 1, 1]> var_1867_weight_0_to_fp16 = const()[name = string("op_1867_weight_0_to_fp16"), val = tensor<fp16, [512, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(68782144)))];
+            tensor<fp16, [512]> var_1867_bias_0_to_fp16 = const()[name = string("op_1867_bias_0_to_fp16"), val = tensor<fp16, [512]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(69830784)))];
+            tensor<fp16, [1, 512, 1, 1]> var_1867_cast_fp16 = conv(bias = var_1867_bias_0_to_fp16, dilations = x_41_dilations_0, groups = x_41_groups_0, pad = x_41_pad_0, pad_type = x_41_pad_type_0, strides = x_41_strides_0, weight = var_1867_weight_0_to_fp16, x = input_103_cast_fp16)[name = string("op_1867_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 1]> inputs_27_cast_fp16 = add(x = inputs_25_cast_fp16, y = var_1867_cast_fp16)[name = string("inputs_27_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 1]> inputs_sq_27_cast_fp16 = mul(x = inputs_27_cast_fp16, y = inputs_27_cast_fp16)[name = string("inputs_sq_27_cast_fp16")];
+            tensor<int32, [1]> variance_27_axes_0 = const()[name = string("variance_27_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_27_keep_dims_0 = const()[name = string("variance_27_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 1, 1, 1]> variance_27_cast_fp16 = reduce_mean(axes = variance_27_axes_0, keep_dims = variance_27_keep_dims_0, x = inputs_sq_27_cast_fp16)[name = string("variance_27_cast_fp16")];
+            fp16 var_1873_to_fp16 = const()[name = string("op_1873_to_fp16"), val = fp16(0x1.5p-17)];
+            tensor<fp16, [1, 1, 1, 1]> var_1874_cast_fp16 = add(x = variance_27_cast_fp16, y = var_1873_to_fp16)[name = string("op_1874_cast_fp16")];
+            fp32 var_1875_epsilon_0 = const()[name = string("op_1875_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 1, 1, 1]> var_1875_cast_fp16 = rsqrt(epsilon = var_1875_epsilon_0, x = var_1874_cast_fp16)[name = string("op_1875_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 1]> hidden_states_27_cast_fp16 = mul(x = inputs_27_cast_fp16, y = var_1875_cast_fp16)[name = string("hidden_states_27_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 1]> w_27_to_fp16 = const()[name = string("w_27_to_fp16"), val = tensor<fp16, [1, 512, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(69831872)))];
+            tensor<fp16, [1, 512, 1, 1]> input_105_cast_fp16 = mul(x = w_27_to_fp16, y = hidden_states_27_cast_fp16)[name = string("input_105_cast_fp16")];
+            string input_107_pad_type_0 = const()[name = string("input_107_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> input_107_strides_0 = const()[name = string("input_107_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> input_107_pad_0 = const()[name = string("input_107_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> input_107_dilations_0 = const()[name = string("input_107_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 input_107_groups_0 = const()[name = string("input_107_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 512, 1, 1]> decoder_pre_transformer_layers_6_mlp_gate_proj_weight_to_fp16 = const()[name = string("decoder_pre_transformer_layers_6_mlp_gate_proj_weight_to_fp16"), val = tensor<fp16, [1024, 512, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(69832960)))];
+            tensor<fp16, [1, 1024, 1, 1]> input_107_cast_fp16 = conv(dilations = input_107_dilations_0, groups = input_107_groups_0, pad = input_107_pad_0, pad_type = input_107_pad_type_0, strides = input_107_strides_0, weight = decoder_pre_transformer_layers_6_mlp_gate_proj_weight_to_fp16, x = input_105_cast_fp16)[name = string("input_107_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> var_1889_cast_fp16 = silu(x = input_107_cast_fp16)[name = string("op_1889_cast_fp16")];
+            string var_1895_pad_type_0 = const()[name = string("op_1895_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_1895_strides_0 = const()[name = string("op_1895_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_1895_pad_0 = const()[name = string("op_1895_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_1895_dilations_0 = const()[name = string("op_1895_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_1895_groups_0 = const()[name = string("op_1895_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 512, 1, 1]> decoder_pre_transformer_layers_6_mlp_up_proj_weight_to_fp16 = const()[name = string("decoder_pre_transformer_layers_6_mlp_up_proj_weight_to_fp16"), val = tensor<fp16, [1024, 512, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(70881600)))];
+            tensor<fp16, [1, 1024, 1, 1]> var_1895_cast_fp16 = conv(dilations = var_1895_dilations_0, groups = var_1895_groups_0, pad = var_1895_pad_0, pad_type = var_1895_pad_type_0, strides = var_1895_strides_0, weight = decoder_pre_transformer_layers_6_mlp_up_proj_weight_to_fp16, x = input_105_cast_fp16)[name = string("op_1895_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> input_109_cast_fp16 = mul(x = var_1889_cast_fp16, y = var_1895_cast_fp16)[name = string("input_109_cast_fp16")];
+            string x_43_pad_type_0 = const()[name = string("x_43_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> x_43_strides_0 = const()[name = string("x_43_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> x_43_pad_0 = const()[name = string("x_43_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> x_43_dilations_0 = const()[name = string("x_43_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 x_43_groups_0 = const()[name = string("x_43_groups_0"), val = int32(1)];
+            tensor<fp16, [512, 1024, 1, 1]> var_1906_weight_0_to_fp16 = const()[name = string("op_1906_weight_0_to_fp16"), val = tensor<fp16, [512, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(71930240)))];
+            tensor<fp16, [512]> var_1906_bias_0_to_fp16 = const()[name = string("op_1906_bias_0_to_fp16"), val = tensor<fp16, [512]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(72978880)))];
+            tensor<fp16, [1, 512, 1, 1]> var_1906_cast_fp16 = conv(bias = var_1906_bias_0_to_fp16, dilations = x_43_dilations_0, groups = x_43_groups_0, pad = x_43_pad_0, pad_type = x_43_pad_type_0, strides = x_43_strides_0, weight = var_1906_weight_0_to_fp16, x = input_109_cast_fp16)[name = string("op_1906_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 1]> inputs_29_cast_fp16 = add(x = inputs_27_cast_fp16, y = var_1906_cast_fp16)[name = string("inputs_29_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 1]> inputs_sq_29_cast_fp16 = mul(x = inputs_29_cast_fp16, y = inputs_29_cast_fp16)[name = string("inputs_sq_29_cast_fp16")];
+            tensor<int32, [1]> variance_29_axes_0 = const()[name = string("variance_29_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_29_keep_dims_0 = const()[name = string("variance_29_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 1, 1, 1]> variance_29_cast_fp16 = reduce_mean(axes = variance_29_axes_0, keep_dims = variance_29_keep_dims_0, x = inputs_sq_29_cast_fp16)[name = string("variance_29_cast_fp16")];
+            fp16 var_1922_to_fp16 = const()[name = string("op_1922_to_fp16"), val = fp16(0x1.5p-17)];
+            tensor<fp16, [1, 1, 1, 1]> var_1923_cast_fp16 = add(x = variance_29_cast_fp16, y = var_1922_to_fp16)[name = string("op_1923_cast_fp16")];
+            fp32 var_1924_epsilon_0 = const()[name = string("op_1924_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 1, 1, 1]> var_1924_cast_fp16 = rsqrt(epsilon = var_1924_epsilon_0, x = var_1923_cast_fp16)[name = string("op_1924_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 1]> hidden_states_29_cast_fp16 = mul(x = inputs_29_cast_fp16, y = var_1924_cast_fp16)[name = string("hidden_states_29_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 1]> w_29_to_fp16 = const()[name = string("w_29_to_fp16"), val = tensor<fp16, [1, 512, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(72979968)))];
+            tensor<fp16, [1, 512, 1, 1]> x_45_cast_fp16 = mul(x = w_29_to_fp16, y = hidden_states_29_cast_fp16)[name = string("x_45_cast_fp16")];
+            string q_57_pad_type_0 = const()[name = string("q_57_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> q_57_strides_0 = const()[name = string("q_57_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> q_57_pad_0 = const()[name = string("q_57_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> q_57_dilations_0 = const()[name = string("q_57_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 q_57_groups_0 = const()[name = string("q_57_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 512, 1, 1]> decoder_pre_transformer_layers_7_self_attn_q_proj_weight_to_fp16 = const()[name = string("decoder_pre_transformer_layers_7_self_attn_q_proj_weight_to_fp16"), val = tensor<fp16, [1024, 512, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(72981056)))];
+            tensor<fp16, [1, 1024, 1, 1]> q_57_cast_fp16 = conv(dilations = q_57_dilations_0, groups = q_57_groups_0, pad = q_57_pad_0, pad_type = q_57_pad_type_0, strides = q_57_strides_0, weight = decoder_pre_transformer_layers_7_self_attn_q_proj_weight_to_fp16, x = x_45_cast_fp16)[name = string("q_57_cast_fp16")];
+            string k_29_pad_type_0 = const()[name = string("k_29_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> k_29_strides_0 = const()[name = string("k_29_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> k_29_pad_0 = const()[name = string("k_29_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> k_29_dilations_0 = const()[name = string("k_29_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 k_29_groups_0 = const()[name = string("k_29_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 512, 1, 1]> decoder_pre_transformer_layers_7_self_attn_k_proj_weight_to_fp16 = const()[name = string("decoder_pre_transformer_layers_7_self_attn_k_proj_weight_to_fp16"), val = tensor<fp16, [1024, 512, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(74029696)))];
+            tensor<fp16, [1, 1024, 1, 1]> k_29_cast_fp16 = conv(dilations = k_29_dilations_0, groups = k_29_groups_0, pad = k_29_pad_0, pad_type = k_29_pad_type_0, strides = k_29_strides_0, weight = decoder_pre_transformer_layers_7_self_attn_k_proj_weight_to_fp16, x = x_45_cast_fp16)[name = string("k_29_cast_fp16")];
+            string v_pad_type_0 = const()[name = string("v_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> v_strides_0 = const()[name = string("v_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> v_pad_0 = const()[name = string("v_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> v_dilations_0 = const()[name = string("v_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 v_groups_0 = const()[name = string("v_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 512, 1, 1]> decoder_pre_transformer_layers_7_self_attn_v_proj_weight_to_fp16 = const()[name = string("decoder_pre_transformer_layers_7_self_attn_v_proj_weight_to_fp16"), val = tensor<fp16, [1024, 512, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(75078336)))];
+            tensor<fp16, [1, 1024, 1, 1]> v_cast_fp16 = conv(dilations = v_dilations_0, groups = v_groups_0, pad = v_pad_0, pad_type = v_pad_type_0, strides = v_strides_0, weight = decoder_pre_transformer_layers_7_self_attn_v_proj_weight_to_fp16, x = x_45_cast_fp16)[name = string("v_cast_fp16")];
+            tensor<int32, [4]> var_1956 = const()[name = string("op_1956"), val = tensor<int32, [4]>([16, 64, 1, 1])];
+            tensor<fp16, [16, 64, 1, 1]> q_59_cast_fp16 = reshape(shape = var_1956, x = q_57_cast_fp16)[name = string("q_59_cast_fp16")];
+            tensor<int32, [4]> var_1961 = const()[name = string("op_1961"), val = tensor<int32, [4]>([16, 64, 1, 1])];
+            tensor<fp16, [16, 64, 1, 1]> k_cast_fp16 = reshape(shape = var_1961, x = k_29_cast_fp16)[name = string("k_cast_fp16")];
+            tensor<fp16, [16, 64, 1, 1]> var_1973_cast_fp16 = mul(x = q_59_cast_fp16, y = cos_1_cast_fp16)[name = string("op_1973_cast_fp16")];
+            tensor<int32, [4]> var_1978_begin_0 = const()[name = string("op_1978_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_1978_end_0 = const()[name = string("op_1978_end_0"), val = tensor<int32, [4]>([16, 32, 1, 1])];
+            tensor<bool, [4]> var_1978_end_mask_0 = const()[name = string("op_1978_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [16, 32, 1, 1]> var_1978_cast_fp16 = slice_by_index(begin = var_1978_begin_0, end = var_1978_end_0, end_mask = var_1978_end_mask_0, x = q_59_cast_fp16)[name = string("op_1978_cast_fp16")];
+            tensor<int32, [4]> var_1985_begin_0 = const()[name = string("op_1985_begin_0"), val = tensor<int32, [4]>([0, 32, 0, 0])];
+            tensor<int32, [4]> var_1985_end_0 = const()[name = string("op_1985_end_0"), val = tensor<int32, [4]>([16, 64, 1, 1])];
+            tensor<bool, [4]> var_1985_end_mask_0 = const()[name = string("op_1985_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [16, 32, 1, 1]> var_1985_cast_fp16 = slice_by_index(begin = var_1985_begin_0, end = var_1985_end_0, end_mask = var_1985_end_mask_0, x = q_59_cast_fp16)[name = string("op_1985_cast_fp16")];
+            fp16 const_80_promoted_to_fp16 = const()[name = string("const_80_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [16, 32, 1, 1]> var_1988_cast_fp16 = mul(x = var_1985_cast_fp16, y = const_80_promoted_to_fp16)[name = string("op_1988_cast_fp16")];
+            bool var_1990_interleave_0 = const()[name = string("op_1990_interleave_0"), val = bool(false)];
+            tensor<fp16, [16, 64, 1, 1]> var_1990_cast_fp16 = concat(axis = var_32, interleave = var_1990_interleave_0, values = (var_1988_cast_fp16, var_1978_cast_fp16))[name = string("op_1990_cast_fp16")];
+            tensor<fp16, [16, 64, 1, 1]> var_1991_cast_fp16 = mul(x = var_1990_cast_fp16, y = sin_1_cast_fp16)[name = string("op_1991_cast_fp16")];
+            tensor<fp16, [16, 64, 1, 1]> q_rotated_cast_fp16 = add(x = var_1973_cast_fp16, y = var_1991_cast_fp16)[name = string("q_rotated_cast_fp16")];
+            tensor<fp16, [16, 64, 1, 1]> var_1993_cast_fp16 = mul(x = k_cast_fp16, y = cos_1_cast_fp16)[name = string("op_1993_cast_fp16")];
+            tensor<int32, [4]> var_1998_begin_0 = const()[name = string("op_1998_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_1998_end_0 = const()[name = string("op_1998_end_0"), val = tensor<int32, [4]>([16, 32, 1, 1])];
+            tensor<bool, [4]> var_1998_end_mask_0 = const()[name = string("op_1998_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [16, 32, 1, 1]> var_1998_cast_fp16 = slice_by_index(begin = var_1998_begin_0, end = var_1998_end_0, end_mask = var_1998_end_mask_0, x = k_cast_fp16)[name = string("op_1998_cast_fp16")];
+            tensor<int32, [4]> var_2005_begin_0 = const()[name = string("op_2005_begin_0"), val = tensor<int32, [4]>([0, 32, 0, 0])];
+            tensor<int32, [4]> var_2005_end_0 = const()[name = string("op_2005_end_0"), val = tensor<int32, [4]>([16, 64, 1, 1])];
+            tensor<bool, [4]> var_2005_end_mask_0 = const()[name = string("op_2005_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [16, 32, 1, 1]> var_2005_cast_fp16 = slice_by_index(begin = var_2005_begin_0, end = var_2005_end_0, end_mask = var_2005_end_mask_0, x = k_cast_fp16)[name = string("op_2005_cast_fp16")];
+            fp16 const_83_promoted_to_fp16 = const()[name = string("const_83_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [16, 32, 1, 1]> var_2008_cast_fp16 = mul(x = var_2005_cast_fp16, y = const_83_promoted_to_fp16)[name = string("op_2008_cast_fp16")];
+            bool var_2010_interleave_0 = const()[name = string("op_2010_interleave_0"), val = bool(false)];
+            tensor<fp16, [16, 64, 1, 1]> var_2010_cast_fp16 = concat(axis = var_32, interleave = var_2010_interleave_0, values = (var_2008_cast_fp16, var_1998_cast_fp16))[name = string("op_2010_cast_fp16")];
+            tensor<fp16, [16, 64, 1, 1]> var_2011_cast_fp16 = mul(x = var_2010_cast_fp16, y = sin_1_cast_fp16)[name = string("op_2011_cast_fp16")];
+            tensor<fp16, [16, 64, 1, 1]> k_rotated_cast_fp16 = add(x = var_1993_cast_fp16, y = var_2011_cast_fp16)[name = string("k_rotated_cast_fp16")];
+            tensor<int32, [4]> var_2015 = const()[name = string("op_2015"), val = tensor<int32, [4]>([1, 1024, 1, 1])];
+            tensor<fp16, [1, 1024, 1, 1]> current_key_cast_fp16 = reshape(shape = var_2015, x = k_rotated_cast_fp16)[name = string("current_key_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_2020_cast_fp16 = mul(x = var_522_cast_fp16_7, y = var_647_cast_fp16)[name = string("op_2020_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_2021_cast_fp16 = mul(x = current_key_cast_fp16, y = update_mask_1_cast_fp16)[name = string("op_2021_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> key_cache_updated_cast_fp16 = add(x = var_2020_cast_fp16, y = var_2021_cast_fp16)[name = string("key_cache_updated_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_2024_cast_fp16 = mul(x = var_531_cast_fp16_7, y = var_647_cast_fp16)[name = string("op_2024_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> var_2025_cast_fp16 = mul(x = v_cast_fp16, y = update_mask_1_cast_fp16)[name = string("op_2025_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 256]> value_cache_updated_cast_fp16 = add(x = var_2024_cast_fp16, y = var_2025_cast_fp16)[name = string("value_cache_updated_cast_fp16")];
+            tensor<int32, [4]> var_2027 = const()[name = string("op_2027"), val = tensor<int32, [4]>([1, 16, 64, 1])];
+            tensor<fp16, [1, 16, 64, 1]> q_cast_fp16 = reshape(shape = var_2027, x = q_rotated_cast_fp16)[name = string("q_cast_fp16")];
+            tensor<int32, [4]> var_2030 = const()[name = string("op_2030"), val = tensor<int32, [4]>([1, 16, 64, 256])];
+            tensor<fp16, [1, 16, 64, 256]> k_for_attn_cast_fp16 = reshape(shape = var_2030, x = key_cache_updated_cast_fp16)[name = string("k_for_attn_cast_fp16")];
+            tensor<int32, [4]> var_2032 = const()[name = string("op_2032"), val = tensor<int32, [4]>([1, 16, 64, 256])];
+            tensor<fp16, [1, 16, 64, 256]> v_for_attn_cast_fp16 = reshape(shape = var_2032, x = value_cache_updated_cast_fp16)[name = string("v_for_attn_cast_fp16")];
+            bool var_2036_transpose_x_1 = const()[name = string("op_2036_transpose_x_1"), val = bool(true)];
+            bool var_2036_transpose_y_1 = const()[name = string("op_2036_transpose_y_1"), val = bool(false)];
+            tensor<fp16, [1, 16, 1, 256]> var_2036_cast_fp16 = matmul(transpose_x = var_2036_transpose_x_1, transpose_y = var_2036_transpose_y_1, x = q_cast_fp16, y = k_for_attn_cast_fp16)[name = string("op_2036_cast_fp16")];
+            fp16 var_2037_to_fp16 = const()[name = string("op_2037_to_fp16"), val = fp16(0x1p-3)];
+            tensor<fp16, [1, 16, 1, 256]> attn_weights_29_cast_fp16 = mul(x = var_2036_cast_fp16, y = var_2037_to_fp16)[name = string("attn_weights_29_cast_fp16")];
+            tensor<fp16, [1, 16, 1, 256]> input_111_cast_fp16 = add(x = attn_weights_29_cast_fp16, y = attn_mask_1_cast_fp16)[name = string("input_111_cast_fp16")];
+            tensor<fp16, [1, 16, 1, 256]> attn_weights_cast_fp16 = softmax(axis = var_28, x = input_111_cast_fp16)[name = string("attn_weights_cast_fp16")];
+            bool attn_output_29_transpose_x_1 = const()[name = string("attn_output_29_transpose_x_1"), val = bool(false)];
+            bool attn_output_29_transpose_y_1 = const()[name = string("attn_output_29_transpose_y_1"), val = bool(true)];
+            tensor<fp16, [1, 16, 1, 64]> attn_output_29_cast_fp16 = matmul(transpose_x = attn_output_29_transpose_x_1, transpose_y = attn_output_29_transpose_y_1, x = attn_weights_cast_fp16, y = v_for_attn_cast_fp16)[name = string("attn_output_29_cast_fp16")];
+            tensor<int32, [4]> var_2046 = const()[name = string("op_2046"), val = tensor<int32, [4]>([0, 1, 3, 2])];
+            tensor<int32, [4]> var_2048 = const()[name = string("op_2048"), val = tensor<int32, [4]>([1, -1, 1, 1])];
+            tensor<fp16, [1, 16, 64, 1]> attn_output_cast_fp16 = transpose(perm = var_2046, x = attn_output_29_cast_fp16)[name = string("transpose_4")];
+            tensor<fp16, [1, 1024, 1, 1]> input_113_cast_fp16 = reshape(shape = var_2048, x = attn_output_cast_fp16)[name = string("input_113_cast_fp16")];
+            string x_47_pad_type_0 = const()[name = string("x_47_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> x_47_strides_0 = const()[name = string("x_47_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> x_47_pad_0 = const()[name = string("x_47_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> x_47_dilations_0 = const()[name = string("x_47_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 x_47_groups_0 = const()[name = string("x_47_groups_0"), val = int32(1)];
+            tensor<fp16, [512, 1024, 1, 1]> var_2063_weight_0_to_fp16 = const()[name = string("op_2063_weight_0_to_fp16"), val = tensor<fp16, [512, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(76126976)))];
+            tensor<fp16, [512]> var_2063_bias_0_to_fp16 = const()[name = string("op_2063_bias_0_to_fp16"), val = tensor<fp16, [512]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(77175616)))];
+            tensor<fp16, [1, 512, 1, 1]> var_2063_cast_fp16 = conv(bias = var_2063_bias_0_to_fp16, dilations = x_47_dilations_0, groups = x_47_groups_0, pad = x_47_pad_0, pad_type = x_47_pad_type_0, strides = x_47_strides_0, weight = var_2063_weight_0_to_fp16, x = input_113_cast_fp16)[name = string("op_2063_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 1]> inputs_31_cast_fp16 = add(x = inputs_29_cast_fp16, y = var_2063_cast_fp16)[name = string("inputs_31_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 1]> inputs_sq_31_cast_fp16 = mul(x = inputs_31_cast_fp16, y = inputs_31_cast_fp16)[name = string("inputs_sq_31_cast_fp16")];
+            tensor<int32, [1]> variance_31_axes_0 = const()[name = string("variance_31_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_31_keep_dims_0 = const()[name = string("variance_31_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 1, 1, 1]> variance_31_cast_fp16 = reduce_mean(axes = variance_31_axes_0, keep_dims = variance_31_keep_dims_0, x = inputs_sq_31_cast_fp16)[name = string("variance_31_cast_fp16")];
+            fp16 var_2069_to_fp16 = const()[name = string("op_2069_to_fp16"), val = fp16(0x1.5p-17)];
+            tensor<fp16, [1, 1, 1, 1]> var_2070_cast_fp16 = add(x = variance_31_cast_fp16, y = var_2069_to_fp16)[name = string("op_2070_cast_fp16")];
+            fp32 var_2071_epsilon_0 = const()[name = string("op_2071_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 1, 1, 1]> var_2071_cast_fp16 = rsqrt(epsilon = var_2071_epsilon_0, x = var_2070_cast_fp16)[name = string("op_2071_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 1]> hidden_states_31_cast_fp16 = mul(x = inputs_31_cast_fp16, y = var_2071_cast_fp16)[name = string("hidden_states_31_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 1]> w_31_to_fp16 = const()[name = string("w_31_to_fp16"), val = tensor<fp16, [1, 512, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(77176704)))];
+            tensor<fp16, [1, 512, 1, 1]> input_115_cast_fp16 = mul(x = w_31_to_fp16, y = hidden_states_31_cast_fp16)[name = string("input_115_cast_fp16")];
+            string input_117_pad_type_0 = const()[name = string("input_117_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> input_117_strides_0 = const()[name = string("input_117_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> input_117_pad_0 = const()[name = string("input_117_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> input_117_dilations_0 = const()[name = string("input_117_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 input_117_groups_0 = const()[name = string("input_117_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 512, 1, 1]> decoder_pre_transformer_layers_7_mlp_gate_proj_weight_to_fp16 = const()[name = string("decoder_pre_transformer_layers_7_mlp_gate_proj_weight_to_fp16"), val = tensor<fp16, [1024, 512, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(77177792)))];
+            tensor<fp16, [1, 1024, 1, 1]> input_117_cast_fp16 = conv(dilations = input_117_dilations_0, groups = input_117_groups_0, pad = input_117_pad_0, pad_type = input_117_pad_type_0, strides = input_117_strides_0, weight = decoder_pre_transformer_layers_7_mlp_gate_proj_weight_to_fp16, x = input_115_cast_fp16)[name = string("input_117_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> var_2085_cast_fp16 = silu(x = input_117_cast_fp16)[name = string("op_2085_cast_fp16")];
+            string var_2091_pad_type_0 = const()[name = string("op_2091_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_2091_strides_0 = const()[name = string("op_2091_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_2091_pad_0 = const()[name = string("op_2091_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_2091_dilations_0 = const()[name = string("op_2091_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_2091_groups_0 = const()[name = string("op_2091_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 512, 1, 1]> decoder_pre_transformer_layers_7_mlp_up_proj_weight_to_fp16 = const()[name = string("decoder_pre_transformer_layers_7_mlp_up_proj_weight_to_fp16"), val = tensor<fp16, [1024, 512, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(78226432)))];
+            tensor<fp16, [1, 1024, 1, 1]> var_2091_cast_fp16 = conv(dilations = var_2091_dilations_0, groups = var_2091_groups_0, pad = var_2091_pad_0, pad_type = var_2091_pad_type_0, strides = var_2091_strides_0, weight = decoder_pre_transformer_layers_7_mlp_up_proj_weight_to_fp16, x = input_115_cast_fp16)[name = string("op_2091_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 1]> input_119_cast_fp16 = mul(x = var_2085_cast_fp16, y = var_2091_cast_fp16)[name = string("input_119_cast_fp16")];
+            string x_49_pad_type_0 = const()[name = string("x_49_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> x_49_strides_0 = const()[name = string("x_49_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> x_49_pad_0 = const()[name = string("x_49_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> x_49_dilations_0 = const()[name = string("x_49_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 x_49_groups_0 = const()[name = string("x_49_groups_0"), val = int32(1)];
+            tensor<fp16, [512, 1024, 1, 1]> var_2102_weight_0_to_fp16 = const()[name = string("op_2102_weight_0_to_fp16"), val = tensor<fp16, [512, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(79275072)))];
+            tensor<fp16, [512]> var_2102_bias_0_to_fp16 = const()[name = string("op_2102_bias_0_to_fp16"), val = tensor<fp16, [512]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(80323712)))];
+            tensor<fp16, [1, 512, 1, 1]> var_2102_cast_fp16 = conv(bias = var_2102_bias_0_to_fp16, dilations = x_49_dilations_0, groups = x_49_groups_0, pad = x_49_pad_0, pad_type = x_49_pad_type_0, strides = x_49_strides_0, weight = var_2102_weight_0_to_fp16, x = input_119_cast_fp16)[name = string("op_2102_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 1]> inputs_cast_fp16 = add(x = inputs_31_cast_fp16, y = var_2102_cast_fp16)[name = string("inputs_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 1]> inputs_sq_cast_fp16 = mul(x = inputs_cast_fp16, y = inputs_cast_fp16)[name = string("inputs_sq_cast_fp16")];
+            tensor<int32, [1]> variance_axes_0 = const()[name = string("variance_axes_0"), val = tensor<int32, [1]>([1])];
+            bool variance_keep_dims_0 = const()[name = string("variance_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 1, 1, 1]> variance_cast_fp16 = reduce_mean(axes = variance_axes_0, keep_dims = variance_keep_dims_0, x = inputs_sq_cast_fp16)[name = string("variance_cast_fp16")];
+            fp16 var_2112_to_fp16 = const()[name = string("op_2112_to_fp16"), val = fp16(0x1.5p-17)];
+            tensor<fp16, [1, 1, 1, 1]> var_2113_cast_fp16 = add(x = variance_cast_fp16, y = var_2112_to_fp16)[name = string("op_2113_cast_fp16")];
+            fp32 var_2114_epsilon_0 = const()[name = string("op_2114_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 1, 1, 1]> var_2114_cast_fp16 = rsqrt(epsilon = var_2114_epsilon_0, x = var_2113_cast_fp16)[name = string("op_2114_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 1]> hidden_states_cast_fp16 = mul(x = inputs_cast_fp16, y = var_2114_cast_fp16)[name = string("hidden_states_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 1]> w_to_fp16 = const()[name = string("w_to_fp16"), val = tensor<fp16, [1, 512, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(80324800)))];
+            tensor<fp16, [1, 512, 1, 1]> input_121_cast_fp16 = mul(x = w_to_fp16, y = hidden_states_cast_fp16)[name = string("input_121_cast_fp16")];
+            string new_hidden_pad_type_0 = const()[name = string("new_hidden_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> new_hidden_strides_0 = const()[name = string("new_hidden_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> new_hidden_pad_0 = const()[name = string("new_hidden_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> new_hidden_dilations_0 = const()[name = string("new_hidden_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 new_hidden_groups_0 = const()[name = string("new_hidden_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 512, 1, 1]> decoder_pre_transformer_output_proj_weight_to_fp16 = const()[name = string("decoder_pre_transformer_output_proj_weight_to_fp16"), val = tensor<fp16, [1024, 512, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(80325888)))];
+            tensor<fp16, [1024]> decoder_pre_transformer_output_proj_bias_to_fp16 = const()[name = string("decoder_pre_transformer_output_proj_bias_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(81374528)))];
+            tensor<fp16, [1, 1024, 1, 1]> hidden_context_update = conv(bias = decoder_pre_transformer_output_proj_bias_to_fp16, dilations = new_hidden_dilations_0, groups = new_hidden_groups_0, pad = new_hidden_pad_0, pad_type = new_hidden_pad_type_0, strides = new_hidden_strides_0, weight = decoder_pre_transformer_output_proj_weight_to_fp16, x = input_121_cast_fp16)[name = string("new_hidden_cast_fp16")];
+            bool var_2127_interleave_0 = const()[name = string("op_2127_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8192, 1, 1]> key_cache_updates = concat(axis = var_32, interleave = var_2127_interleave_0, values = (current_key_1_cast_fp16, current_key_3_cast_fp16, current_key_5_cast_fp16, current_key_7_cast_fp16, current_key_9_cast_fp16, current_key_11_cast_fp16, current_key_13_cast_fp16, current_key_cast_fp16))[name = string("op_2127_cast_fp16")];
+            bool var_2129_interleave_0 = const()[name = string("op_2129_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8192, 1, 1]> value_cache_updates = concat(axis = var_32, interleave = var_2129_interleave_0, values = (v_1_cast_fp16, v_3_cast_fp16, v_5_cast_fp16, v_7_cast_fp16, v_9_cast_fp16, v_11_cast_fp16, v_13_cast_fp16, v_cast_fp16))[name = string("op_2129_cast_fp16")];
+            bool x_51_interleave_0 = const()[name = string("x_51_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1024, 1, 5]> x_51_cast_fp16 = concat(axis = var_28, interleave = x_51_interleave_0, values = (hidden_context, hidden_context_update))[name = string("x_51_cast_fp16")];
+            string x_53_pad_type_0 = const()[name = string("x_53_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> x_53_strides_0 = const()[name = string("x_53_strides_0"), val = tensor<int32, [2]>([1, 2])];
+            tensor<int32, [4]> x_53_pad_0 = const()[name = string("x_53_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> x_53_dilations_0 = const()[name = string("x_53_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 x_53_groups_0 = const()[name = string("x_53_groups_0"), val = int32(1)];
+            tensor<int32, [4]> x_53_has_output_shape_output_shape_0 = const()[name = string("x_53_has_output_shape_output_shape_0"), val = tensor<int32, [4]>([1, 1024, 1, 10])];
+            tensor<fp16, [1024, 1024, 1, 2]> decoder_upsample_0_0_conv_weight_to_fp16 = const()[name = string("decoder_upsample_0_0_conv_weight_to_fp16"), val = tensor<fp16, [1024, 1024, 1, 2]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(81376640)))];
+            tensor<fp16, [1024]> decoder_upsample_0_0_conv_bias_to_fp16 = const()[name = string("decoder_upsample_0_0_conv_bias_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(85571008)))];
+            tensor<fp16, [1, 1024, 1, 10]> x_53_has_output_shape_cast_fp16 = conv_transpose(bias = decoder_upsample_0_0_conv_bias_to_fp16, dilations = x_53_dilations_0, groups = x_53_groups_0, output_shape = x_53_has_output_shape_output_shape_0, pad = x_53_pad_0, pad_type = x_53_pad_type_0, strides = x_53_strides_0, weight = decoder_upsample_0_0_conv_weight_to_fp16, x = x_51_cast_fp16)[name = string("x_53_has_output_shape_cast_fp16")];
+            tensor<int32, [8]> input_123_pad_0 = const()[name = string("input_123_pad_0"), val = tensor<int32, [8]>([0, 0, 0, 0, 0, 0, 6, 0])];
+            string input_123_mode_0 = const()[name = string("input_123_mode_0"), val = string("constant")];
+            fp16 const_86_to_fp16 = const()[name = string("const_86_to_fp16"), val = fp16(0x0p+0)];
+            tensor<fp16, [1, 1024, 1, 16]> input_123_cast_fp16 = pad(constant_val = const_86_to_fp16, mode = input_123_mode_0, pad = input_123_pad_0, x = x_53_has_output_shape_cast_fp16)[name = string("input_123_cast_fp16")];
+            string x_57_pad_type_0 = const()[name = string("x_57_pad_type_0"), val = string("valid")];
+            int32 x_57_groups_0 = const()[name = string("x_57_groups_0"), val = int32(1024)];
+            tensor<int32, [2]> x_57_strides_0 = const()[name = string("x_57_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> x_57_pad_0 = const()[name = string("x_57_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> x_57_dilations_0 = const()[name = string("x_57_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<fp16, [1024, 1, 1, 7]> decoder_upsample_0_1_dwconv_conv_weight_to_fp16 = const()[name = string("decoder_upsample_0_1_dwconv_conv_weight_to_fp16"), val = tensor<fp16, [1024, 1, 1, 7]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(85573120)))];
+            tensor<fp16, [1024]> decoder_upsample_0_1_dwconv_conv_bias_to_fp16 = const()[name = string("decoder_upsample_0_1_dwconv_conv_bias_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(85587520)))];
+            tensor<fp16, [1, 1024, 1, 10]> x_57_cast_fp16 = conv(bias = decoder_upsample_0_1_dwconv_conv_bias_to_fp16, dilations = x_57_dilations_0, groups = x_57_groups_0, pad = x_57_pad_0, pad_type = x_57_pad_type_0, strides = x_57_strides_0, weight = decoder_upsample_0_1_dwconv_conv_weight_to_fp16, x = input_123_cast_fp16)[name = string("x_57_cast_fp16")];
+            tensor<int32, [1]> var_2169_axes_0 = const()[name = string("op_2169_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 1024, 10]> var_2169_cast_fp16 = squeeze(axes = var_2169_axes_0, x = x_57_cast_fp16)[name = string("op_2169_cast_fp16")];
+            tensor<int32, [3]> var_2170 = const()[name = string("op_2170"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> x_59_axes_0 = const()[name = string("x_59_axes_0"), val = tensor<int32, [1]>([-1])];
+            tensor<fp16, [1024]> decoder_upsample_0_1_norm_weight_to_fp16 = const()[name = string("decoder_upsample_0_1_norm_weight_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(85589632)))];
+            tensor<fp16, [1024]> decoder_upsample_0_1_norm_bias_to_fp16 = const()[name = string("decoder_upsample_0_1_norm_bias_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(85591744)))];
+            fp16 var_17_to_fp16 = const()[name = string("op_17_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 10, 1024]> input_125_cast_fp16 = transpose(perm = var_2170, x = var_2169_cast_fp16)[name = string("transpose_3")];
+            tensor<fp16, [1, 10, 1024]> x_59_cast_fp16 = layer_norm(axes = x_59_axes_0, beta = decoder_upsample_0_1_norm_bias_to_fp16, epsilon = var_17_to_fp16, gamma = decoder_upsample_0_1_norm_weight_to_fp16, x = input_125_cast_fp16)[name = string("x_59_cast_fp16")];
+            tensor<int32, [3]> var_2176 = const()[name = string("op_2176"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_127_axes_0 = const()[name = string("input_127_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 1024, 10]> var_2177_cast_fp16 = transpose(perm = var_2176, x = x_59_cast_fp16)[name = string("transpose_2")];
+            tensor<fp16, [1, 1024, 1, 10]> input_127_cast_fp16 = expand_dims(axes = input_127_axes_0, x = var_2177_cast_fp16)[name = string("input_127_cast_fp16")];
+            string input_129_pad_type_0 = const()[name = string("input_129_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> input_129_strides_0 = const()[name = string("input_129_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> input_129_pad_0 = const()[name = string("input_129_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> input_129_dilations_0 = const()[name = string("input_129_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 input_129_groups_0 = const()[name = string("input_129_groups_0"), val = int32(1)];
+            tensor<fp16, [4096, 1024, 1, 1]> decoder_upsample_0_1_pwconv1_weight_to_fp16 = const()[name = string("decoder_upsample_0_1_pwconv1_weight_to_fp16"), val = tensor<fp16, [4096, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(85593856)))];
+            tensor<fp16, [4096]> decoder_upsample_0_1_pwconv1_bias_to_fp16 = const()[name = string("decoder_upsample_0_1_pwconv1_bias_to_fp16"), val = tensor<fp16, [4096]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(93982528)))];
+            tensor<fp16, [1, 4096, 1, 10]> input_129_cast_fp16 = conv(bias = decoder_upsample_0_1_pwconv1_bias_to_fp16, dilations = input_129_dilations_0, groups = input_129_groups_0, pad = input_129_pad_0, pad_type = input_129_pad_type_0, strides = input_129_strides_0, weight = decoder_upsample_0_1_pwconv1_weight_to_fp16, x = input_127_cast_fp16)[name = string("input_129_cast_fp16")];
+            string input_131_mode_0 = const()[name = string("input_131_mode_0"), val = string("EXACT")];
+            tensor<fp16, [1, 4096, 1, 10]> input_131_cast_fp16 = gelu(mode = input_131_mode_0, x = input_129_cast_fp16)[name = string("input_131_cast_fp16")];
+            string x_61_pad_type_0 = const()[name = string("x_61_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> x_61_strides_0 = const()[name = string("x_61_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> x_61_pad_0 = const()[name = string("x_61_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> x_61_dilations_0 = const()[name = string("x_61_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 x_61_groups_0 = const()[name = string("x_61_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 4096, 1, 1]> x_63_weight_0_to_fp16 = const()[name = string("x_63_weight_0_to_fp16"), val = tensor<fp16, [1024, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(93990784)))];
+            tensor<fp16, [1024]> x_63_bias_0_to_fp16 = const()[name = string("x_63_bias_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(102379456)))];
+            tensor<fp16, [1, 1024, 1, 10]> x_63_cast_fp16 = conv(bias = x_63_bias_0_to_fp16, dilations = x_61_dilations_0, groups = x_61_groups_0, pad = x_61_pad_0, pad_type = x_61_pad_type_0, strides = x_61_strides_0, weight = x_63_weight_0_to_fp16, x = input_131_cast_fp16)[name = string("x_63_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 10]> x_65_cast_fp16 = add(x = x_53_has_output_shape_cast_fp16, y = x_63_cast_fp16)[name = string("x_65_cast_fp16")];
+            string x_67_pad_type_0 = const()[name = string("x_67_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> x_67_strides_0 = const()[name = string("x_67_strides_0"), val = tensor<int32, [2]>([1, 2])];
+            tensor<int32, [4]> x_67_pad_0 = const()[name = string("x_67_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> x_67_dilations_0 = const()[name = string("x_67_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 x_67_groups_0 = const()[name = string("x_67_groups_0"), val = int32(1)];
+            tensor<int32, [4]> x_67_has_output_shape_output_shape_0 = const()[name = string("x_67_has_output_shape_output_shape_0"), val = tensor<int32, [4]>([1, 1024, 1, 20])];
+            tensor<fp16, [1024, 1024, 1, 2]> decoder_upsample_1_0_conv_weight_to_fp16 = const()[name = string("decoder_upsample_1_0_conv_weight_to_fp16"), val = tensor<fp16, [1024, 1024, 1, 2]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(102381568)))];
+            tensor<fp16, [1024]> decoder_upsample_1_0_conv_bias_to_fp16 = const()[name = string("decoder_upsample_1_0_conv_bias_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(106575936)))];
+            tensor<fp16, [1, 1024, 1, 20]> x_67_has_output_shape_cast_fp16 = conv_transpose(bias = decoder_upsample_1_0_conv_bias_to_fp16, dilations = x_67_dilations_0, groups = x_67_groups_0, output_shape = x_67_has_output_shape_output_shape_0, pad = x_67_pad_0, pad_type = x_67_pad_type_0, strides = x_67_strides_0, weight = decoder_upsample_1_0_conv_weight_to_fp16, x = x_65_cast_fp16)[name = string("x_67_has_output_shape_cast_fp16")];
+            tensor<int32, [8]> input_133_pad_0 = const()[name = string("input_133_pad_0"), val = tensor<int32, [8]>([0, 0, 0, 0, 0, 0, 6, 0])];
+            string input_133_mode_0 = const()[name = string("input_133_mode_0"), val = string("constant")];
+            fp16 const_88_to_fp16 = const()[name = string("const_88_to_fp16"), val = fp16(0x0p+0)];
+            tensor<fp16, [1, 1024, 1, 26]> input_133_cast_fp16 = pad(constant_val = const_88_to_fp16, mode = input_133_mode_0, pad = input_133_pad_0, x = x_67_has_output_shape_cast_fp16)[name = string("input_133_cast_fp16")];
+            string x_71_pad_type_0 = const()[name = string("x_71_pad_type_0"), val = string("valid")];
+            int32 x_71_groups_0 = const()[name = string("x_71_groups_0"), val = int32(1024)];
+            tensor<int32, [2]> x_71_strides_0 = const()[name = string("x_71_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> x_71_pad_0 = const()[name = string("x_71_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> x_71_dilations_0 = const()[name = string("x_71_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<fp16, [1024, 1, 1, 7]> decoder_upsample_1_1_dwconv_conv_weight_to_fp16 = const()[name = string("decoder_upsample_1_1_dwconv_conv_weight_to_fp16"), val = tensor<fp16, [1024, 1, 1, 7]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(106578048)))];
+            tensor<fp16, [1024]> decoder_upsample_1_1_dwconv_conv_bias_to_fp16 = const()[name = string("decoder_upsample_1_1_dwconv_conv_bias_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(106592448)))];
+            tensor<fp16, [1, 1024, 1, 20]> x_71_cast_fp16 = conv(bias = decoder_upsample_1_1_dwconv_conv_bias_to_fp16, dilations = x_71_dilations_0, groups = x_71_groups_0, pad = x_71_pad_0, pad_type = x_71_pad_type_0, strides = x_71_strides_0, weight = decoder_upsample_1_1_dwconv_conv_weight_to_fp16, x = input_133_cast_fp16)[name = string("x_71_cast_fp16")];
+            tensor<int32, [1]> var_2231_axes_0 = const()[name = string("op_2231_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 1024, 20]> var_2231_cast_fp16 = squeeze(axes = var_2231_axes_0, x = x_71_cast_fp16)[name = string("op_2231_cast_fp16")];
+            tensor<int32, [3]> var_2232 = const()[name = string("op_2232"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> x_73_axes_0 = const()[name = string("x_73_axes_0"), val = tensor<int32, [1]>([-1])];
+            tensor<fp16, [1024]> decoder_upsample_1_1_norm_weight_to_fp16 = const()[name = string("decoder_upsample_1_1_norm_weight_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(106594560)))];
+            tensor<fp16, [1024]> decoder_upsample_1_1_norm_bias_to_fp16 = const()[name = string("decoder_upsample_1_1_norm_bias_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(106596672)))];
+            tensor<fp16, [1, 20, 1024]> input_135_cast_fp16 = transpose(perm = var_2232, x = var_2231_cast_fp16)[name = string("transpose_1")];
+            tensor<fp16, [1, 20, 1024]> x_73_cast_fp16 = layer_norm(axes = x_73_axes_0, beta = decoder_upsample_1_1_norm_bias_to_fp16, epsilon = var_17_to_fp16, gamma = decoder_upsample_1_1_norm_weight_to_fp16, x = input_135_cast_fp16)[name = string("x_73_cast_fp16")];
+            tensor<int32, [3]> var_2238 = const()[name = string("op_2238"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_137_axes_0 = const()[name = string("input_137_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 1024, 20]> var_2239_cast_fp16 = transpose(perm = var_2238, x = x_73_cast_fp16)[name = string("transpose_0")];
+            tensor<fp16, [1, 1024, 1, 20]> input_137_cast_fp16 = expand_dims(axes = input_137_axes_0, x = var_2239_cast_fp16)[name = string("input_137_cast_fp16")];
+            string input_139_pad_type_0 = const()[name = string("input_139_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> input_139_strides_0 = const()[name = string("input_139_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> input_139_pad_0 = const()[name = string("input_139_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> input_139_dilations_0 = const()[name = string("input_139_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 input_139_groups_0 = const()[name = string("input_139_groups_0"), val = int32(1)];
+            tensor<fp16, [4096, 1024, 1, 1]> decoder_upsample_1_1_pwconv1_weight_to_fp16 = const()[name = string("decoder_upsample_1_1_pwconv1_weight_to_fp16"), val = tensor<fp16, [4096, 1024, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(106598784)))];
+            tensor<fp16, [4096]> decoder_upsample_1_1_pwconv1_bias_to_fp16 = const()[name = string("decoder_upsample_1_1_pwconv1_bias_to_fp16"), val = tensor<fp16, [4096]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(114987456)))];
+            tensor<fp16, [1, 4096, 1, 20]> input_139_cast_fp16 = conv(bias = decoder_upsample_1_1_pwconv1_bias_to_fp16, dilations = input_139_dilations_0, groups = input_139_groups_0, pad = input_139_pad_0, pad_type = input_139_pad_type_0, strides = input_139_strides_0, weight = decoder_upsample_1_1_pwconv1_weight_to_fp16, x = input_137_cast_fp16)[name = string("input_139_cast_fp16")];
+            string input_141_mode_0 = const()[name = string("input_141_mode_0"), val = string("EXACT")];
+            tensor<fp16, [1, 4096, 1, 20]> input_141_cast_fp16 = gelu(mode = input_141_mode_0, x = input_139_cast_fp16)[name = string("input_141_cast_fp16")];
+            string x_75_pad_type_0 = const()[name = string("x_75_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> x_75_strides_0 = const()[name = string("x_75_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> x_75_pad_0 = const()[name = string("x_75_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> x_75_dilations_0 = const()[name = string("x_75_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 x_75_groups_0 = const()[name = string("x_75_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 4096, 1, 1]> x_77_weight_0_to_fp16 = const()[name = string("x_77_weight_0_to_fp16"), val = tensor<fp16, [1024, 4096, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(114995712)))];
+            tensor<fp16, [1024]> x_77_bias_0_to_fp16 = const()[name = string("x_77_bias_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(123384384)))];
+            tensor<fp16, [1, 1024, 1, 20]> x_77_cast_fp16 = conv(bias = x_77_bias_0_to_fp16, dilations = x_75_dilations_0, groups = x_75_groups_0, pad = x_75_pad_0, pad_type = x_75_pad_type_0, strides = x_75_strides_0, weight = x_77_weight_0_to_fp16, x = input_141_cast_fp16)[name = string("x_77_cast_fp16")];
+            tensor<fp16, [1, 1024, 1, 20]> x_79_cast_fp16 = add(x = x_67_has_output_shape_cast_fp16, y = x_77_cast_fp16)[name = string("x_79_cast_fp16")];
+            tensor<int32, [8]> input_143_pad_0 = const()[name = string("input_143_pad_0"), val = tensor<int32, [8]>([0, 0, 0, 0, 0, 0, 6, 0])];
+            string input_143_mode_0 = const()[name = string("input_143_mode_0"), val = string("constant")];
+            fp16 const_89_to_fp16 = const()[name = string("const_89_to_fp16"), val = fp16(0x0p+0)];
+            tensor<fp16, [1, 1024, 1, 26]> input_143_cast_fp16 = pad(constant_val = const_89_to_fp16, mode = input_143_mode_0, pad = input_143_pad_0, x = x_79_cast_fp16)[name = string("input_143_cast_fp16")];
+            string x_81_pad_type_0 = const()[name = string("x_81_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> x_81_strides_0 = const()[name = string("x_81_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> x_81_pad_0 = const()[name = string("x_81_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> x_81_dilations_0 = const()[name = string("x_81_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 x_81_groups_0 = const()[name = string("x_81_groups_0"), val = int32(1)];
+            tensor<fp16, [1536, 1024, 1, 7]> decoder_decoder_0_conv_weight_to_fp16 = const()[name = string("decoder_decoder_0_conv_weight_to_fp16"), val = tensor<fp16, [1536, 1024, 1, 7]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(123386496)))];
+            tensor<fp16, [1536]> decoder_decoder_0_conv_bias_to_fp16 = const()[name = string("decoder_decoder_0_conv_bias_to_fp16"), val = tensor<fp16, [1536]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(145406656)))];
+            tensor<fp16, [1, 1536, 1, 20]> x_81_cast_fp16 = conv(bias = decoder_decoder_0_conv_bias_to_fp16, dilations = x_81_dilations_0, groups = x_81_groups_0, pad = x_81_pad_0, pad_type = x_81_pad_type_0, strides = x_81_strides_0, weight = decoder_decoder_0_conv_weight_to_fp16, x = input_143_cast_fp16)[name = string("x_81_cast_fp16")];
+            tensor<fp16, [1, 1536, 1, 1]> alpha_1_to_fp16 = const()[name = string("alpha_1_to_fp16"), val = tensor<fp16, [1, 1536, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(145409792)))];
+            tensor<fp16, [1, 1536, 1, 20]> var_2288_cast_fp16 = mul(x = x_81_cast_fp16, y = alpha_1_to_fp16)[name = string("op_2288_cast_fp16")];
+            tensor<fp16, [1, 1536, 1, 20]> sin_val_1_cast_fp16 = sin(x = var_2288_cast_fp16)[name = string("sin_val_1_cast_fp16")];
+            tensor<fp16, [1, 1536, 1, 20]> var_2295_cast_fp16 = mul(x = sin_val_1_cast_fp16, y = sin_val_1_cast_fp16)[name = string("op_2295_cast_fp16")];
+            tensor<fp16, [1, 1536, 1, 1]> var_2292_to_fp16 = const()[name = string("op_2292_to_fp16"), val = tensor<fp16, [1, 1536, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(145412928)))];
+            tensor<fp16, [1, 1536, 1, 20]> var_2296_cast_fp16 = mul(x = var_2292_to_fp16, y = var_2295_cast_fp16)[name = string("op_2296_cast_fp16")];
+            tensor<fp16, [1, 1536, 1, 20]> x_83_cast_fp16 = add(x = x_81_cast_fp16, y = var_2296_cast_fp16)[name = string("x_83_cast_fp16")];
+            string x_85_pad_type_0 = const()[name = string("x_85_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> x_85_strides_0 = const()[name = string("x_85_strides_0"), val = tensor<int32, [2]>([1, 8])];
+            tensor<int32, [4]> x_85_pad_0 = const()[name = string("x_85_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> x_85_dilations_0 = const()[name = string("x_85_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 x_85_groups_0 = const()[name = string("x_85_groups_0"), val = int32(1)];
+            tensor<int32, [4]> x_85_has_output_shape_output_shape_0 = const()[name = string("x_85_has_output_shape_output_shape_0"), val = tensor<int32, [4]>([1, 768, 1, 168])];
+            tensor<fp16, [1536, 768, 1, 16]> decoder_decoder_1_block_1_conv_weight_to_fp16 = const()[name = string("decoder_decoder_1_block_1_conv_weight_to_fp16"), val = tensor<fp16, [1536, 768, 1, 16]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(145416064)))];
+            tensor<fp16, [768]> decoder_decoder_1_block_1_conv_bias_to_fp16 = const()[name = string("decoder_decoder_1_block_1_conv_bias_to_fp16"), val = tensor<fp16, [768]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(183164864)))];
+            tensor<fp16, [1, 768, 1, 168]> x_85_has_output_shape_cast_fp16 = conv_transpose(bias = decoder_decoder_1_block_1_conv_bias_to_fp16, dilations = x_85_dilations_0, groups = x_85_groups_0, output_shape = x_85_has_output_shape_output_shape_0, pad = x_85_pad_0, pad_type = x_85_pad_type_0, strides = x_85_strides_0, weight = decoder_decoder_1_block_1_conv_weight_to_fp16, x = x_83_cast_fp16)[name = string("x_85_has_output_shape_cast_fp16")];
+            tensor<int32, [4]> x_87_begin_0 = const()[name = string("x_87_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 8])];
+            tensor<int32, [4]> x_87_end_0 = const()[name = string("x_87_end_0"), val = tensor<int32, [4]>([1, 768, 1, 160])];
+            tensor<bool, [4]> x_87_end_mask_0 = const()[name = string("x_87_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 768, 1, 152]> x_87_cast_fp16 = slice_by_index(begin = x_87_begin_0, end = x_87_end_0, end_mask = x_87_end_mask_0, x = x_85_has_output_shape_cast_fp16)[name = string("x_87_cast_fp16")];
+            tensor<fp16, [1, 768, 1, 1]> alpha_3_to_fp16 = const()[name = string("alpha_3_to_fp16"), val = tensor<fp16, [1, 768, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(183166464)))];
+            tensor<fp16, [1, 768, 1, 152]> var_2328_cast_fp16 = mul(x = x_87_cast_fp16, y = alpha_3_to_fp16)[name = string("op_2328_cast_fp16")];
+            tensor<fp16, [1, 768, 1, 152]> sin_val_3_cast_fp16 = sin(x = var_2328_cast_fp16)[name = string("sin_val_3_cast_fp16")];
+            tensor<fp16, [1, 768, 1, 152]> var_2335_cast_fp16 = mul(x = sin_val_3_cast_fp16, y = sin_val_3_cast_fp16)[name = string("op_2335_cast_fp16")];
+            tensor<fp16, [1, 768, 1, 1]> var_2332_to_fp16 = const()[name = string("op_2332_to_fp16"), val = tensor<fp16, [1, 768, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(183168064)))];
+            tensor<fp16, [1, 768, 1, 152]> var_2336_cast_fp16 = mul(x = var_2332_to_fp16, y = var_2335_cast_fp16)[name = string("op_2336_cast_fp16")];
+            tensor<fp16, [1, 768, 1, 152]> x_89_cast_fp16 = add(x = x_87_cast_fp16, y = var_2336_cast_fp16)[name = string("x_89_cast_fp16")];
+            tensor<int32, [8]> input_145_pad_0 = const()[name = string("input_145_pad_0"), val = tensor<int32, [8]>([0, 0, 0, 0, 0, 0, 6, 0])];
+            string input_145_mode_0 = const()[name = string("input_145_mode_0"), val = string("constant")];
+            fp16 const_91_to_fp16 = const()[name = string("const_91_to_fp16"), val = fp16(0x0p+0)];
+            tensor<fp16, [1, 768, 1, 158]> input_145_cast_fp16 = pad(constant_val = const_91_to_fp16, mode = input_145_mode_0, pad = input_145_pad_0, x = x_89_cast_fp16)[name = string("input_145_cast_fp16")];
+            string x_91_pad_type_0 = const()[name = string("x_91_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> x_91_strides_0 = const()[name = string("x_91_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> x_91_pad_0 = const()[name = string("x_91_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> x_91_dilations_0 = const()[name = string("x_91_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 x_91_groups_0 = const()[name = string("x_91_groups_0"), val = int32(1)];
+            tensor<fp16, [768, 768, 1, 7]> decoder_decoder_1_block_2_conv1_conv_weight_to_fp16 = const()[name = string("decoder_decoder_1_block_2_conv1_conv_weight_to_fp16"), val = tensor<fp16, [768, 768, 1, 7]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(183169664)))];
+            tensor<fp16, [768]> decoder_decoder_1_block_2_conv1_conv_bias_to_fp16 = const()[name = string("decoder_decoder_1_block_2_conv1_conv_bias_to_fp16"), val = tensor<fp16, [768]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(191427264)))];
+            tensor<fp16, [1, 768, 1, 152]> x_91_cast_fp16 = conv(bias = decoder_decoder_1_block_2_conv1_conv_bias_to_fp16, dilations = x_91_dilations_0, groups = x_91_groups_0, pad = x_91_pad_0, pad_type = x_91_pad_type_0, strides = x_91_strides_0, weight = decoder_decoder_1_block_2_conv1_conv_weight_to_fp16, x = input_145_cast_fp16)[name = string("x_91_cast_fp16")];
+            tensor<fp16, [1, 768, 1, 1]> alpha_5_to_fp16 = const()[name = string("alpha_5_to_fp16"), val = tensor<fp16, [1, 768, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(191428864)))];
+            tensor<fp16, [1, 768, 1, 152]> var_2356_cast_fp16 = mul(x = x_91_cast_fp16, y = alpha_5_to_fp16)[name = string("op_2356_cast_fp16")];
+            tensor<fp16, [1, 768, 1, 152]> sin_val_5_cast_fp16 = sin(x = var_2356_cast_fp16)[name = string("sin_val_5_cast_fp16")];
+            tensor<fp16, [1, 768, 1, 152]> var_2363_cast_fp16 = mul(x = sin_val_5_cast_fp16, y = sin_val_5_cast_fp16)[name = string("op_2363_cast_fp16")];
+            tensor<fp16, [1, 768, 1, 1]> var_2360_to_fp16 = const()[name = string("op_2360_to_fp16"), val = tensor<fp16, [1, 768, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(191430464)))];
+            tensor<fp16, [1, 768, 1, 152]> var_2364_cast_fp16 = mul(x = var_2360_to_fp16, y = var_2363_cast_fp16)[name = string("op_2364_cast_fp16")];
+            tensor<fp16, [1, 768, 1, 152]> x_93_cast_fp16 = add(x = x_91_cast_fp16, y = var_2364_cast_fp16)[name = string("x_93_cast_fp16")];
+            string x_95_pad_type_0 = const()[name = string("x_95_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> x_95_strides_0 = const()[name = string("x_95_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> x_95_pad_0 = const()[name = string("x_95_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> x_95_dilations_0 = const()[name = string("x_95_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 x_95_groups_0 = const()[name = string("x_95_groups_0"), val = int32(1)];
+            tensor<fp16, [768, 768, 1, 1]> decoder_decoder_1_block_2_conv2_conv_weight_to_fp16 = const()[name = string("decoder_decoder_1_block_2_conv2_conv_weight_to_fp16"), val = tensor<fp16, [768, 768, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(191432064)))];
+            tensor<fp16, [768]> decoder_decoder_1_block_2_conv2_conv_bias_to_fp16 = const()[name = string("decoder_decoder_1_block_2_conv2_conv_bias_to_fp16"), val = tensor<fp16, [768]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(192611776)))];
+            tensor<fp16, [1, 768, 1, 152]> x_95_cast_fp16 = conv(bias = decoder_decoder_1_block_2_conv2_conv_bias_to_fp16, dilations = x_95_dilations_0, groups = x_95_groups_0, pad = x_95_pad_0, pad_type = x_95_pad_type_0, strides = x_95_strides_0, weight = decoder_decoder_1_block_2_conv2_conv_weight_to_fp16, x = x_93_cast_fp16)[name = string("x_95_cast_fp16")];
+            tensor<fp16, [1, 768, 1, 152]> x_97_cast_fp16 = add(x = x_95_cast_fp16, y = x_87_cast_fp16)[name = string("x_97_cast_fp16")];
+            tensor<fp16, [1, 768, 1, 1]> alpha_7_to_fp16 = const()[name = string("alpha_7_to_fp16"), val = tensor<fp16, [1, 768, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(192613376)))];
+            tensor<fp16, [1, 768, 1, 152]> var_2389_cast_fp16 = mul(x = x_97_cast_fp16, y = alpha_7_to_fp16)[name = string("op_2389_cast_fp16")];
+            tensor<fp16, [1, 768, 1, 152]> sin_val_7_cast_fp16 = sin(x = var_2389_cast_fp16)[name = string("sin_val_7_cast_fp16")];
+            tensor<fp16, [1, 768, 1, 152]> var_2396_cast_fp16 = mul(x = sin_val_7_cast_fp16, y = sin_val_7_cast_fp16)[name = string("op_2396_cast_fp16")];
+            tensor<fp16, [1, 768, 1, 1]> var_2393_to_fp16 = const()[name = string("op_2393_to_fp16"), val = tensor<fp16, [1, 768, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(192614976)))];
+            tensor<fp16, [1, 768, 1, 152]> var_2397_cast_fp16 = mul(x = var_2393_to_fp16, y = var_2396_cast_fp16)[name = string("op_2397_cast_fp16")];
+            tensor<fp16, [1, 768, 1, 152]> x_99_cast_fp16 = add(x = x_97_cast_fp16, y = var_2397_cast_fp16)[name = string("x_99_cast_fp16")];
+            tensor<int32, [8]> input_149_pad_0 = const()[name = string("input_149_pad_0"), val = tensor<int32, [8]>([0, 0, 0, 0, 0, 0, 18, 0])];
+            string input_149_mode_0 = const()[name = string("input_149_mode_0"), val = string("constant")];
+            fp16 const_93_to_fp16 = const()[name = string("const_93_to_fp16"), val = fp16(0x0p+0)];
+            tensor<fp16, [1, 768, 1, 170]> input_149_cast_fp16 = pad(constant_val = const_93_to_fp16, mode = input_149_mode_0, pad = input_149_pad_0, x = x_99_cast_fp16)[name = string("input_149_cast_fp16")];
+            string x_101_pad_type_0 = const()[name = string("x_101_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> x_101_dilations_0 = const()[name = string("x_101_dilations_0"), val = tensor<int32, [2]>([1, 3])];
+            tensor<int32, [2]> x_101_strides_0 = const()[name = string("x_101_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> x_101_pad_0 = const()[name = string("x_101_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            int32 x_101_groups_0 = const()[name = string("x_101_groups_0"), val = int32(1)];
+            tensor<fp16, [768, 768, 1, 7]> decoder_decoder_1_block_3_conv1_conv_weight_to_fp16 = const()[name = string("decoder_decoder_1_block_3_conv1_conv_weight_to_fp16"), val = tensor<fp16, [768, 768, 1, 7]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(192616576)))];
+            tensor<fp16, [768]> decoder_decoder_1_block_3_conv1_conv_bias_to_fp16 = const()[name = string("decoder_decoder_1_block_3_conv1_conv_bias_to_fp16"), val = tensor<fp16, [768]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(200874176)))];
+            tensor<fp16, [1, 768, 1, 152]> x_101_cast_fp16 = conv(bias = decoder_decoder_1_block_3_conv1_conv_bias_to_fp16, dilations = x_101_dilations_0, groups = x_101_groups_0, pad = x_101_pad_0, pad_type = x_101_pad_type_0, strides = x_101_strides_0, weight = decoder_decoder_1_block_3_conv1_conv_weight_to_fp16, x = input_149_cast_fp16)[name = string("x_101_cast_fp16")];
+            tensor<fp16, [1, 768, 1, 1]> alpha_9_to_fp16 = const()[name = string("alpha_9_to_fp16"), val = tensor<fp16, [1, 768, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(200875776)))];
+            tensor<fp16, [1, 768, 1, 152]> var_2417_cast_fp16 = mul(x = x_101_cast_fp16, y = alpha_9_to_fp16)[name = string("op_2417_cast_fp16")];
+            tensor<fp16, [1, 768, 1, 152]> sin_val_9_cast_fp16 = sin(x = var_2417_cast_fp16)[name = string("sin_val_9_cast_fp16")];
+            tensor<fp16, [1, 768, 1, 152]> var_2424_cast_fp16 = mul(x = sin_val_9_cast_fp16, y = sin_val_9_cast_fp16)[name = string("op_2424_cast_fp16")];
+            tensor<fp16, [1, 768, 1, 1]> var_2421_to_fp16 = const()[name = string("op_2421_to_fp16"), val = tensor<fp16, [1, 768, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(200877376)))];
+            tensor<fp16, [1, 768, 1, 152]> var_2425_cast_fp16 = mul(x = var_2421_to_fp16, y = var_2424_cast_fp16)[name = string("op_2425_cast_fp16")];
+            tensor<fp16, [1, 768, 1, 152]> x_103_cast_fp16 = add(x = x_101_cast_fp16, y = var_2425_cast_fp16)[name = string("x_103_cast_fp16")];
+            string x_105_pad_type_0 = const()[name = string("x_105_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> x_105_strides_0 = const()[name = string("x_105_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> x_105_pad_0 = const()[name = string("x_105_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> x_105_dilations_0 = const()[name = string("x_105_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 x_105_groups_0 = const()[name = string("x_105_groups_0"), val = int32(1)];
+            tensor<fp16, [768, 768, 1, 1]> decoder_decoder_1_block_3_conv2_conv_weight_to_fp16 = const()[name = string("decoder_decoder_1_block_3_conv2_conv_weight_to_fp16"), val = tensor<fp16, [768, 768, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(200878976)))];
+            tensor<fp16, [768]> decoder_decoder_1_block_3_conv2_conv_bias_to_fp16 = const()[name = string("decoder_decoder_1_block_3_conv2_conv_bias_to_fp16"), val = tensor<fp16, [768]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(202058688)))];
+            tensor<fp16, [1, 768, 1, 152]> x_105_cast_fp16 = conv(bias = decoder_decoder_1_block_3_conv2_conv_bias_to_fp16, dilations = x_105_dilations_0, groups = x_105_groups_0, pad = x_105_pad_0, pad_type = x_105_pad_type_0, strides = x_105_strides_0, weight = decoder_decoder_1_block_3_conv2_conv_weight_to_fp16, x = x_103_cast_fp16)[name = string("x_105_cast_fp16")];
+            tensor<fp16, [1, 768, 1, 152]> x_107_cast_fp16 = add(x = x_105_cast_fp16, y = x_97_cast_fp16)[name = string("x_107_cast_fp16")];
+            tensor<fp16, [1, 768, 1, 1]> alpha_11_to_fp16 = const()[name = string("alpha_11_to_fp16"), val = tensor<fp16, [1, 768, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(202060288)))];
+            tensor<fp16, [1, 768, 1, 152]> var_2450_cast_fp16 = mul(x = x_107_cast_fp16, y = alpha_11_to_fp16)[name = string("op_2450_cast_fp16")];
+            tensor<fp16, [1, 768, 1, 152]> sin_val_11_cast_fp16 = sin(x = var_2450_cast_fp16)[name = string("sin_val_11_cast_fp16")];
+            tensor<fp16, [1, 768, 1, 152]> var_2457_cast_fp16 = mul(x = sin_val_11_cast_fp16, y = sin_val_11_cast_fp16)[name = string("op_2457_cast_fp16")];
+            tensor<fp16, [1, 768, 1, 1]> var_2454_to_fp16 = const()[name = string("op_2454_to_fp16"), val = tensor<fp16, [1, 768, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(202061888)))];
+            tensor<fp16, [1, 768, 1, 152]> var_2458_cast_fp16 = mul(x = var_2454_to_fp16, y = var_2457_cast_fp16)[name = string("op_2458_cast_fp16")];
+            tensor<fp16, [1, 768, 1, 152]> x_109_cast_fp16 = add(x = x_107_cast_fp16, y = var_2458_cast_fp16)[name = string("x_109_cast_fp16")];
+            tensor<int32, [8]> input_153_pad_0 = const()[name = string("input_153_pad_0"), val = tensor<int32, [8]>([0, 0, 0, 0, 0, 0, 54, 0])];
+            string input_153_mode_0 = const()[name = string("input_153_mode_0"), val = string("constant")];
+            fp16 const_95_to_fp16 = const()[name = string("const_95_to_fp16"), val = fp16(0x0p+0)];
+            tensor<fp16, [1, 768, 1, 206]> input_153_cast_fp16 = pad(constant_val = const_95_to_fp16, mode = input_153_mode_0, pad = input_153_pad_0, x = x_109_cast_fp16)[name = string("input_153_cast_fp16")];
+            string x_111_pad_type_0 = const()[name = string("x_111_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> x_111_dilations_0 = const()[name = string("x_111_dilations_0"), val = tensor<int32, [2]>([1, 9])];
+            tensor<int32, [2]> x_111_strides_0 = const()[name = string("x_111_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> x_111_pad_0 = const()[name = string("x_111_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            int32 x_111_groups_0 = const()[name = string("x_111_groups_0"), val = int32(1)];
+            tensor<fp16, [768, 768, 1, 7]> decoder_decoder_1_block_4_conv1_conv_weight_to_fp16 = const()[name = string("decoder_decoder_1_block_4_conv1_conv_weight_to_fp16"), val = tensor<fp16, [768, 768, 1, 7]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(202063488)))];
+            tensor<fp16, [768]> decoder_decoder_1_block_4_conv1_conv_bias_to_fp16 = const()[name = string("decoder_decoder_1_block_4_conv1_conv_bias_to_fp16"), val = tensor<fp16, [768]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(210321088)))];
+            tensor<fp16, [1, 768, 1, 152]> x_111_cast_fp16 = conv(bias = decoder_decoder_1_block_4_conv1_conv_bias_to_fp16, dilations = x_111_dilations_0, groups = x_111_groups_0, pad = x_111_pad_0, pad_type = x_111_pad_type_0, strides = x_111_strides_0, weight = decoder_decoder_1_block_4_conv1_conv_weight_to_fp16, x = input_153_cast_fp16)[name = string("x_111_cast_fp16")];
+            tensor<fp16, [1, 768, 1, 1]> alpha_13_to_fp16 = const()[name = string("alpha_13_to_fp16"), val = tensor<fp16, [1, 768, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(210322688)))];
+            tensor<fp16, [1, 768, 1, 152]> var_2478_cast_fp16 = mul(x = x_111_cast_fp16, y = alpha_13_to_fp16)[name = string("op_2478_cast_fp16")];
+            tensor<fp16, [1, 768, 1, 152]> sin_val_13_cast_fp16 = sin(x = var_2478_cast_fp16)[name = string("sin_val_13_cast_fp16")];
+            tensor<fp16, [1, 768, 1, 152]> var_2485_cast_fp16 = mul(x = sin_val_13_cast_fp16, y = sin_val_13_cast_fp16)[name = string("op_2485_cast_fp16")];
+            tensor<fp16, [1, 768, 1, 1]> var_2482_to_fp16 = const()[name = string("op_2482_to_fp16"), val = tensor<fp16, [1, 768, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(210324288)))];
+            tensor<fp16, [1, 768, 1, 152]> var_2486_cast_fp16 = mul(x = var_2482_to_fp16, y = var_2485_cast_fp16)[name = string("op_2486_cast_fp16")];
+            tensor<fp16, [1, 768, 1, 152]> x_113_cast_fp16 = add(x = x_111_cast_fp16, y = var_2486_cast_fp16)[name = string("x_113_cast_fp16")];
+            string x_115_pad_type_0 = const()[name = string("x_115_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> x_115_strides_0 = const()[name = string("x_115_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> x_115_pad_0 = const()[name = string("x_115_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> x_115_dilations_0 = const()[name = string("x_115_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 x_115_groups_0 = const()[name = string("x_115_groups_0"), val = int32(1)];
+            tensor<fp16, [768, 768, 1, 1]> decoder_decoder_1_block_4_conv2_conv_weight_to_fp16 = const()[name = string("decoder_decoder_1_block_4_conv2_conv_weight_to_fp16"), val = tensor<fp16, [768, 768, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(210325888)))];
+            tensor<fp16, [768]> decoder_decoder_1_block_4_conv2_conv_bias_to_fp16 = const()[name = string("decoder_decoder_1_block_4_conv2_conv_bias_to_fp16"), val = tensor<fp16, [768]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(211505600)))];
+            tensor<fp16, [1, 768, 1, 152]> x_115_cast_fp16 = conv(bias = decoder_decoder_1_block_4_conv2_conv_bias_to_fp16, dilations = x_115_dilations_0, groups = x_115_groups_0, pad = x_115_pad_0, pad_type = x_115_pad_type_0, strides = x_115_strides_0, weight = decoder_decoder_1_block_4_conv2_conv_weight_to_fp16, x = x_113_cast_fp16)[name = string("x_115_cast_fp16")];
+            tensor<fp16, [1, 768, 1, 152]> x_117_cast_fp16 = add(x = x_115_cast_fp16, y = x_107_cast_fp16)[name = string("x_117_cast_fp16")];
+            tensor<fp16, [1, 768, 1, 1]> alpha_15_to_fp16 = const()[name = string("alpha_15_to_fp16"), val = tensor<fp16, [1, 768, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(211507200)))];
+            tensor<fp16, [1, 768, 1, 152]> var_2517_cast_fp16 = mul(x = x_117_cast_fp16, y = alpha_15_to_fp16)[name = string("op_2517_cast_fp16")];
+            tensor<fp16, [1, 768, 1, 152]> sin_val_15_cast_fp16 = sin(x = var_2517_cast_fp16)[name = string("sin_val_15_cast_fp16")];
+            tensor<fp16, [1, 768, 1, 152]> var_2524_cast_fp16 = mul(x = sin_val_15_cast_fp16, y = sin_val_15_cast_fp16)[name = string("op_2524_cast_fp16")];
+            tensor<fp16, [1, 768, 1, 1]> var_2521_to_fp16 = const()[name = string("op_2521_to_fp16"), val = tensor<fp16, [1, 768, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(211508800)))];
+            tensor<fp16, [1, 768, 1, 152]> var_2525_cast_fp16 = mul(x = var_2521_to_fp16, y = var_2524_cast_fp16)[name = string("op_2525_cast_fp16")];
+            tensor<fp16, [1, 768, 1, 152]> x_119_cast_fp16 = add(x = x_117_cast_fp16, y = var_2525_cast_fp16)[name = string("x_119_cast_fp16")];
+            string x_121_pad_type_0 = const()[name = string("x_121_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> x_121_strides_0 = const()[name = string("x_121_strides_0"), val = tensor<int32, [2]>([1, 5])];
+            tensor<int32, [4]> x_121_pad_0 = const()[name = string("x_121_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> x_121_dilations_0 = const()[name = string("x_121_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 x_121_groups_0 = const()[name = string("x_121_groups_0"), val = int32(1)];
+            tensor<int32, [4]> x_121_has_output_shape_output_shape_0 = const()[name = string("x_121_has_output_shape_output_shape_0"), val = tensor<int32, [4]>([1, 384, 1, 765])];
+            tensor<fp16, [768, 384, 1, 10]> decoder_decoder_2_block_1_conv_weight_to_fp16 = const()[name = string("decoder_decoder_2_block_1_conv_weight_to_fp16"), val = tensor<fp16, [768, 384, 1, 10]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(211510400)))];
+            tensor<fp16, [384]> decoder_decoder_2_block_1_conv_bias_to_fp16 = const()[name = string("decoder_decoder_2_block_1_conv_bias_to_fp16"), val = tensor<fp16, [384]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(217408704)))];
+            tensor<fp16, [1, 384, 1, 765]> x_121_has_output_shape_cast_fp16 = conv_transpose(bias = decoder_decoder_2_block_1_conv_bias_to_fp16, dilations = x_121_dilations_0, groups = x_121_groups_0, output_shape = x_121_has_output_shape_output_shape_0, pad = x_121_pad_0, pad_type = x_121_pad_type_0, strides = x_121_strides_0, weight = decoder_decoder_2_block_1_conv_weight_to_fp16, x = x_119_cast_fp16)[name = string("x_121_has_output_shape_cast_fp16")];
+            tensor<int32, [4]> x_123_begin_0 = const()[name = string("x_123_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 5])];
+            tensor<int32, [4]> x_123_end_0 = const()[name = string("x_123_end_0"), val = tensor<int32, [4]>([1, 384, 1, 760])];
+            tensor<bool, [4]> x_123_end_mask_0 = const()[name = string("x_123_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 384, 1, 755]> x_123_cast_fp16 = slice_by_index(begin = x_123_begin_0, end = x_123_end_0, end_mask = x_123_end_mask_0, x = x_121_has_output_shape_cast_fp16)[name = string("x_123_cast_fp16")];
+            tensor<fp16, [1, 384, 1, 1]> alpha_17_to_fp16 = const()[name = string("alpha_17_to_fp16"), val = tensor<fp16, [1, 384, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(217409536)))];
+            tensor<fp16, [1, 384, 1, 755]> var_2557_cast_fp16 = mul(x = x_123_cast_fp16, y = alpha_17_to_fp16)[name = string("op_2557_cast_fp16")];
+            tensor<fp16, [1, 384, 1, 755]> sin_val_17_cast_fp16 = sin(x = var_2557_cast_fp16)[name = string("sin_val_17_cast_fp16")];
+            tensor<fp16, [1, 384, 1, 755]> var_2564_cast_fp16 = mul(x = sin_val_17_cast_fp16, y = sin_val_17_cast_fp16)[name = string("op_2564_cast_fp16")];
+            tensor<fp16, [1, 384, 1, 1]> var_2561_to_fp16 = const()[name = string("op_2561_to_fp16"), val = tensor<fp16, [1, 384, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(217410368)))];
+            tensor<fp16, [1, 384, 1, 755]> var_2565_cast_fp16 = mul(x = var_2561_to_fp16, y = var_2564_cast_fp16)[name = string("op_2565_cast_fp16")];
+            tensor<fp16, [1, 384, 1, 755]> x_125_cast_fp16 = add(x = x_123_cast_fp16, y = var_2565_cast_fp16)[name = string("x_125_cast_fp16")];
+            tensor<int32, [8]> input_157_pad_0 = const()[name = string("input_157_pad_0"), val = tensor<int32, [8]>([0, 0, 0, 0, 0, 0, 6, 0])];
+            string input_157_mode_0 = const()[name = string("input_157_mode_0"), val = string("constant")];
+            fp16 const_98_to_fp16 = const()[name = string("const_98_to_fp16"), val = fp16(0x0p+0)];
+            tensor<fp16, [1, 384, 1, 761]> input_157_cast_fp16 = pad(constant_val = const_98_to_fp16, mode = input_157_mode_0, pad = input_157_pad_0, x = x_125_cast_fp16)[name = string("input_157_cast_fp16")];
+            string x_127_pad_type_0 = const()[name = string("x_127_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> x_127_strides_0 = const()[name = string("x_127_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> x_127_pad_0 = const()[name = string("x_127_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> x_127_dilations_0 = const()[name = string("x_127_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 x_127_groups_0 = const()[name = string("x_127_groups_0"), val = int32(1)];
+            tensor<fp16, [384, 384, 1, 7]> decoder_decoder_2_block_2_conv1_conv_weight_to_fp16 = const()[name = string("decoder_decoder_2_block_2_conv1_conv_weight_to_fp16"), val = tensor<fp16, [384, 384, 1, 7]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(217411200)))];
+            tensor<fp16, [384]> decoder_decoder_2_block_2_conv1_conv_bias_to_fp16 = const()[name = string("decoder_decoder_2_block_2_conv1_conv_bias_to_fp16"), val = tensor<fp16, [384]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(219475648)))];
+            tensor<fp16, [1, 384, 1, 755]> x_127_cast_fp16 = conv(bias = decoder_decoder_2_block_2_conv1_conv_bias_to_fp16, dilations = x_127_dilations_0, groups = x_127_groups_0, pad = x_127_pad_0, pad_type = x_127_pad_type_0, strides = x_127_strides_0, weight = decoder_decoder_2_block_2_conv1_conv_weight_to_fp16, x = input_157_cast_fp16)[name = string("x_127_cast_fp16")];
+            tensor<fp16, [1, 384, 1, 1]> alpha_19_to_fp16 = const()[name = string("alpha_19_to_fp16"), val = tensor<fp16, [1, 384, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(219476480)))];
+            tensor<fp16, [1, 384, 1, 755]> var_2585_cast_fp16 = mul(x = x_127_cast_fp16, y = alpha_19_to_fp16)[name = string("op_2585_cast_fp16")];
+            tensor<fp16, [1, 384, 1, 755]> sin_val_19_cast_fp16 = sin(x = var_2585_cast_fp16)[name = string("sin_val_19_cast_fp16")];
+            tensor<fp16, [1, 384, 1, 755]> var_2592_cast_fp16 = mul(x = sin_val_19_cast_fp16, y = sin_val_19_cast_fp16)[name = string("op_2592_cast_fp16")];
+            tensor<fp16, [1, 384, 1, 1]> var_2589_to_fp16 = const()[name = string("op_2589_to_fp16"), val = tensor<fp16, [1, 384, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(219477312)))];
+            tensor<fp16, [1, 384, 1, 755]> var_2593_cast_fp16 = mul(x = var_2589_to_fp16, y = var_2592_cast_fp16)[name = string("op_2593_cast_fp16")];
+            tensor<fp16, [1, 384, 1, 755]> x_129_cast_fp16 = add(x = x_127_cast_fp16, y = var_2593_cast_fp16)[name = string("x_129_cast_fp16")];
+            string x_131_pad_type_0 = const()[name = string("x_131_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> x_131_strides_0 = const()[name = string("x_131_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> x_131_pad_0 = const()[name = string("x_131_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> x_131_dilations_0 = const()[name = string("x_131_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 x_131_groups_0 = const()[name = string("x_131_groups_0"), val = int32(1)];
+            tensor<fp16, [384, 384, 1, 1]> decoder_decoder_2_block_2_conv2_conv_weight_to_fp16 = const()[name = string("decoder_decoder_2_block_2_conv2_conv_weight_to_fp16"), val = tensor<fp16, [384, 384, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(219478144)))];
+            tensor<fp16, [384]> decoder_decoder_2_block_2_conv2_conv_bias_to_fp16 = const()[name = string("decoder_decoder_2_block_2_conv2_conv_bias_to_fp16"), val = tensor<fp16, [384]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(219773120)))];
+            tensor<fp16, [1, 384, 1, 755]> x_131_cast_fp16 = conv(bias = decoder_decoder_2_block_2_conv2_conv_bias_to_fp16, dilations = x_131_dilations_0, groups = x_131_groups_0, pad = x_131_pad_0, pad_type = x_131_pad_type_0, strides = x_131_strides_0, weight = decoder_decoder_2_block_2_conv2_conv_weight_to_fp16, x = x_129_cast_fp16)[name = string("x_131_cast_fp16")];
+            tensor<fp16, [1, 384, 1, 755]> x_133_cast_fp16 = add(x = x_131_cast_fp16, y = x_123_cast_fp16)[name = string("x_133_cast_fp16")];
+            tensor<fp16, [1, 384, 1, 1]> alpha_21_to_fp16 = const()[name = string("alpha_21_to_fp16"), val = tensor<fp16, [1, 384, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(219773952)))];
+            tensor<fp16, [1, 384, 1, 755]> var_2618_cast_fp16 = mul(x = x_133_cast_fp16, y = alpha_21_to_fp16)[name = string("op_2618_cast_fp16")];
+            tensor<fp16, [1, 384, 1, 755]> sin_val_21_cast_fp16 = sin(x = var_2618_cast_fp16)[name = string("sin_val_21_cast_fp16")];
+            tensor<fp16, [1, 384, 1, 755]> var_2625_cast_fp16 = mul(x = sin_val_21_cast_fp16, y = sin_val_21_cast_fp16)[name = string("op_2625_cast_fp16")];
+            tensor<fp16, [1, 384, 1, 1]> var_2622_to_fp16 = const()[name = string("op_2622_to_fp16"), val = tensor<fp16, [1, 384, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(219774784)))];
+            tensor<fp16, [1, 384, 1, 755]> var_2626_cast_fp16 = mul(x = var_2622_to_fp16, y = var_2625_cast_fp16)[name = string("op_2626_cast_fp16")];
+            tensor<fp16, [1, 384, 1, 755]> x_135_cast_fp16 = add(x = x_133_cast_fp16, y = var_2626_cast_fp16)[name = string("x_135_cast_fp16")];
+            tensor<int32, [8]> input_161_pad_0 = const()[name = string("input_161_pad_0"), val = tensor<int32, [8]>([0, 0, 0, 0, 0, 0, 18, 0])];
+            string input_161_mode_0 = const()[name = string("input_161_mode_0"), val = string("constant")];
+            fp16 const_100_to_fp16 = const()[name = string("const_100_to_fp16"), val = fp16(0x0p+0)];
+            tensor<fp16, [1, 384, 1, 773]> input_161_cast_fp16 = pad(constant_val = const_100_to_fp16, mode = input_161_mode_0, pad = input_161_pad_0, x = x_135_cast_fp16)[name = string("input_161_cast_fp16")];
+            string x_137_pad_type_0 = const()[name = string("x_137_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> x_137_dilations_0 = const()[name = string("x_137_dilations_0"), val = tensor<int32, [2]>([1, 3])];
+            tensor<int32, [2]> x_137_strides_0 = const()[name = string("x_137_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> x_137_pad_0 = const()[name = string("x_137_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            int32 x_137_groups_0 = const()[name = string("x_137_groups_0"), val = int32(1)];
+            tensor<fp16, [384, 384, 1, 7]> decoder_decoder_2_block_3_conv1_conv_weight_to_fp16 = const()[name = string("decoder_decoder_2_block_3_conv1_conv_weight_to_fp16"), val = tensor<fp16, [384, 384, 1, 7]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(219775616)))];
+            tensor<fp16, [384]> decoder_decoder_2_block_3_conv1_conv_bias_to_fp16 = const()[name = string("decoder_decoder_2_block_3_conv1_conv_bias_to_fp16"), val = tensor<fp16, [384]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(221840064)))];
+            tensor<fp16, [1, 384, 1, 755]> x_137_cast_fp16 = conv(bias = decoder_decoder_2_block_3_conv1_conv_bias_to_fp16, dilations = x_137_dilations_0, groups = x_137_groups_0, pad = x_137_pad_0, pad_type = x_137_pad_type_0, strides = x_137_strides_0, weight = decoder_decoder_2_block_3_conv1_conv_weight_to_fp16, x = input_161_cast_fp16)[name = string("x_137_cast_fp16")];
+            tensor<fp16, [1, 384, 1, 1]> alpha_23_to_fp16 = const()[name = string("alpha_23_to_fp16"), val = tensor<fp16, [1, 384, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(221840896)))];
+            tensor<fp16, [1, 384, 1, 755]> var_2646_cast_fp16 = mul(x = x_137_cast_fp16, y = alpha_23_to_fp16)[name = string("op_2646_cast_fp16")];
+            tensor<fp16, [1, 384, 1, 755]> sin_val_23_cast_fp16 = sin(x = var_2646_cast_fp16)[name = string("sin_val_23_cast_fp16")];
+            tensor<fp16, [1, 384, 1, 755]> var_2653_cast_fp16 = mul(x = sin_val_23_cast_fp16, y = sin_val_23_cast_fp16)[name = string("op_2653_cast_fp16")];
+            tensor<fp16, [1, 384, 1, 1]> var_2650_to_fp16 = const()[name = string("op_2650_to_fp16"), val = tensor<fp16, [1, 384, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(221841728)))];
+            tensor<fp16, [1, 384, 1, 755]> var_2654_cast_fp16 = mul(x = var_2650_to_fp16, y = var_2653_cast_fp16)[name = string("op_2654_cast_fp16")];
+            tensor<fp16, [1, 384, 1, 755]> x_139_cast_fp16 = add(x = x_137_cast_fp16, y = var_2654_cast_fp16)[name = string("x_139_cast_fp16")];
+            string x_141_pad_type_0 = const()[name = string("x_141_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> x_141_strides_0 = const()[name = string("x_141_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> x_141_pad_0 = const()[name = string("x_141_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> x_141_dilations_0 = const()[name = string("x_141_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 x_141_groups_0 = const()[name = string("x_141_groups_0"), val = int32(1)];
+            tensor<fp16, [384, 384, 1, 1]> decoder_decoder_2_block_3_conv2_conv_weight_to_fp16 = const()[name = string("decoder_decoder_2_block_3_conv2_conv_weight_to_fp16"), val = tensor<fp16, [384, 384, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(221842560)))];
+            tensor<fp16, [384]> decoder_decoder_2_block_3_conv2_conv_bias_to_fp16 = const()[name = string("decoder_decoder_2_block_3_conv2_conv_bias_to_fp16"), val = tensor<fp16, [384]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(222137536)))];
+            tensor<fp16, [1, 384, 1, 755]> x_141_cast_fp16 = conv(bias = decoder_decoder_2_block_3_conv2_conv_bias_to_fp16, dilations = x_141_dilations_0, groups = x_141_groups_0, pad = x_141_pad_0, pad_type = x_141_pad_type_0, strides = x_141_strides_0, weight = decoder_decoder_2_block_3_conv2_conv_weight_to_fp16, x = x_139_cast_fp16)[name = string("x_141_cast_fp16")];
+            tensor<fp16, [1, 384, 1, 755]> x_143_cast_fp16 = add(x = x_141_cast_fp16, y = x_133_cast_fp16)[name = string("x_143_cast_fp16")];
+            tensor<fp16, [1, 384, 1, 1]> alpha_25_to_fp16 = const()[name = string("alpha_25_to_fp16"), val = tensor<fp16, [1, 384, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(222138368)))];
+            tensor<fp16, [1, 384, 1, 755]> var_2679_cast_fp16 = mul(x = x_143_cast_fp16, y = alpha_25_to_fp16)[name = string("op_2679_cast_fp16")];
+            tensor<fp16, [1, 384, 1, 755]> sin_val_25_cast_fp16 = sin(x = var_2679_cast_fp16)[name = string("sin_val_25_cast_fp16")];
+            tensor<fp16, [1, 384, 1, 755]> var_2686_cast_fp16 = mul(x = sin_val_25_cast_fp16, y = sin_val_25_cast_fp16)[name = string("op_2686_cast_fp16")];
+            tensor<fp16, [1, 384, 1, 1]> var_2683_to_fp16 = const()[name = string("op_2683_to_fp16"), val = tensor<fp16, [1, 384, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(222139200)))];
+            tensor<fp16, [1, 384, 1, 755]> var_2687_cast_fp16 = mul(x = var_2683_to_fp16, y = var_2686_cast_fp16)[name = string("op_2687_cast_fp16")];
+            tensor<fp16, [1, 384, 1, 755]> x_145_cast_fp16 = add(x = x_143_cast_fp16, y = var_2687_cast_fp16)[name = string("x_145_cast_fp16")];
+            tensor<int32, [8]> input_165_pad_0 = const()[name = string("input_165_pad_0"), val = tensor<int32, [8]>([0, 0, 0, 0, 0, 0, 54, 0])];
+            string input_165_mode_0 = const()[name = string("input_165_mode_0"), val = string("constant")];
+            fp16 const_102_to_fp16 = const()[name = string("const_102_to_fp16"), val = fp16(0x0p+0)];
+            tensor<fp16, [1, 384, 1, 809]> input_165_cast_fp16 = pad(constant_val = const_102_to_fp16, mode = input_165_mode_0, pad = input_165_pad_0, x = x_145_cast_fp16)[name = string("input_165_cast_fp16")];
+            string x_147_pad_type_0 = const()[name = string("x_147_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> x_147_dilations_0 = const()[name = string("x_147_dilations_0"), val = tensor<int32, [2]>([1, 9])];
+            tensor<int32, [2]> x_147_strides_0 = const()[name = string("x_147_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> x_147_pad_0 = const()[name = string("x_147_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            int32 x_147_groups_0 = const()[name = string("x_147_groups_0"), val = int32(1)];
+            tensor<fp16, [384, 384, 1, 7]> decoder_decoder_2_block_4_conv1_conv_weight_to_fp16 = const()[name = string("decoder_decoder_2_block_4_conv1_conv_weight_to_fp16"), val = tensor<fp16, [384, 384, 1, 7]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(222140032)))];
+            tensor<fp16, [384]> decoder_decoder_2_block_4_conv1_conv_bias_to_fp16 = const()[name = string("decoder_decoder_2_block_4_conv1_conv_bias_to_fp16"), val = tensor<fp16, [384]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(224204480)))];
+            tensor<fp16, [1, 384, 1, 755]> x_147_cast_fp16 = conv(bias = decoder_decoder_2_block_4_conv1_conv_bias_to_fp16, dilations = x_147_dilations_0, groups = x_147_groups_0, pad = x_147_pad_0, pad_type = x_147_pad_type_0, strides = x_147_strides_0, weight = decoder_decoder_2_block_4_conv1_conv_weight_to_fp16, x = input_165_cast_fp16)[name = string("x_147_cast_fp16")];
+            tensor<fp16, [1, 384, 1, 1]> alpha_27_to_fp16 = const()[name = string("alpha_27_to_fp16"), val = tensor<fp16, [1, 384, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(224205312)))];
+            tensor<fp16, [1, 384, 1, 755]> var_2707_cast_fp16 = mul(x = x_147_cast_fp16, y = alpha_27_to_fp16)[name = string("op_2707_cast_fp16")];
+            tensor<fp16, [1, 384, 1, 755]> sin_val_27_cast_fp16 = sin(x = var_2707_cast_fp16)[name = string("sin_val_27_cast_fp16")];
+            tensor<fp16, [1, 384, 1, 755]> var_2714_cast_fp16 = mul(x = sin_val_27_cast_fp16, y = sin_val_27_cast_fp16)[name = string("op_2714_cast_fp16")];
+            tensor<fp16, [1, 384, 1, 1]> var_2711_to_fp16 = const()[name = string("op_2711_to_fp16"), val = tensor<fp16, [1, 384, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(224206144)))];
+            tensor<fp16, [1, 384, 1, 755]> var_2715_cast_fp16 = mul(x = var_2711_to_fp16, y = var_2714_cast_fp16)[name = string("op_2715_cast_fp16")];
+            tensor<fp16, [1, 384, 1, 755]> x_149_cast_fp16 = add(x = x_147_cast_fp16, y = var_2715_cast_fp16)[name = string("x_149_cast_fp16")];
+            string x_151_pad_type_0 = const()[name = string("x_151_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> x_151_strides_0 = const()[name = string("x_151_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> x_151_pad_0 = const()[name = string("x_151_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> x_151_dilations_0 = const()[name = string("x_151_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 x_151_groups_0 = const()[name = string("x_151_groups_0"), val = int32(1)];
+            tensor<fp16, [384, 384, 1, 1]> decoder_decoder_2_block_4_conv2_conv_weight_to_fp16 = const()[name = string("decoder_decoder_2_block_4_conv2_conv_weight_to_fp16"), val = tensor<fp16, [384, 384, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(224206976)))];
+            tensor<fp16, [384]> decoder_decoder_2_block_4_conv2_conv_bias_to_fp16 = const()[name = string("decoder_decoder_2_block_4_conv2_conv_bias_to_fp16"), val = tensor<fp16, [384]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(224501952)))];
+            tensor<fp16, [1, 384, 1, 755]> x_151_cast_fp16 = conv(bias = decoder_decoder_2_block_4_conv2_conv_bias_to_fp16, dilations = x_151_dilations_0, groups = x_151_groups_0, pad = x_151_pad_0, pad_type = x_151_pad_type_0, strides = x_151_strides_0, weight = decoder_decoder_2_block_4_conv2_conv_weight_to_fp16, x = x_149_cast_fp16)[name = string("x_151_cast_fp16")];
+            tensor<fp16, [1, 384, 1, 755]> x_153_cast_fp16 = add(x = x_151_cast_fp16, y = x_143_cast_fp16)[name = string("x_153_cast_fp16")];
+            tensor<fp16, [1, 384, 1, 1]> alpha_29_to_fp16 = const()[name = string("alpha_29_to_fp16"), val = tensor<fp16, [1, 384, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(224502784)))];
+            tensor<fp16, [1, 384, 1, 755]> var_2746_cast_fp16 = mul(x = x_153_cast_fp16, y = alpha_29_to_fp16)[name = string("op_2746_cast_fp16")];
+            tensor<fp16, [1, 384, 1, 755]> sin_val_29_cast_fp16 = sin(x = var_2746_cast_fp16)[name = string("sin_val_29_cast_fp16")];
+            tensor<fp16, [1, 384, 1, 755]> var_2753_cast_fp16 = mul(x = sin_val_29_cast_fp16, y = sin_val_29_cast_fp16)[name = string("op_2753_cast_fp16")];
+            tensor<fp16, [1, 384, 1, 1]> var_2750_to_fp16 = const()[name = string("op_2750_to_fp16"), val = tensor<fp16, [1, 384, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(224503616)))];
+            tensor<fp16, [1, 384, 1, 755]> var_2754_cast_fp16 = mul(x = var_2750_to_fp16, y = var_2753_cast_fp16)[name = string("op_2754_cast_fp16")];
+            tensor<fp16, [1, 384, 1, 755]> x_155_cast_fp16 = add(x = x_153_cast_fp16, y = var_2754_cast_fp16)[name = string("x_155_cast_fp16")];
+            string x_157_pad_type_0 = const()[name = string("x_157_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> x_157_strides_0 = const()[name = string("x_157_strides_0"), val = tensor<int32, [2]>([1, 4])];
+            tensor<int32, [4]> x_157_pad_0 = const()[name = string("x_157_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> x_157_dilations_0 = const()[name = string("x_157_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 x_157_groups_0 = const()[name = string("x_157_groups_0"), val = int32(1)];
+            tensor<int32, [4]> x_157_has_output_shape_output_shape_0 = const()[name = string("x_157_has_output_shape_output_shape_0"), val = tensor<int32, [4]>([1, 192, 1, 3024])];
+            tensor<fp16, [384, 192, 1, 8]> decoder_decoder_3_block_1_conv_weight_to_fp16 = const()[name = string("decoder_decoder_3_block_1_conv_weight_to_fp16"), val = tensor<fp16, [384, 192, 1, 8]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(224504448)))];
+            tensor<fp16, [192]> decoder_decoder_3_block_1_conv_bias_to_fp16 = const()[name = string("decoder_decoder_3_block_1_conv_bias_to_fp16"), val = tensor<fp16, [192]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(225684160)))];
+            tensor<fp16, [1, 192, 1, 3024]> x_157_has_output_shape_cast_fp16 = conv_transpose(bias = decoder_decoder_3_block_1_conv_bias_to_fp16, dilations = x_157_dilations_0, groups = x_157_groups_0, output_shape = x_157_has_output_shape_output_shape_0, pad = x_157_pad_0, pad_type = x_157_pad_type_0, strides = x_157_strides_0, weight = decoder_decoder_3_block_1_conv_weight_to_fp16, x = x_155_cast_fp16)[name = string("x_157_has_output_shape_cast_fp16")];
+            tensor<int32, [4]> x_159_begin_0 = const()[name = string("x_159_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 4])];
+            tensor<int32, [4]> x_159_end_0 = const()[name = string("x_159_end_0"), val = tensor<int32, [4]>([1, 192, 1, 3020])];
+            tensor<bool, [4]> x_159_end_mask_0 = const()[name = string("x_159_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 192, 1, 3016]> x_159_cast_fp16 = slice_by_index(begin = x_159_begin_0, end = x_159_end_0, end_mask = x_159_end_mask_0, x = x_157_has_output_shape_cast_fp16)[name = string("x_159_cast_fp16")];
+            tensor<fp16, [1, 192, 1, 1]> alpha_31_to_fp16 = const()[name = string("alpha_31_to_fp16"), val = tensor<fp16, [1, 192, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(225684608)))];
+            tensor<fp16, [1, 192, 1, 3016]> var_2786_cast_fp16 = mul(x = x_159_cast_fp16, y = alpha_31_to_fp16)[name = string("op_2786_cast_fp16")];
+            tensor<fp16, [1, 192, 1, 3016]> sin_val_31_cast_fp16 = sin(x = var_2786_cast_fp16)[name = string("sin_val_31_cast_fp16")];
+            tensor<fp16, [1, 192, 1, 3016]> var_2793_cast_fp16 = mul(x = sin_val_31_cast_fp16, y = sin_val_31_cast_fp16)[name = string("op_2793_cast_fp16")];
+            tensor<fp16, [1, 192, 1, 1]> var_2790_to_fp16 = const()[name = string("op_2790_to_fp16"), val = tensor<fp16, [1, 192, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(225685056)))];
+            tensor<fp16, [1, 192, 1, 3016]> var_2794_cast_fp16 = mul(x = var_2790_to_fp16, y = var_2793_cast_fp16)[name = string("op_2794_cast_fp16")];
+            tensor<fp16, [1, 192, 1, 3016]> x_161_cast_fp16 = add(x = x_159_cast_fp16, y = var_2794_cast_fp16)[name = string("x_161_cast_fp16")];
+            tensor<int32, [8]> input_169_pad_0 = const()[name = string("input_169_pad_0"), val = tensor<int32, [8]>([0, 0, 0, 0, 0, 0, 6, 0])];
+            string input_169_mode_0 = const()[name = string("input_169_mode_0"), val = string("constant")];
+            fp16 const_105_to_fp16 = const()[name = string("const_105_to_fp16"), val = fp16(0x0p+0)];
+            tensor<fp16, [1, 192, 1, 3022]> input_169_cast_fp16 = pad(constant_val = const_105_to_fp16, mode = input_169_mode_0, pad = input_169_pad_0, x = x_161_cast_fp16)[name = string("input_169_cast_fp16")];
+            string x_163_pad_type_0 = const()[name = string("x_163_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> x_163_strides_0 = const()[name = string("x_163_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> x_163_pad_0 = const()[name = string("x_163_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> x_163_dilations_0 = const()[name = string("x_163_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 x_163_groups_0 = const()[name = string("x_163_groups_0"), val = int32(1)];
+            tensor<fp16, [192, 192, 1, 7]> decoder_decoder_3_block_2_conv1_conv_weight_to_fp16 = const()[name = string("decoder_decoder_3_block_2_conv1_conv_weight_to_fp16"), val = tensor<fp16, [192, 192, 1, 7]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(225685504)))];
+            tensor<fp16, [192]> decoder_decoder_3_block_2_conv1_conv_bias_to_fp16 = const()[name = string("decoder_decoder_3_block_2_conv1_conv_bias_to_fp16"), val = tensor<fp16, [192]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(226201664)))];
+            tensor<fp16, [1, 192, 1, 3016]> x_163_cast_fp16 = conv(bias = decoder_decoder_3_block_2_conv1_conv_bias_to_fp16, dilations = x_163_dilations_0, groups = x_163_groups_0, pad = x_163_pad_0, pad_type = x_163_pad_type_0, strides = x_163_strides_0, weight = decoder_decoder_3_block_2_conv1_conv_weight_to_fp16, x = input_169_cast_fp16)[name = string("x_163_cast_fp16")];
+            tensor<fp16, [1, 192, 1, 1]> alpha_33_to_fp16 = const()[name = string("alpha_33_to_fp16"), val = tensor<fp16, [1, 192, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(226202112)))];
+            tensor<fp16, [1, 192, 1, 3016]> var_2814_cast_fp16 = mul(x = x_163_cast_fp16, y = alpha_33_to_fp16)[name = string("op_2814_cast_fp16")];
+            tensor<fp16, [1, 192, 1, 3016]> sin_val_33_cast_fp16 = sin(x = var_2814_cast_fp16)[name = string("sin_val_33_cast_fp16")];
+            tensor<fp16, [1, 192, 1, 3016]> var_2821_cast_fp16 = mul(x = sin_val_33_cast_fp16, y = sin_val_33_cast_fp16)[name = string("op_2821_cast_fp16")];
+            tensor<fp16, [1, 192, 1, 1]> var_2818_to_fp16 = const()[name = string("op_2818_to_fp16"), val = tensor<fp16, [1, 192, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(226202560)))];
+            tensor<fp16, [1, 192, 1, 3016]> var_2822_cast_fp16 = mul(x = var_2818_to_fp16, y = var_2821_cast_fp16)[name = string("op_2822_cast_fp16")];
+            tensor<fp16, [1, 192, 1, 3016]> x_165_cast_fp16 = add(x = x_163_cast_fp16, y = var_2822_cast_fp16)[name = string("x_165_cast_fp16")];
+            string x_167_pad_type_0 = const()[name = string("x_167_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> x_167_strides_0 = const()[name = string("x_167_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> x_167_pad_0 = const()[name = string("x_167_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> x_167_dilations_0 = const()[name = string("x_167_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 x_167_groups_0 = const()[name = string("x_167_groups_0"), val = int32(1)];
+            tensor<fp16, [192, 192, 1, 1]> decoder_decoder_3_block_2_conv2_conv_weight_to_fp16 = const()[name = string("decoder_decoder_3_block_2_conv2_conv_weight_to_fp16"), val = tensor<fp16, [192, 192, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(226203008)))];
+            tensor<fp16, [192]> decoder_decoder_3_block_2_conv2_conv_bias_to_fp16 = const()[name = string("decoder_decoder_3_block_2_conv2_conv_bias_to_fp16"), val = tensor<fp16, [192]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(226276800)))];
+            tensor<fp16, [1, 192, 1, 3016]> x_167_cast_fp16 = conv(bias = decoder_decoder_3_block_2_conv2_conv_bias_to_fp16, dilations = x_167_dilations_0, groups = x_167_groups_0, pad = x_167_pad_0, pad_type = x_167_pad_type_0, strides = x_167_strides_0, weight = decoder_decoder_3_block_2_conv2_conv_weight_to_fp16, x = x_165_cast_fp16)[name = string("x_167_cast_fp16")];
+            tensor<fp16, [1, 192, 1, 3016]> x_169_cast_fp16 = add(x = x_167_cast_fp16, y = x_159_cast_fp16)[name = string("x_169_cast_fp16")];
+            tensor<fp16, [1, 192, 1, 1]> alpha_35_to_fp16 = const()[name = string("alpha_35_to_fp16"), val = tensor<fp16, [1, 192, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(226277248)))];
+            tensor<fp16, [1, 192, 1, 3016]> var_2847_cast_fp16 = mul(x = x_169_cast_fp16, y = alpha_35_to_fp16)[name = string("op_2847_cast_fp16")];
+            tensor<fp16, [1, 192, 1, 3016]> sin_val_35_cast_fp16 = sin(x = var_2847_cast_fp16)[name = string("sin_val_35_cast_fp16")];
+            tensor<fp16, [1, 192, 1, 3016]> var_2854_cast_fp16 = mul(x = sin_val_35_cast_fp16, y = sin_val_35_cast_fp16)[name = string("op_2854_cast_fp16")];
+            tensor<fp16, [1, 192, 1, 1]> var_2851_to_fp16 = const()[name = string("op_2851_to_fp16"), val = tensor<fp16, [1, 192, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(226277696)))];
+            tensor<fp16, [1, 192, 1, 3016]> var_2855_cast_fp16 = mul(x = var_2851_to_fp16, y = var_2854_cast_fp16)[name = string("op_2855_cast_fp16")];
+            tensor<fp16, [1, 192, 1, 3016]> x_171_cast_fp16 = add(x = x_169_cast_fp16, y = var_2855_cast_fp16)[name = string("x_171_cast_fp16")];
+            tensor<int32, [8]> input_173_pad_0 = const()[name = string("input_173_pad_0"), val = tensor<int32, [8]>([0, 0, 0, 0, 0, 0, 18, 0])];
+            string input_173_mode_0 = const()[name = string("input_173_mode_0"), val = string("constant")];
+            fp16 const_107_to_fp16 = const()[name = string("const_107_to_fp16"), val = fp16(0x0p+0)];
+            tensor<fp16, [1, 192, 1, 3034]> input_173_cast_fp16 = pad(constant_val = const_107_to_fp16, mode = input_173_mode_0, pad = input_173_pad_0, x = x_171_cast_fp16)[name = string("input_173_cast_fp16")];
+            string x_173_pad_type_0 = const()[name = string("x_173_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> x_173_dilations_0 = const()[name = string("x_173_dilations_0"), val = tensor<int32, [2]>([1, 3])];
+            tensor<int32, [2]> x_173_strides_0 = const()[name = string("x_173_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> x_173_pad_0 = const()[name = string("x_173_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            int32 x_173_groups_0 = const()[name = string("x_173_groups_0"), val = int32(1)];
+            tensor<fp16, [192, 192, 1, 7]> decoder_decoder_3_block_3_conv1_conv_weight_to_fp16 = const()[name = string("decoder_decoder_3_block_3_conv1_conv_weight_to_fp16"), val = tensor<fp16, [192, 192, 1, 7]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(226278144)))];
+            tensor<fp16, [192]> decoder_decoder_3_block_3_conv1_conv_bias_to_fp16 = const()[name = string("decoder_decoder_3_block_3_conv1_conv_bias_to_fp16"), val = tensor<fp16, [192]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(226794304)))];
+            tensor<fp16, [1, 192, 1, 3016]> x_173_cast_fp16 = conv(bias = decoder_decoder_3_block_3_conv1_conv_bias_to_fp16, dilations = x_173_dilations_0, groups = x_173_groups_0, pad = x_173_pad_0, pad_type = x_173_pad_type_0, strides = x_173_strides_0, weight = decoder_decoder_3_block_3_conv1_conv_weight_to_fp16, x = input_173_cast_fp16)[name = string("x_173_cast_fp16")];
+            tensor<fp16, [1, 192, 1, 1]> alpha_37_to_fp16 = const()[name = string("alpha_37_to_fp16"), val = tensor<fp16, [1, 192, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(226794752)))];
+            tensor<fp16, [1, 192, 1, 3016]> var_2875_cast_fp16 = mul(x = x_173_cast_fp16, y = alpha_37_to_fp16)[name = string("op_2875_cast_fp16")];
+            tensor<fp16, [1, 192, 1, 3016]> sin_val_37_cast_fp16 = sin(x = var_2875_cast_fp16)[name = string("sin_val_37_cast_fp16")];
+            tensor<fp16, [1, 192, 1, 3016]> var_2882_cast_fp16 = mul(x = sin_val_37_cast_fp16, y = sin_val_37_cast_fp16)[name = string("op_2882_cast_fp16")];
+            tensor<fp16, [1, 192, 1, 1]> var_2879_to_fp16 = const()[name = string("op_2879_to_fp16"), val = tensor<fp16, [1, 192, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(226795200)))];
+            tensor<fp16, [1, 192, 1, 3016]> var_2883_cast_fp16 = mul(x = var_2879_to_fp16, y = var_2882_cast_fp16)[name = string("op_2883_cast_fp16")];
+            tensor<fp16, [1, 192, 1, 3016]> x_175_cast_fp16 = add(x = x_173_cast_fp16, y = var_2883_cast_fp16)[name = string("x_175_cast_fp16")];
+            string x_177_pad_type_0 = const()[name = string("x_177_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> x_177_strides_0 = const()[name = string("x_177_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> x_177_pad_0 = const()[name = string("x_177_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> x_177_dilations_0 = const()[name = string("x_177_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 x_177_groups_0 = const()[name = string("x_177_groups_0"), val = int32(1)];
+            tensor<fp16, [192, 192, 1, 1]> decoder_decoder_3_block_3_conv2_conv_weight_to_fp16 = const()[name = string("decoder_decoder_3_block_3_conv2_conv_weight_to_fp16"), val = tensor<fp16, [192, 192, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(226795648)))];
+            tensor<fp16, [192]> decoder_decoder_3_block_3_conv2_conv_bias_to_fp16 = const()[name = string("decoder_decoder_3_block_3_conv2_conv_bias_to_fp16"), val = tensor<fp16, [192]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(226869440)))];
+            tensor<fp16, [1, 192, 1, 3016]> x_177_cast_fp16 = conv(bias = decoder_decoder_3_block_3_conv2_conv_bias_to_fp16, dilations = x_177_dilations_0, groups = x_177_groups_0, pad = x_177_pad_0, pad_type = x_177_pad_type_0, strides = x_177_strides_0, weight = decoder_decoder_3_block_3_conv2_conv_weight_to_fp16, x = x_175_cast_fp16)[name = string("x_177_cast_fp16")];
+            tensor<fp16, [1, 192, 1, 3016]> x_179_cast_fp16 = add(x = x_177_cast_fp16, y = x_169_cast_fp16)[name = string("x_179_cast_fp16")];
+            tensor<fp16, [1, 192, 1, 1]> alpha_39_to_fp16 = const()[name = string("alpha_39_to_fp16"), val = tensor<fp16, [1, 192, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(226869888)))];
+            tensor<fp16, [1, 192, 1, 3016]> var_2908_cast_fp16 = mul(x = x_179_cast_fp16, y = alpha_39_to_fp16)[name = string("op_2908_cast_fp16")];
+            tensor<fp16, [1, 192, 1, 3016]> sin_val_39_cast_fp16 = sin(x = var_2908_cast_fp16)[name = string("sin_val_39_cast_fp16")];
+            tensor<fp16, [1, 192, 1, 3016]> var_2915_cast_fp16 = mul(x = sin_val_39_cast_fp16, y = sin_val_39_cast_fp16)[name = string("op_2915_cast_fp16")];
+            tensor<fp16, [1, 192, 1, 1]> var_2912_to_fp16 = const()[name = string("op_2912_to_fp16"), val = tensor<fp16, [1, 192, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(226870336)))];
+            tensor<fp16, [1, 192, 1, 3016]> var_2916_cast_fp16 = mul(x = var_2912_to_fp16, y = var_2915_cast_fp16)[name = string("op_2916_cast_fp16")];
+            tensor<fp16, [1, 192, 1, 3016]> x_181_cast_fp16 = add(x = x_179_cast_fp16, y = var_2916_cast_fp16)[name = string("x_181_cast_fp16")];
+            tensor<int32, [8]> input_177_pad_0 = const()[name = string("input_177_pad_0"), val = tensor<int32, [8]>([0, 0, 0, 0, 0, 0, 54, 0])];
+            string input_177_mode_0 = const()[name = string("input_177_mode_0"), val = string("constant")];
+            fp16 const_109_to_fp16 = const()[name = string("const_109_to_fp16"), val = fp16(0x0p+0)];
+            tensor<fp16, [1, 192, 1, 3070]> input_177_cast_fp16 = pad(constant_val = const_109_to_fp16, mode = input_177_mode_0, pad = input_177_pad_0, x = x_181_cast_fp16)[name = string("input_177_cast_fp16")];
+            string x_183_pad_type_0 = const()[name = string("x_183_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> x_183_dilations_0 = const()[name = string("x_183_dilations_0"), val = tensor<int32, [2]>([1, 9])];
+            tensor<int32, [2]> x_183_strides_0 = const()[name = string("x_183_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> x_183_pad_0 = const()[name = string("x_183_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            int32 x_183_groups_0 = const()[name = string("x_183_groups_0"), val = int32(1)];
+            tensor<fp16, [192, 192, 1, 7]> decoder_decoder_3_block_4_conv1_conv_weight_to_fp16 = const()[name = string("decoder_decoder_3_block_4_conv1_conv_weight_to_fp16"), val = tensor<fp16, [192, 192, 1, 7]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(226870784)))];
+            tensor<fp16, [192]> decoder_decoder_3_block_4_conv1_conv_bias_to_fp16 = const()[name = string("decoder_decoder_3_block_4_conv1_conv_bias_to_fp16"), val = tensor<fp16, [192]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(227386944)))];
+            tensor<fp16, [1, 192, 1, 3016]> x_183_cast_fp16 = conv(bias = decoder_decoder_3_block_4_conv1_conv_bias_to_fp16, dilations = x_183_dilations_0, groups = x_183_groups_0, pad = x_183_pad_0, pad_type = x_183_pad_type_0, strides = x_183_strides_0, weight = decoder_decoder_3_block_4_conv1_conv_weight_to_fp16, x = input_177_cast_fp16)[name = string("x_183_cast_fp16")];
+            tensor<fp16, [1, 192, 1, 1]> alpha_41_to_fp16 = const()[name = string("alpha_41_to_fp16"), val = tensor<fp16, [1, 192, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(227387392)))];
+            tensor<fp16, [1, 192, 1, 3016]> var_2936_cast_fp16 = mul(x = x_183_cast_fp16, y = alpha_41_to_fp16)[name = string("op_2936_cast_fp16")];
+            tensor<fp16, [1, 192, 1, 3016]> sin_val_41_cast_fp16 = sin(x = var_2936_cast_fp16)[name = string("sin_val_41_cast_fp16")];
+            tensor<fp16, [1, 192, 1, 3016]> var_2943_cast_fp16 = mul(x = sin_val_41_cast_fp16, y = sin_val_41_cast_fp16)[name = string("op_2943_cast_fp16")];
+            tensor<fp16, [1, 192, 1, 1]> var_2940_to_fp16 = const()[name = string("op_2940_to_fp16"), val = tensor<fp16, [1, 192, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(227387840)))];
+            tensor<fp16, [1, 192, 1, 3016]> var_2944_cast_fp16 = mul(x = var_2940_to_fp16, y = var_2943_cast_fp16)[name = string("op_2944_cast_fp16")];
+            tensor<fp16, [1, 192, 1, 3016]> x_185_cast_fp16 = add(x = x_183_cast_fp16, y = var_2944_cast_fp16)[name = string("x_185_cast_fp16")];
+            string x_187_pad_type_0 = const()[name = string("x_187_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> x_187_strides_0 = const()[name = string("x_187_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> x_187_pad_0 = const()[name = string("x_187_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> x_187_dilations_0 = const()[name = string("x_187_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 x_187_groups_0 = const()[name = string("x_187_groups_0"), val = int32(1)];
+            tensor<fp16, [192, 192, 1, 1]> decoder_decoder_3_block_4_conv2_conv_weight_to_fp16 = const()[name = string("decoder_decoder_3_block_4_conv2_conv_weight_to_fp16"), val = tensor<fp16, [192, 192, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(227388288)))];
+            tensor<fp16, [192]> decoder_decoder_3_block_4_conv2_conv_bias_to_fp16 = const()[name = string("decoder_decoder_3_block_4_conv2_conv_bias_to_fp16"), val = tensor<fp16, [192]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(227462080)))];
+            tensor<fp16, [1, 192, 1, 3016]> x_187_cast_fp16 = conv(bias = decoder_decoder_3_block_4_conv2_conv_bias_to_fp16, dilations = x_187_dilations_0, groups = x_187_groups_0, pad = x_187_pad_0, pad_type = x_187_pad_type_0, strides = x_187_strides_0, weight = decoder_decoder_3_block_4_conv2_conv_weight_to_fp16, x = x_185_cast_fp16)[name = string("x_187_cast_fp16")];
+            tensor<fp16, [1, 192, 1, 3016]> x_189_cast_fp16 = add(x = x_187_cast_fp16, y = x_179_cast_fp16)[name = string("x_189_cast_fp16")];
+            tensor<fp16, [1, 192, 1, 1]> alpha_43_to_fp16 = const()[name = string("alpha_43_to_fp16"), val = tensor<fp16, [1, 192, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(227462528)))];
+            tensor<fp16, [1, 192, 1, 3016]> var_2975_cast_fp16 = mul(x = x_189_cast_fp16, y = alpha_43_to_fp16)[name = string("op_2975_cast_fp16")];
+            tensor<fp16, [1, 192, 1, 3016]> sin_val_43_cast_fp16 = sin(x = var_2975_cast_fp16)[name = string("sin_val_43_cast_fp16")];
+            tensor<fp16, [1, 192, 1, 3016]> var_2982_cast_fp16 = mul(x = sin_val_43_cast_fp16, y = sin_val_43_cast_fp16)[name = string("op_2982_cast_fp16")];
+            tensor<fp16, [1, 192, 1, 1]> var_2979_to_fp16 = const()[name = string("op_2979_to_fp16"), val = tensor<fp16, [1, 192, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(227462976)))];
+            tensor<fp16, [1, 192, 1, 3016]> var_2983_cast_fp16 = mul(x = var_2979_to_fp16, y = var_2982_cast_fp16)[name = string("op_2983_cast_fp16")];
+            tensor<fp16, [1, 192, 1, 3016]> x_191_cast_fp16 = add(x = x_189_cast_fp16, y = var_2983_cast_fp16)[name = string("x_191_cast_fp16")];
+            string x_193_pad_type_0 = const()[name = string("x_193_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> x_193_strides_0 = const()[name = string("x_193_strides_0"), val = tensor<int32, [2]>([1, 3])];
+            tensor<int32, [4]> x_193_pad_0 = const()[name = string("x_193_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> x_193_dilations_0 = const()[name = string("x_193_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 x_193_groups_0 = const()[name = string("x_193_groups_0"), val = int32(1)];
+            tensor<int32, [4]> x_193_has_output_shape_output_shape_0 = const()[name = string("x_193_has_output_shape_output_shape_0"), val = tensor<int32, [4]>([1, 96, 1, 9051])];
+            tensor<fp16, [192, 96, 1, 6]> decoder_decoder_4_block_1_conv_weight_to_fp16 = const()[name = string("decoder_decoder_4_block_1_conv_weight_to_fp16"), val = tensor<fp16, [192, 96, 1, 6]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(227463424)))];
+            tensor<fp16, [96]> decoder_decoder_4_block_1_conv_bias_to_fp16 = const()[name = string("decoder_decoder_4_block_1_conv_bias_to_fp16"), val = tensor<fp16, [96]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(227684672)))];
+            tensor<fp16, [1, 96, 1, 9051]> x_193_has_output_shape_cast_fp16 = conv_transpose(bias = decoder_decoder_4_block_1_conv_bias_to_fp16, dilations = x_193_dilations_0, groups = x_193_groups_0, output_shape = x_193_has_output_shape_output_shape_0, pad = x_193_pad_0, pad_type = x_193_pad_type_0, strides = x_193_strides_0, weight = decoder_decoder_4_block_1_conv_weight_to_fp16, x = x_191_cast_fp16)[name = string("x_193_has_output_shape_cast_fp16")];
+            tensor<int32, [4]> x_195_begin_0 = const()[name = string("x_195_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 3])];
+            tensor<int32, [4]> x_195_end_0 = const()[name = string("x_195_end_0"), val = tensor<int32, [4]>([1, 96, 1, 9048])];
+            tensor<bool, [4]> x_195_end_mask_0 = const()[name = string("x_195_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 96, 1, 9045]> x_195_cast_fp16 = slice_by_index(begin = x_195_begin_0, end = x_195_end_0, end_mask = x_195_end_mask_0, x = x_193_has_output_shape_cast_fp16)[name = string("x_195_cast_fp16")];
+            tensor<fp16, [1, 96, 1, 1]> alpha_45_to_fp16 = const()[name = string("alpha_45_to_fp16"), val = tensor<fp16, [1, 96, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(227684928)))];
+            tensor<fp16, [1, 96, 1, 9045]> var_3015_cast_fp16 = mul(x = x_195_cast_fp16, y = alpha_45_to_fp16)[name = string("op_3015_cast_fp16")];
+            tensor<fp16, [1, 96, 1, 9045]> sin_val_45_cast_fp16 = sin(x = var_3015_cast_fp16)[name = string("sin_val_45_cast_fp16")];
+            tensor<fp16, [1, 96, 1, 9045]> var_3022_cast_fp16 = mul(x = sin_val_45_cast_fp16, y = sin_val_45_cast_fp16)[name = string("op_3022_cast_fp16")];
+            tensor<fp16, [1, 96, 1, 1]> var_3019_to_fp16 = const()[name = string("op_3019_to_fp16"), val = tensor<fp16, [1, 96, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(227685184)))];
+            tensor<fp16, [1, 96, 1, 9045]> var_3023_cast_fp16 = mul(x = var_3019_to_fp16, y = var_3022_cast_fp16)[name = string("op_3023_cast_fp16")];
+            tensor<fp16, [1, 96, 1, 9045]> x_197_cast_fp16 = add(x = x_195_cast_fp16, y = var_3023_cast_fp16)[name = string("x_197_cast_fp16")];
+            tensor<int32, [8]> input_181_pad_0 = const()[name = string("input_181_pad_0"), val = tensor<int32, [8]>([0, 0, 0, 0, 0, 0, 6, 0])];
+            string input_181_mode_0 = const()[name = string("input_181_mode_0"), val = string("constant")];
+            fp16 const_112_to_fp16 = const()[name = string("const_112_to_fp16"), val = fp16(0x0p+0)];
+            tensor<fp16, [1, 96, 1, 9051]> input_181_cast_fp16 = pad(constant_val = const_112_to_fp16, mode = input_181_mode_0, pad = input_181_pad_0, x = x_197_cast_fp16)[name = string("input_181_cast_fp16")];
+            string x_199_pad_type_0 = const()[name = string("x_199_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> x_199_strides_0 = const()[name = string("x_199_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> x_199_pad_0 = const()[name = string("x_199_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> x_199_dilations_0 = const()[name = string("x_199_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 x_199_groups_0 = const()[name = string("x_199_groups_0"), val = int32(1)];
+            tensor<fp16, [96, 96, 1, 7]> decoder_decoder_4_block_2_conv1_conv_weight_to_fp16 = const()[name = string("decoder_decoder_4_block_2_conv1_conv_weight_to_fp16"), val = tensor<fp16, [96, 96, 1, 7]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(227685440)))];
+            tensor<fp16, [96]> decoder_decoder_4_block_2_conv1_conv_bias_to_fp16 = const()[name = string("decoder_decoder_4_block_2_conv1_conv_bias_to_fp16"), val = tensor<fp16, [96]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(227814528)))];
+            tensor<fp16, [1, 96, 1, 9045]> x_199_cast_fp16 = conv(bias = decoder_decoder_4_block_2_conv1_conv_bias_to_fp16, dilations = x_199_dilations_0, groups = x_199_groups_0, pad = x_199_pad_0, pad_type = x_199_pad_type_0, strides = x_199_strides_0, weight = decoder_decoder_4_block_2_conv1_conv_weight_to_fp16, x = input_181_cast_fp16)[name = string("x_199_cast_fp16")];
+            tensor<fp16, [1, 96, 1, 1]> alpha_47_to_fp16 = const()[name = string("alpha_47_to_fp16"), val = tensor<fp16, [1, 96, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(227814784)))];
+            tensor<fp16, [1, 96, 1, 9045]> var_3043_cast_fp16 = mul(x = x_199_cast_fp16, y = alpha_47_to_fp16)[name = string("op_3043_cast_fp16")];
+            tensor<fp16, [1, 96, 1, 9045]> sin_val_47_cast_fp16 = sin(x = var_3043_cast_fp16)[name = string("sin_val_47_cast_fp16")];
+            tensor<fp16, [1, 96, 1, 9045]> var_3050_cast_fp16 = mul(x = sin_val_47_cast_fp16, y = sin_val_47_cast_fp16)[name = string("op_3050_cast_fp16")];
+            tensor<fp16, [1, 96, 1, 1]> var_3047_to_fp16 = const()[name = string("op_3047_to_fp16"), val = tensor<fp16, [1, 96, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(227815040)))];
+            tensor<fp16, [1, 96, 1, 9045]> var_3051_cast_fp16 = mul(x = var_3047_to_fp16, y = var_3050_cast_fp16)[name = string("op_3051_cast_fp16")];
+            tensor<fp16, [1, 96, 1, 9045]> x_201_cast_fp16 = add(x = x_199_cast_fp16, y = var_3051_cast_fp16)[name = string("x_201_cast_fp16")];
+            string x_203_pad_type_0 = const()[name = string("x_203_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> x_203_strides_0 = const()[name = string("x_203_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> x_203_pad_0 = const()[name = string("x_203_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> x_203_dilations_0 = const()[name = string("x_203_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 x_203_groups_0 = const()[name = string("x_203_groups_0"), val = int32(1)];
+            tensor<fp16, [96, 96, 1, 1]> decoder_decoder_4_block_2_conv2_conv_weight_to_fp16 = const()[name = string("decoder_decoder_4_block_2_conv2_conv_weight_to_fp16"), val = tensor<fp16, [96, 96, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(227815296)))];
+            tensor<fp16, [96]> decoder_decoder_4_block_2_conv2_conv_bias_to_fp16 = const()[name = string("decoder_decoder_4_block_2_conv2_conv_bias_to_fp16"), val = tensor<fp16, [96]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(227833792)))];
+            tensor<fp16, [1, 96, 1, 9045]> x_203_cast_fp16 = conv(bias = decoder_decoder_4_block_2_conv2_conv_bias_to_fp16, dilations = x_203_dilations_0, groups = x_203_groups_0, pad = x_203_pad_0, pad_type = x_203_pad_type_0, strides = x_203_strides_0, weight = decoder_decoder_4_block_2_conv2_conv_weight_to_fp16, x = x_201_cast_fp16)[name = string("x_203_cast_fp16")];
+            tensor<fp16, [1, 96, 1, 9045]> x_205_cast_fp16 = add(x = x_203_cast_fp16, y = x_195_cast_fp16)[name = string("x_205_cast_fp16")];
+            tensor<fp16, [1, 96, 1, 1]> alpha_49_to_fp16 = const()[name = string("alpha_49_to_fp16"), val = tensor<fp16, [1, 96, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(227834048)))];
+            tensor<fp16, [1, 96, 1, 9045]> var_3076_cast_fp16 = mul(x = x_205_cast_fp16, y = alpha_49_to_fp16)[name = string("op_3076_cast_fp16")];
+            tensor<fp16, [1, 96, 1, 9045]> sin_val_49_cast_fp16 = sin(x = var_3076_cast_fp16)[name = string("sin_val_49_cast_fp16")];
+            tensor<fp16, [1, 96, 1, 9045]> var_3083_cast_fp16 = mul(x = sin_val_49_cast_fp16, y = sin_val_49_cast_fp16)[name = string("op_3083_cast_fp16")];
+            tensor<fp16, [1, 96, 1, 1]> var_3080_to_fp16 = const()[name = string("op_3080_to_fp16"), val = tensor<fp16, [1, 96, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(227834304)))];
+            tensor<fp16, [1, 96, 1, 9045]> var_3084_cast_fp16 = mul(x = var_3080_to_fp16, y = var_3083_cast_fp16)[name = string("op_3084_cast_fp16")];
+            tensor<fp16, [1, 96, 1, 9045]> x_207_cast_fp16 = add(x = x_205_cast_fp16, y = var_3084_cast_fp16)[name = string("x_207_cast_fp16")];
+            tensor<int32, [8]> input_185_pad_0 = const()[name = string("input_185_pad_0"), val = tensor<int32, [8]>([0, 0, 0, 0, 0, 0, 18, 0])];
+            string input_185_mode_0 = const()[name = string("input_185_mode_0"), val = string("constant")];
+            fp16 const_114_to_fp16 = const()[name = string("const_114_to_fp16"), val = fp16(0x0p+0)];
+            tensor<fp16, [1, 96, 1, 9063]> input_185_cast_fp16 = pad(constant_val = const_114_to_fp16, mode = input_185_mode_0, pad = input_185_pad_0, x = x_207_cast_fp16)[name = string("input_185_cast_fp16")];
+            string x_209_pad_type_0 = const()[name = string("x_209_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> x_209_dilations_0 = const()[name = string("x_209_dilations_0"), val = tensor<int32, [2]>([1, 3])];
+            tensor<int32, [2]> x_209_strides_0 = const()[name = string("x_209_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> x_209_pad_0 = const()[name = string("x_209_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            int32 x_209_groups_0 = const()[name = string("x_209_groups_0"), val = int32(1)];
+            tensor<fp16, [96, 96, 1, 7]> decoder_decoder_4_block_3_conv1_conv_weight_to_fp16 = const()[name = string("decoder_decoder_4_block_3_conv1_conv_weight_to_fp16"), val = tensor<fp16, [96, 96, 1, 7]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(227834560)))];
+            tensor<fp16, [96]> decoder_decoder_4_block_3_conv1_conv_bias_to_fp16 = const()[name = string("decoder_decoder_4_block_3_conv1_conv_bias_to_fp16"), val = tensor<fp16, [96]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(227963648)))];
+            tensor<fp16, [1, 96, 1, 9045]> x_209_cast_fp16 = conv(bias = decoder_decoder_4_block_3_conv1_conv_bias_to_fp16, dilations = x_209_dilations_0, groups = x_209_groups_0, pad = x_209_pad_0, pad_type = x_209_pad_type_0, strides = x_209_strides_0, weight = decoder_decoder_4_block_3_conv1_conv_weight_to_fp16, x = input_185_cast_fp16)[name = string("x_209_cast_fp16")];
+            tensor<fp16, [1, 96, 1, 1]> alpha_51_to_fp16 = const()[name = string("alpha_51_to_fp16"), val = tensor<fp16, [1, 96, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(227963904)))];
+            tensor<fp16, [1, 96, 1, 9045]> var_3104_cast_fp16 = mul(x = x_209_cast_fp16, y = alpha_51_to_fp16)[name = string("op_3104_cast_fp16")];
+            tensor<fp16, [1, 96, 1, 9045]> sin_val_51_cast_fp16 = sin(x = var_3104_cast_fp16)[name = string("sin_val_51_cast_fp16")];
+            tensor<fp16, [1, 96, 1, 9045]> var_3111_cast_fp16 = mul(x = sin_val_51_cast_fp16, y = sin_val_51_cast_fp16)[name = string("op_3111_cast_fp16")];
+            tensor<fp16, [1, 96, 1, 1]> var_3108_to_fp16 = const()[name = string("op_3108_to_fp16"), val = tensor<fp16, [1, 96, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(227964160)))];
+            tensor<fp16, [1, 96, 1, 9045]> var_3112_cast_fp16 = mul(x = var_3108_to_fp16, y = var_3111_cast_fp16)[name = string("op_3112_cast_fp16")];
+            tensor<fp16, [1, 96, 1, 9045]> x_211_cast_fp16 = add(x = x_209_cast_fp16, y = var_3112_cast_fp16)[name = string("x_211_cast_fp16")];
+            string x_213_pad_type_0 = const()[name = string("x_213_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> x_213_strides_0 = const()[name = string("x_213_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> x_213_pad_0 = const()[name = string("x_213_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> x_213_dilations_0 = const()[name = string("x_213_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 x_213_groups_0 = const()[name = string("x_213_groups_0"), val = int32(1)];
+            tensor<fp16, [96, 96, 1, 1]> decoder_decoder_4_block_3_conv2_conv_weight_to_fp16 = const()[name = string("decoder_decoder_4_block_3_conv2_conv_weight_to_fp16"), val = tensor<fp16, [96, 96, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(227964416)))];
+            tensor<fp16, [96]> decoder_decoder_4_block_3_conv2_conv_bias_to_fp16 = const()[name = string("decoder_decoder_4_block_3_conv2_conv_bias_to_fp16"), val = tensor<fp16, [96]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(227982912)))];
+            tensor<fp16, [1, 96, 1, 9045]> x_213_cast_fp16 = conv(bias = decoder_decoder_4_block_3_conv2_conv_bias_to_fp16, dilations = x_213_dilations_0, groups = x_213_groups_0, pad = x_213_pad_0, pad_type = x_213_pad_type_0, strides = x_213_strides_0, weight = decoder_decoder_4_block_3_conv2_conv_weight_to_fp16, x = x_211_cast_fp16)[name = string("x_213_cast_fp16")];
+            tensor<fp16, [1, 96, 1, 9045]> x_215_cast_fp16 = add(x = x_213_cast_fp16, y = x_205_cast_fp16)[name = string("x_215_cast_fp16")];
+            tensor<fp16, [1, 96, 1, 1]> alpha_53_to_fp16 = const()[name = string("alpha_53_to_fp16"), val = tensor<fp16, [1, 96, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(227983168)))];
+            tensor<fp16, [1, 96, 1, 9045]> var_3137_cast_fp16 = mul(x = x_215_cast_fp16, y = alpha_53_to_fp16)[name = string("op_3137_cast_fp16")];
+            tensor<fp16, [1, 96, 1, 9045]> sin_val_53_cast_fp16 = sin(x = var_3137_cast_fp16)[name = string("sin_val_53_cast_fp16")];
+            tensor<fp16, [1, 96, 1, 9045]> var_3144_cast_fp16 = mul(x = sin_val_53_cast_fp16, y = sin_val_53_cast_fp16)[name = string("op_3144_cast_fp16")];
+            tensor<fp16, [1, 96, 1, 1]> var_3141_to_fp16 = const()[name = string("op_3141_to_fp16"), val = tensor<fp16, [1, 96, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(227983424)))];
+            tensor<fp16, [1, 96, 1, 9045]> var_3145_cast_fp16 = mul(x = var_3141_to_fp16, y = var_3144_cast_fp16)[name = string("op_3145_cast_fp16")];
+            tensor<fp16, [1, 96, 1, 9045]> x_217_cast_fp16 = add(x = x_215_cast_fp16, y = var_3145_cast_fp16)[name = string("x_217_cast_fp16")];
+            tensor<int32, [8]> input_189_pad_0 = const()[name = string("input_189_pad_0"), val = tensor<int32, [8]>([0, 0, 0, 0, 0, 0, 54, 0])];
+            string input_189_mode_0 = const()[name = string("input_189_mode_0"), val = string("constant")];
+            fp16 const_116_to_fp16 = const()[name = string("const_116_to_fp16"), val = fp16(0x0p+0)];
+            tensor<fp16, [1, 96, 1, 9099]> input_189_cast_fp16 = pad(constant_val = const_116_to_fp16, mode = input_189_mode_0, pad = input_189_pad_0, x = x_217_cast_fp16)[name = string("input_189_cast_fp16")];
+            string x_219_pad_type_0 = const()[name = string("x_219_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> x_219_dilations_0 = const()[name = string("x_219_dilations_0"), val = tensor<int32, [2]>([1, 9])];
+            tensor<int32, [2]> x_219_strides_0 = const()[name = string("x_219_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> x_219_pad_0 = const()[name = string("x_219_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            int32 x_219_groups_0 = const()[name = string("x_219_groups_0"), val = int32(1)];
+            tensor<fp16, [96, 96, 1, 7]> decoder_decoder_4_block_4_conv1_conv_weight_to_fp16 = const()[name = string("decoder_decoder_4_block_4_conv1_conv_weight_to_fp16"), val = tensor<fp16, [96, 96, 1, 7]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(227983680)))];
+            tensor<fp16, [96]> decoder_decoder_4_block_4_conv1_conv_bias_to_fp16 = const()[name = string("decoder_decoder_4_block_4_conv1_conv_bias_to_fp16"), val = tensor<fp16, [96]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(228112768)))];
+            tensor<fp16, [1, 96, 1, 9045]> x_219_cast_fp16 = conv(bias = decoder_decoder_4_block_4_conv1_conv_bias_to_fp16, dilations = x_219_dilations_0, groups = x_219_groups_0, pad = x_219_pad_0, pad_type = x_219_pad_type_0, strides = x_219_strides_0, weight = decoder_decoder_4_block_4_conv1_conv_weight_to_fp16, x = input_189_cast_fp16)[name = string("x_219_cast_fp16")];
+            tensor<fp16, [1, 96, 1, 1]> alpha_55_to_fp16 = const()[name = string("alpha_55_to_fp16"), val = tensor<fp16, [1, 96, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(228113024)))];
+            tensor<fp16, [1, 96, 1, 9045]> var_3165_cast_fp16 = mul(x = x_219_cast_fp16, y = alpha_55_to_fp16)[name = string("op_3165_cast_fp16")];
+            tensor<fp16, [1, 96, 1, 9045]> sin_val_55_cast_fp16 = sin(x = var_3165_cast_fp16)[name = string("sin_val_55_cast_fp16")];
+            tensor<fp16, [1, 96, 1, 9045]> var_3172_cast_fp16 = mul(x = sin_val_55_cast_fp16, y = sin_val_55_cast_fp16)[name = string("op_3172_cast_fp16")];
+            tensor<fp16, [1, 96, 1, 1]> var_3169_to_fp16 = const()[name = string("op_3169_to_fp16"), val = tensor<fp16, [1, 96, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(228113280)))];
+            tensor<fp16, [1, 96, 1, 9045]> var_3173_cast_fp16 = mul(x = var_3169_to_fp16, y = var_3172_cast_fp16)[name = string("op_3173_cast_fp16")];
+            tensor<fp16, [1, 96, 1, 9045]> x_221_cast_fp16 = add(x = x_219_cast_fp16, y = var_3173_cast_fp16)[name = string("x_221_cast_fp16")];
+            string x_223_pad_type_0 = const()[name = string("x_223_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> x_223_strides_0 = const()[name = string("x_223_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> x_223_pad_0 = const()[name = string("x_223_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> x_223_dilations_0 = const()[name = string("x_223_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 x_223_groups_0 = const()[name = string("x_223_groups_0"), val = int32(1)];
+            tensor<fp16, [96, 96, 1, 1]> decoder_decoder_4_block_4_conv2_conv_weight_to_fp16 = const()[name = string("decoder_decoder_4_block_4_conv2_conv_weight_to_fp16"), val = tensor<fp16, [96, 96, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(228113536)))];
+            tensor<fp16, [96]> decoder_decoder_4_block_4_conv2_conv_bias_to_fp16 = const()[name = string("decoder_decoder_4_block_4_conv2_conv_bias_to_fp16"), val = tensor<fp16, [96]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(228132032)))];
+            tensor<fp16, [1, 96, 1, 9045]> x_223_cast_fp16 = conv(bias = decoder_decoder_4_block_4_conv2_conv_bias_to_fp16, dilations = x_223_dilations_0, groups = x_223_groups_0, pad = x_223_pad_0, pad_type = x_223_pad_type_0, strides = x_223_strides_0, weight = decoder_decoder_4_block_4_conv2_conv_weight_to_fp16, x = x_221_cast_fp16)[name = string("x_223_cast_fp16")];
+            tensor<fp16, [1, 96, 1, 9045]> x_225_cast_fp16 = add(x = x_223_cast_fp16, y = x_215_cast_fp16)[name = string("x_225_cast_fp16")];
+            tensor<fp16, [1, 96, 1, 1]> alpha_57_to_fp16 = const()[name = string("alpha_57_to_fp16"), val = tensor<fp16, [1, 96, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(228132288)))];
+            tensor<fp16, [1, 96, 1, 9045]> var_3194_cast_fp16 = mul(x = x_225_cast_fp16, y = alpha_57_to_fp16)[name = string("op_3194_cast_fp16")];
+            tensor<fp16, [1, 96, 1, 9045]> sin_val_cast_fp16 = sin(x = var_3194_cast_fp16)[name = string("sin_val_cast_fp16")];
+            tensor<fp16, [1, 96, 1, 9045]> var_3201_cast_fp16 = mul(x = sin_val_cast_fp16, y = sin_val_cast_fp16)[name = string("op_3201_cast_fp16")];
+            tensor<fp16, [1, 96, 1, 1]> var_3198_to_fp16 = const()[name = string("op_3198_to_fp16"), val = tensor<fp16, [1, 96, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(228132544)))];
+            tensor<fp16, [1, 96, 1, 9045]> var_3202_cast_fp16 = mul(x = var_3198_to_fp16, y = var_3201_cast_fp16)[name = string("op_3202_cast_fp16")];
+            tensor<fp16, [1, 96, 1, 9045]> x_cast_fp16 = add(x = x_225_cast_fp16, y = var_3202_cast_fp16)[name = string("x_cast_fp16")];
+            tensor<int32, [8]> input_pad_0 = const()[name = string("input_pad_0"), val = tensor<int32, [8]>([0, 0, 0, 0, 0, 0, 6, 0])];
+            string input_mode_0 = const()[name = string("input_mode_0"), val = string("constant")];
+            fp16 const_118_to_fp16 = const()[name = string("const_118_to_fp16"), val = fp16(0x0p+0)];
+            tensor<fp16, [1, 96, 1, 9051]> input_cast_fp16 = pad(constant_val = const_118_to_fp16, mode = input_mode_0, pad = input_pad_0, x = x_cast_fp16)[name = string("input_cast_fp16")];
+            string h_1_pad_type_0 = const()[name = string("h_1_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> h_1_strides_0 = const()[name = string("h_1_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> h_1_pad_0 = const()[name = string("h_1_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> h_1_dilations_0 = const()[name = string("h_1_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 h_1_groups_0 = const()[name = string("h_1_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 96, 1, 7]> decoder_decoder_6_conv_weight_to_fp16 = const()[name = string("decoder_decoder_6_conv_weight_to_fp16"), val = tensor<fp16, [1, 96, 1, 7]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(228132800)))];
+            tensor<fp16, [1]> decoder_decoder_6_conv_bias_to_fp16 = const()[name = string("decoder_decoder_6_conv_bias_to_fp16"), val = tensor<fp16, [1]>([-0x1.1p-19])];
+            tensor<fp16, [1, 1, 1, 9045]> h_1_cast_fp16 = conv(bias = decoder_decoder_6_conv_bias_to_fp16, dilations = h_1_dilations_0, groups = h_1_groups_0, pad = h_1_pad_0, pad_type = h_1_pad_type_0, strides = h_1_strides_0, weight = decoder_decoder_6_conv_weight_to_fp16, x = input_cast_fp16)[name = string("h_1_cast_fp16")];
+            fp16 var_28_promoted_to_fp16 = const()[name = string("op_28_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            fp16 var_32_promoted_16_to_fp16 = const()[name = string("op_32_promoted_16_to_fp16"), val = fp16(0x1p+0)];
+            tensor<fp16, [1, 1, 1, 9045]> clip_16_cast_fp16 = clip(alpha = var_28_promoted_to_fp16, beta = var_32_promoted_16_to_fp16, x = h_1_cast_fp16)[name = string("clip_16_cast_fp16")];
+            tensor<int32, [4]> var_3215_begin_0 = const()[name = string("op_3215_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 7125])];
+            tensor<int32, [4]> var_3215_end_0 = const()[name = string("op_3215_end_0"), val = tensor<int32, [4]>([1, 1, 1, 9045])];
+            tensor<bool, [4]> var_3215_end_mask_0 = const()[name = string("op_3215_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 1, 1, 1920]> audio = slice_by_index(begin = var_3215_begin_0, end = var_3215_end_0, end_mask = var_3215_end_mask_0, x = clip_16_cast_fp16)[name = string("op_3215_cast_fp16")];
+        } -> (audio, key_cache_updates, value_cache_updates, hidden_context_update);
+}
\ No newline at end of file
diff --git a/qwen3_tts/speech_decoder/12hz-1.7b-customvoice/W8A16/SpeechDecoder.mlmodelc/weights/weight.bin b/qwen3_tts/speech_decoder/12hz-1.7b-customvoice/W8A16/SpeechDecoder.mlmodelc/weights/weight.bin
new file mode 100644
index 0000000000000000000000000000000000000000..991e8396e3bb75413ace07a30f923ff839676662
--- /dev/null
+++ b/qwen3_tts/speech_decoder/12hz-1.7b-customvoice/W8A16/SpeechDecoder.mlmodelc/weights/weight.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c90dc149272173de6cdae46826e1ef4265c53b8477b1de19c02843d4e38729e7
+size 228134208
diff --git a/qwen3_tts/text_projector/12hz-0.6b-customvoice/W8A16/TextProjector.mlmodelc/analytics/coremldata.bin b/qwen3_tts/text_projector/12hz-0.6b-customvoice/W8A16/TextProjector.mlmodelc/analytics/coremldata.bin
new file mode 100644
index 0000000000000000000000000000000000000000..006838f655fe86feaa6d24915a6771fffcc03a46
--- /dev/null
+++ b/qwen3_tts/text_projector/12hz-0.6b-customvoice/W8A16/TextProjector.mlmodelc/analytics/coremldata.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:20101f4ff618a0dfc2490dbd518b23f7581f197b758339d5fdbf9b0e0cf6172a
+size 243
diff --git a/qwen3_tts/text_projector/12hz-0.6b-customvoice/W8A16/TextProjector.mlmodelc/coremldata.bin b/qwen3_tts/text_projector/12hz-0.6b-customvoice/W8A16/TextProjector.mlmodelc/coremldata.bin
new file mode 100644
index 0000000000000000000000000000000000000000..1d547f8bba3f6633f73c448e5f7ee4fe95901869
--- /dev/null
+++ b/qwen3_tts/text_projector/12hz-0.6b-customvoice/W8A16/TextProjector.mlmodelc/coremldata.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d761ddb34a6cd7b72c38d9528a0daa2ecfb5f91f302d221e68e5c727278b1b29
+size 380
diff --git a/qwen3_tts/text_projector/12hz-0.6b-customvoice/W8A16/TextProjector.mlmodelc/metadata.json b/qwen3_tts/text_projector/12hz-0.6b-customvoice/W8A16/TextProjector.mlmodelc/metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..6d28fd417cddf1511224f14cf40a50e14b526ea4
--- /dev/null
+++ b/qwen3_tts/text_projector/12hz-0.6b-customvoice/W8A16/TextProjector.mlmodelc/metadata.json
@@ -0,0 +1,68 @@
+[
+  {
+    "metadataOutputVersion" : "3.0",
+    "storagePrecision" : "Mixed (Float16, Palettized (8 bits), UInt8)",
+    "outputSchema" : [
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 1024 × 1 × 1)",
+        "shortDescription" : "",
+        "shape" : "[1, 1024, 1, 1]",
+        "name" : "input_embeds",
+        "type" : "MultiArray"
+      }
+    ],
+    "modelParameters" : [
+
+    ],
+    "specificationVersion" : 9,
+    "mlProgramOperationTypeHistogram" : {
+      "Select" : 2,
+      "Ios18.conv" : 2,
+      "Ios18.gather" : 1,
+      "Ios18.silu" : 1,
+      "Ios18.constexprLutToDense" : 3,
+      "Ios18.expandDims" : 2,
+      "Ios18.greaterEqual" : 2,
+      "Ios18.add" : 2
+    },
+    "computePrecision" : "Mixed (Float16, Int32)",
+    "isUpdatable" : "0",
+    "stateSchema" : [
+
+    ],
+    "availability" : {
+      "macOS" : "15.0",
+      "tvOS" : "18.0",
+      "visionOS" : "2.0",
+      "watchOS" : "11.0",
+      "iOS" : "18.0",
+      "macCatalyst" : "18.0"
+    },
+    "modelType" : {
+      "name" : "MLModelType_mlProgram"
+    },
+    "userDefinedMetadata" : {
+      "com.github.apple.coremltools.conversion_date" : "2026-02-17",
+      "com.github.apple.coremltools.source" : "torch==2.8.0",
+      "com.github.apple.coremltools.version" : "9.0",
+      "com.github.apple.coremltools.source_dialect" : "TorchScript"
+    },
+    "inputSchema" : [
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Int32",
+        "formattedType" : "MultiArray (Int32 1)",
+        "shortDescription" : "",
+        "shape" : "[1]",
+        "name" : "input_ids",
+        "type" : "MultiArray"
+      }
+    ],
+    "generatedClassName" : "TextProjector_8_bit",
+    "method" : "predict"
+  }
+]
\ No newline at end of file
diff --git a/qwen3_tts/text_projector/12hz-0.6b-customvoice/W8A16/TextProjector.mlmodelc/model.mil b/qwen3_tts/text_projector/12hz-0.6b-customvoice/W8A16/TextProjector.mlmodelc/model.mil
new file mode 100644
index 0000000000000000000000000000000000000000..4d2efe2c93c903f20de66dd6f2e5d29575a54d6b
--- /dev/null
+++ b/qwen3_tts/text_projector/12hz-0.6b-customvoice/W8A16/TextProjector.mlmodelc/model.mil
@@ -0,0 +1,42 @@
+program(1.3)
+[buildInfo = dict<string, string>({{"coremlc-component-MIL", "3510.2.1"}, {"coremlc-version", "3500.32.1"}})]
+{
+    func main<ios18>(tensor<int32, [1]> input_ids) {
+            int32 embeddings_batch_dims_0 = const()[name = string("embeddings_batch_dims_0"), val = int32(0)];
+            bool embeddings_validate_indices_0 = const()[name = string("embeddings_validate_indices_0"), val = bool(false)];
+            tensor<fp16, [151936, 2048]> text_embedding_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [151936, 2048]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(64))), lut = tensor<fp16, [1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(311165056))))[name = string("text_embedding_weight_to_fp16_palettized")];
+            int32 greater_equal_0_y_0 = const()[name = string("greater_equal_0_y_0"), val = int32(0)];
+            tensor<bool, [1]> greater_equal_0 = greater_equal(x = input_ids, y = greater_equal_0_y_0)[name = string("greater_equal_0")];
+            int32 slice_by_index_0 = const()[name = string("slice_by_index_0"), val = int32(151936)];
+            tensor<int32, [1]> add_0 = add(x = input_ids, y = slice_by_index_0)[name = string("add_0")];
+            tensor<int32, [1]> select_0 = select(a = input_ids, b = add_0, cond = greater_equal_0)[name = string("select_0")];
+            int32 greater_equal_0_y_0_1 = const()[name = string("greater_equal_0_y_0_1"), val = int32(0)];
+            tensor<bool, [1]> greater_equal_0_1 = greater_equal(x = select_0, y = greater_equal_0_y_0_1)[name = string("greater_equal_0_1")];
+            int32 slice_by_index_0_1 = const()[name = string("slice_by_index_0_1"), val = int32(151936)];
+            tensor<int32, [1]> add_0_1 = add(x = select_0, y = slice_by_index_0_1)[name = string("add_0_1")];
+            tensor<int32, [1]> select_0_1 = select(a = select_0, b = add_0_1, cond = greater_equal_0_1)[name = string("select_0_1")];
+            int32 embeddings_cast_fp16_axis_0 = const()[name = string("embeddings_cast_fp16_axis_0"), val = int32(0)];
+            tensor<fp16, [1, 2048]> embeddings_cast_fp16 = gather(axis = embeddings_cast_fp16_axis_0, batch_dims = embeddings_batch_dims_0, indices = select_0_1, validate_indices = embeddings_validate_indices_0, x = text_embedding_weight_to_fp16_palettized)[name = string("embeddings_cast_fp16")];
+            tensor<int32, [1]> var_19_axes_0 = const()[name = string("op_19_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2048, 1]> var_19_cast_fp16 = expand_dims(axes = var_19_axes_0, x = embeddings_cast_fp16)[name = string("op_19_cast_fp16")];
+            tensor<int32, [1]> input_1_axes_0 = const()[name = string("input_1_axes_0"), val = tensor<int32, [1]>([3])];
+            tensor<fp16, [1, 2048, 1, 1]> input_1_cast_fp16 = expand_dims(axes = input_1_axes_0, x = var_19_cast_fp16)[name = string("input_1_cast_fp16")];
+            string input_3_pad_type_0 = const()[name = string("input_3_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> input_3_strides_0 = const()[name = string("input_3_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> input_3_pad_0 = const()[name = string("input_3_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> input_3_dilations_0 = const()[name = string("input_3_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 input_3_groups_0 = const()[name = string("input_3_groups_0"), val = int32(1)];
+            tensor<fp16, [2048, 2048, 1, 1]> text_projection_linear_fc1_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [2048, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(311165632))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(315360000))))[name = string("text_projection_linear_fc1_weight_to_fp16_palettized")];
+            tensor<fp16, [2048]> text_projection_linear_fc1_bias_to_fp16 = const()[name = string("text_projection_linear_fc1_bias_to_fp16"), val = tensor<fp16, [2048]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(315360576)))];
+            tensor<fp16, [1, 2048, 1, 1]> input_3_cast_fp16 = conv(bias = text_projection_linear_fc1_bias_to_fp16, dilations = input_3_dilations_0, groups = input_3_groups_0, pad = input_3_pad_0, pad_type = input_3_pad_type_0, strides = input_3_strides_0, weight = text_projection_linear_fc1_weight_to_fp16_palettized, x = input_1_cast_fp16)[name = string("input_3_cast_fp16")];
+            tensor<fp16, [1, 2048, 1, 1]> input_cast_fp16 = silu(x = input_3_cast_fp16)[name = string("input_cast_fp16")];
+            string var_42_pad_type_0 = const()[name = string("op_42_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_42_strides_0 = const()[name = string("op_42_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_42_pad_0 = const()[name = string("op_42_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_42_dilations_0 = const()[name = string("op_42_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_42_groups_0 = const()[name = string("op_42_groups_0"), val = int32(1)];
+            tensor<fp16, [1024, 2048, 1, 1]> text_projection_linear_fc2_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [1024, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(315364736))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(317461952))))[name = string("text_projection_linear_fc2_weight_to_fp16_palettized")];
+            tensor<fp16, [1024]> text_projection_linear_fc2_bias_to_fp16 = const()[name = string("text_projection_linear_fc2_bias_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(317462528)))];
+            tensor<fp16, [1, 1024, 1, 1]> input_embeds = conv(bias = text_projection_linear_fc2_bias_to_fp16, dilations = var_42_dilations_0, groups = var_42_groups_0, pad = var_42_pad_0, pad_type = var_42_pad_type_0, strides = var_42_strides_0, weight = text_projection_linear_fc2_weight_to_fp16_palettized, x = input_cast_fp16)[name = string("op_42_cast_fp16")];
+        } -> (input_embeds);
+}
\ No newline at end of file
diff --git a/qwen3_tts/text_projector/12hz-0.6b-customvoice/W8A16/TextProjector.mlmodelc/weights/weight.bin b/qwen3_tts/text_projector/12hz-0.6b-customvoice/W8A16/TextProjector.mlmodelc/weights/weight.bin
new file mode 100644
index 0000000000000000000000000000000000000000..f706935ce2197434012e917d04ae402ff8a624bc
--- /dev/null
+++ b/qwen3_tts/text_projector/12hz-0.6b-customvoice/W8A16/TextProjector.mlmodelc/weights/weight.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d8185d1c91fd4ac7c782acccac91869298251539e45221da908bf3cc504a8360
+size 317464640
diff --git a/qwen3_tts/text_projector/12hz-1.7b-customvoice/W8A16/TextProjector.mlmodelc/analytics/coremldata.bin b/qwen3_tts/text_projector/12hz-1.7b-customvoice/W8A16/TextProjector.mlmodelc/analytics/coremldata.bin
new file mode 100644
index 0000000000000000000000000000000000000000..3214e13ee8288c70a4ee9fb072d35b423e171f73
--- /dev/null
+++ b/qwen3_tts/text_projector/12hz-1.7b-customvoice/W8A16/TextProjector.mlmodelc/analytics/coremldata.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:805ecdabccc61590d7f244e8a833422ab508c1ef539187703cab074b37241598
+size 243
diff --git a/qwen3_tts/text_projector/12hz-1.7b-customvoice/W8A16/TextProjector.mlmodelc/coremldata.bin b/qwen3_tts/text_projector/12hz-1.7b-customvoice/W8A16/TextProjector.mlmodelc/coremldata.bin
new file mode 100644
index 0000000000000000000000000000000000000000..e6117e23c4c0ef1206e3a843de5a40868507d143
--- /dev/null
+++ b/qwen3_tts/text_projector/12hz-1.7b-customvoice/W8A16/TextProjector.mlmodelc/coremldata.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0b282c723b4842c34722e86c20dfcec29e906d846741fcdbb0168ea0abe06413
+size 380
diff --git a/qwen3_tts/text_projector/12hz-1.7b-customvoice/W8A16/TextProjector.mlmodelc/metadata.json b/qwen3_tts/text_projector/12hz-1.7b-customvoice/W8A16/TextProjector.mlmodelc/metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..dda858001613557a4de89fefd1270d352dc6c436
--- /dev/null
+++ b/qwen3_tts/text_projector/12hz-1.7b-customvoice/W8A16/TextProjector.mlmodelc/metadata.json
@@ -0,0 +1,68 @@
+[
+  {
+    "metadataOutputVersion" : "3.0",
+    "storagePrecision" : "Mixed (Float16, Palettized (8 bits), UInt8)",
+    "outputSchema" : [
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 2048 × 1 × 1)",
+        "shortDescription" : "",
+        "shape" : "[1, 2048, 1, 1]",
+        "name" : "input_embeds",
+        "type" : "MultiArray"
+      }
+    ],
+    "modelParameters" : [
+
+    ],
+    "specificationVersion" : 9,
+    "mlProgramOperationTypeHistogram" : {
+      "Select" : 2,
+      "Ios18.conv" : 2,
+      "Ios18.gather" : 1,
+      "Ios18.silu" : 1,
+      "Ios18.constexprLutToDense" : 3,
+      "Ios18.expandDims" : 2,
+      "Ios18.greaterEqual" : 2,
+      "Ios18.add" : 2
+    },
+    "computePrecision" : "Mixed (Float16, Int32)",
+    "isUpdatable" : "0",
+    "stateSchema" : [
+
+    ],
+    "availability" : {
+      "macOS" : "15.0",
+      "tvOS" : "18.0",
+      "visionOS" : "2.0",
+      "watchOS" : "11.0",
+      "iOS" : "18.0",
+      "macCatalyst" : "18.0"
+    },
+    "modelType" : {
+      "name" : "MLModelType_mlProgram"
+    },
+    "userDefinedMetadata" : {
+      "com.github.apple.coremltools.conversion_date" : "2026-02-17",
+      "com.github.apple.coremltools.source" : "torch==2.8.0",
+      "com.github.apple.coremltools.version" : "9.0",
+      "com.github.apple.coremltools.source_dialect" : "TorchScript"
+    },
+    "inputSchema" : [
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Int32",
+        "formattedType" : "MultiArray (Int32 1)",
+        "shortDescription" : "",
+        "shape" : "[1]",
+        "name" : "input_ids",
+        "type" : "MultiArray"
+      }
+    ],
+    "generatedClassName" : "TextProjector_8_bit",
+    "method" : "predict"
+  }
+]
\ No newline at end of file
diff --git a/qwen3_tts/text_projector/12hz-1.7b-customvoice/W8A16/TextProjector.mlmodelc/model.mil b/qwen3_tts/text_projector/12hz-1.7b-customvoice/W8A16/TextProjector.mlmodelc/model.mil
new file mode 100644
index 0000000000000000000000000000000000000000..924599a2f3d1179e9220d25838d49a356ad8af46
--- /dev/null
+++ b/qwen3_tts/text_projector/12hz-1.7b-customvoice/W8A16/TextProjector.mlmodelc/model.mil
@@ -0,0 +1,42 @@
+program(1.3)
+[buildInfo = dict<string, string>({{"coremlc-component-MIL", "3510.2.1"}, {"coremlc-version", "3500.32.1"}})]
+{
+    func main<ios18>(tensor<int32, [1]> input_ids) {
+            int32 embeddings_batch_dims_0 = const()[name = string("embeddings_batch_dims_0"), val = int32(0)];
+            bool embeddings_validate_indices_0 = const()[name = string("embeddings_validate_indices_0"), val = bool(false)];
+            tensor<fp16, [151936, 2048]> text_embedding_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [151936, 2048]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(64))), lut = tensor<fp16, [1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(311165056))))[name = string("text_embedding_weight_to_fp16_palettized")];
+            int32 greater_equal_0_y_0 = const()[name = string("greater_equal_0_y_0"), val = int32(0)];
+            tensor<bool, [1]> greater_equal_0 = greater_equal(x = input_ids, y = greater_equal_0_y_0)[name = string("greater_equal_0")];
+            int32 slice_by_index_0 = const()[name = string("slice_by_index_0"), val = int32(151936)];
+            tensor<int32, [1]> add_0 = add(x = input_ids, y = slice_by_index_0)[name = string("add_0")];
+            tensor<int32, [1]> select_0 = select(a = input_ids, b = add_0, cond = greater_equal_0)[name = string("select_0")];
+            int32 greater_equal_0_y_0_1 = const()[name = string("greater_equal_0_y_0_1"), val = int32(0)];
+            tensor<bool, [1]> greater_equal_0_1 = greater_equal(x = select_0, y = greater_equal_0_y_0_1)[name = string("greater_equal_0_1")];
+            int32 slice_by_index_0_1 = const()[name = string("slice_by_index_0_1"), val = int32(151936)];
+            tensor<int32, [1]> add_0_1 = add(x = select_0, y = slice_by_index_0_1)[name = string("add_0_1")];
+            tensor<int32, [1]> select_0_1 = select(a = select_0, b = add_0_1, cond = greater_equal_0_1)[name = string("select_0_1")];
+            int32 embeddings_cast_fp16_axis_0 = const()[name = string("embeddings_cast_fp16_axis_0"), val = int32(0)];
+            tensor<fp16, [1, 2048]> embeddings_cast_fp16 = gather(axis = embeddings_cast_fp16_axis_0, batch_dims = embeddings_batch_dims_0, indices = select_0_1, validate_indices = embeddings_validate_indices_0, x = text_embedding_weight_to_fp16_palettized)[name = string("embeddings_cast_fp16")];
+            tensor<int32, [1]> var_19_axes_0 = const()[name = string("op_19_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2048, 1]> var_19_cast_fp16 = expand_dims(axes = var_19_axes_0, x = embeddings_cast_fp16)[name = string("op_19_cast_fp16")];
+            tensor<int32, [1]> input_1_axes_0 = const()[name = string("input_1_axes_0"), val = tensor<int32, [1]>([3])];
+            tensor<fp16, [1, 2048, 1, 1]> input_1_cast_fp16 = expand_dims(axes = input_1_axes_0, x = var_19_cast_fp16)[name = string("input_1_cast_fp16")];
+            string input_3_pad_type_0 = const()[name = string("input_3_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> input_3_strides_0 = const()[name = string("input_3_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> input_3_pad_0 = const()[name = string("input_3_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> input_3_dilations_0 = const()[name = string("input_3_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 input_3_groups_0 = const()[name = string("input_3_groups_0"), val = int32(1)];
+            tensor<fp16, [2048, 2048, 1, 1]> text_projection_linear_fc1_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [2048, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(311165632))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(315360000))))[name = string("text_projection_linear_fc1_weight_to_fp16_palettized")];
+            tensor<fp16, [2048]> text_projection_linear_fc1_bias_to_fp16 = const()[name = string("text_projection_linear_fc1_bias_to_fp16"), val = tensor<fp16, [2048]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(315360576)))];
+            tensor<fp16, [1, 2048, 1, 1]> input_3_cast_fp16 = conv(bias = text_projection_linear_fc1_bias_to_fp16, dilations = input_3_dilations_0, groups = input_3_groups_0, pad = input_3_pad_0, pad_type = input_3_pad_type_0, strides = input_3_strides_0, weight = text_projection_linear_fc1_weight_to_fp16_palettized, x = input_1_cast_fp16)[name = string("input_3_cast_fp16")];
+            tensor<fp16, [1, 2048, 1, 1]> input_cast_fp16 = silu(x = input_3_cast_fp16)[name = string("input_cast_fp16")];
+            string var_42_pad_type_0 = const()[name = string("op_42_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_42_strides_0 = const()[name = string("op_42_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_42_pad_0 = const()[name = string("op_42_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_42_dilations_0 = const()[name = string("op_42_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_42_groups_0 = const()[name = string("op_42_groups_0"), val = int32(1)];
+            tensor<fp16, [2048, 2048, 1, 1]> text_projection_linear_fc2_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint8, [2048, 2048, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(315364736))), lut = tensor<fp16, [1, 1, 1, 1, 256, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(319559104))))[name = string("text_projection_linear_fc2_weight_to_fp16_palettized")];
+            tensor<fp16, [2048]> text_projection_linear_fc2_bias_to_fp16 = const()[name = string("text_projection_linear_fc2_bias_to_fp16"), val = tensor<fp16, [2048]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(319559680)))];
+            tensor<fp16, [1, 2048, 1, 1]> input_embeds = conv(bias = text_projection_linear_fc2_bias_to_fp16, dilations = var_42_dilations_0, groups = var_42_groups_0, pad = var_42_pad_0, pad_type = var_42_pad_type_0, strides = var_42_strides_0, weight = text_projection_linear_fc2_weight_to_fp16_palettized, x = input_cast_fp16)[name = string("op_42_cast_fp16")];
+        } -> (input_embeds);
+}
\ No newline at end of file
diff --git a/qwen3_tts/text_projector/12hz-1.7b-customvoice/W8A16/TextProjector.mlmodelc/weights/weight.bin b/qwen3_tts/text_projector/12hz-1.7b-customvoice/W8A16/TextProjector.mlmodelc/weights/weight.bin
new file mode 100644
index 0000000000000000000000000000000000000000..3e2683978bd4f185856ef574432154c6b132e917
--- /dev/null
+++ b/qwen3_tts/text_projector/12hz-1.7b-customvoice/W8A16/TextProjector.mlmodelc/weights/weight.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8dc53fb68c883e5d9809847760116ae2aaac8ff054f993df9419adfafd0ad4d2
+size 319563840